Upgraded some embedded dependencies to be ready for Python 3.10. This doesn't mean that it's fully supported right now.

pull/1632/head
morpheus65535 2 years ago
parent 2d214bfbd5
commit 402c82d84f

@ -20,8 +20,8 @@ def check_python_version():
print("Python " + minimum_py3_str + " or greater required. "
"Current version is " + platform.python_version() + ". Please upgrade Python.")
sys.exit(1)
elif int(python_version[0]) == 3 and int(python_version[1]) == 9:
print("Python 3.9.x is unsupported. Current version is " + platform.python_version() +
elif int(python_version[0]) == 3 and int(python_version[1]) > 8:
print("Python version greater than 3.8.x is unsupported. Current version is " + platform.python_version() +
". Keep in mind that even if it works, you're on your own.")
elif (int(python_version[0]) == minimum_py3_tuple[0] and int(python_version[1]) < minimum_py3_tuple[1]) or \
(int(python_version[0]) != minimum_py3_tuple[0]):

@ -4,12 +4,6 @@
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
__title__ = 'babelfish'
__version__ = '0.5.5-dev'
__author__ = 'Antoine Bertin'
__license__ = 'BSD'
__copyright__ = 'Copyright 2015 the BabelFish authors'
import sys
if sys.version_info[0] >= 3:

@ -2,17 +2,22 @@
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
import collections
from pkg_resources import iter_entry_points, EntryPoint
from ..exceptions import LanguageConvertError, LanguageReverseError
try:
# Python 3.3+
from collections.abc import Mapping, MutableMapping
except ImportError:
from collections import Mapping, MutableMapping
# from https://github.com/kennethreitz/requests/blob/master/requests/structures.py
class CaseInsensitiveDict(collections.MutableMapping):
class CaseInsensitiveDict(MutableMapping):
"""A case-insensitive ``dict``-like object.
Implements all methods and operations of
``collections.MutableMapping`` as well as dict's ``copy``. Also
``collections.abc.MutableMapping`` as well as dict's ``copy``. Also
provides ``lower_items``.
All keys are expected to be strings. The structure remembers the
@ -63,7 +68,7 @@ class CaseInsensitiveDict(collections.MutableMapping):
)
def __eq__(self, other):
if isinstance(other, collections.Mapping):
if isinstance(other, Mapping):
other = CaseInsensitiveDict(other)
else:
return NotImplemented

@ -14,9 +14,9 @@ class OpenSubtitlesConverter(LanguageReverseConverter):
def __init__(self):
self.alpha3b_converter = language_converters['alpha3b']
self.alpha2_converter = language_converters['alpha2']
self.to_opensubtitles = {('por', 'BR'): 'pob', ('gre', None): 'ell', ('srp', None): 'scc', ('srp', 'ME'): 'mne'}
self.to_opensubtitles = {('por', 'BR'): 'pob', ('gre', None): 'ell', ('srp', None): 'scc', ('srp', 'ME'): 'mne', ('chi', 'TW'): 'zht'}
self.from_opensubtitles = CaseInsensitiveDict({'pob': ('por', 'BR'), 'pb': ('por', 'BR'), 'ell': ('ell', None),
'scc': ('srp', None), 'mne': ('srp', 'ME')})
'scc': ('srp', None), 'mne': ('srp', 'ME'), 'zht': ('zho', 'TW')})
self.codes = (self.alpha2_converter.codes | self.alpha3b_converter.codes | set(self.from_opensubtitles.keys()))
def convert(self, alpha3, country=None, script=None):

@ -4,6 +4,7 @@
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from collections import namedtuple
from functools import partial
from pkg_resources import resource_stream # @UnresolvedImport

@ -1,45 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
import os.path
import tempfile
import zipfile
import requests
DATA_DIR = os.path.dirname(__file__)
# iso-3166-1.txt
print('Downloading ISO-3166-1 standard (ISO country codes)...')
with open(os.path.join(DATA_DIR, 'iso-3166-1.txt'), 'w') as f:
r = requests.get('http://www.iso.org/iso/home/standards/country_codes/country_names_and_code_elements_txt.htm')
f.write(r.content.strip())
# iso-639-3.tab
print('Downloading ISO-639-3 standard (ISO language codes)...')
with tempfile.TemporaryFile() as f:
r = requests.get('http://www-01.sil.org/iso639-3/iso-639-3_Code_Tables_20130531.zip')
f.write(r.content)
with zipfile.ZipFile(f) as z:
z.extract('iso-639-3.tab', DATA_DIR)
# iso-15924
print('Downloading ISO-15924 standard (ISO script codes)...')
with tempfile.TemporaryFile() as f:
r = requests.get('http://www.unicode.org/iso15924/iso15924.txt.zip')
f.write(r.content)
with zipfile.ZipFile(f) as z:
z.extract('iso15924-utf8-20131012.txt', DATA_DIR)
# opensubtitles supported languages
print('Downloading OpenSubtitles supported languages...')
with open(os.path.join(DATA_DIR, 'opensubtitles_languages.txt'), 'w') as f:
r = requests.get('http://www.opensubtitles.org/addons/export_languages.php')
f.write(r.content)
print('Done!')

@ -4,6 +4,7 @@
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from collections import namedtuple
from functools import partial
from pkg_resources import resource_stream # @UnresolvedImport

@ -4,6 +4,7 @@
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from collections import namedtuple
from pkg_resources import resource_stream # @UnresolvedImport
from . import basestr

@ -1,377 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
import re
import sys
import pickle
from unittest import TestCase, TestSuite, TestLoader, TextTestRunner
from pkg_resources import resource_stream # @UnresolvedImport
from babelfish import (LANGUAGES, Language, Country, Script, language_converters, country_converters,
LanguageReverseConverter, LanguageConvertError, LanguageReverseError, CountryReverseError)
if sys.version_info[:2] <= (2, 6):
_MAX_LENGTH = 80
def safe_repr(obj, short=False):
try:
result = repr(obj)
except Exception:
result = object.__repr__(obj)
if not short or len(result) < _MAX_LENGTH:
return result
return result[:_MAX_LENGTH] + ' [truncated]...'
class _AssertRaisesContext(object):
"""A context manager used to implement TestCase.assertRaises* methods."""
def __init__(self, expected, test_case, expected_regexp=None):
self.expected = expected
self.failureException = test_case.failureException
self.expected_regexp = expected_regexp
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, tb):
if exc_type is None:
try:
exc_name = self.expected.__name__
except AttributeError:
exc_name = str(self.expected)
raise self.failureException(
"{0} not raised".format(exc_name))
if not issubclass(exc_type, self.expected):
# let unexpected exceptions pass through
return False
self.exception = exc_value # store for later retrieval
if self.expected_regexp is None:
return True
expected_regexp = self.expected_regexp
if isinstance(expected_regexp, basestring):
expected_regexp = re.compile(expected_regexp)
if not expected_regexp.search(str(exc_value)):
raise self.failureException('"%s" does not match "%s"' %
(expected_regexp.pattern, str(exc_value)))
return True
class _Py26FixTestCase(object):
def assertIsNone(self, obj, msg=None):
"""Same as self.assertTrue(obj is None), with a nicer default message."""
if obj is not None:
standardMsg = '%s is not None' % (safe_repr(obj),)
self.fail(self._formatMessage(msg, standardMsg))
def assertIsNotNone(self, obj, msg=None):
"""Included for symmetry with assertIsNone."""
if obj is None:
standardMsg = 'unexpectedly None'
self.fail(self._formatMessage(msg, standardMsg))
def assertIn(self, member, container, msg=None):
"""Just like self.assertTrue(a in b), but with a nicer default message."""
if member not in container:
standardMsg = '%s not found in %s' % (safe_repr(member),
safe_repr(container))
self.fail(self._formatMessage(msg, standardMsg))
def assertNotIn(self, member, container, msg=None):
"""Just like self.assertTrue(a not in b), but with a nicer default message."""
if member in container:
standardMsg = '%s unexpectedly found in %s' % (safe_repr(member),
safe_repr(container))
self.fail(self._formatMessage(msg, standardMsg))
def assertIs(self, expr1, expr2, msg=None):
"""Just like self.assertTrue(a is b), but with a nicer default message."""
if expr1 is not expr2:
standardMsg = '%s is not %s' % (safe_repr(expr1),
safe_repr(expr2))
self.fail(self._formatMessage(msg, standardMsg))
def assertIsNot(self, expr1, expr2, msg=None):
"""Just like self.assertTrue(a is not b), but with a nicer default message."""
if expr1 is expr2:
standardMsg = 'unexpectedly identical: %s' % (safe_repr(expr1),)
self.fail(self._formatMessage(msg, standardMsg))
else:
class _Py26FixTestCase(object):
pass
class TestScript(TestCase, _Py26FixTestCase):
def test_wrong_script(self):
self.assertRaises(ValueError, lambda: Script('Azer'))
def test_eq(self):
self.assertEqual(Script('Latn'), Script('Latn'))
def test_ne(self):
self.assertNotEqual(Script('Cyrl'), Script('Latn'))
def test_hash(self):
self.assertEqual(hash(Script('Hira')), hash('Hira'))
def test_pickle(self):
self.assertEqual(pickle.loads(pickle.dumps(Script('Latn'))), Script('Latn'))
class TestCountry(TestCase, _Py26FixTestCase):
def test_wrong_country(self):
self.assertRaises(ValueError, lambda: Country('ZZ'))
def test_eq(self):
self.assertEqual(Country('US'), Country('US'))
def test_ne(self):
self.assertNotEqual(Country('GB'), Country('US'))
self.assertIsNotNone(Country('US'))
def test_hash(self):
self.assertEqual(hash(Country('US')), hash('US'))
def test_pickle(self):
for country in [Country('GB'), Country('US')]:
self.assertEqual(pickle.loads(pickle.dumps(country)), country)
def test_converter_name(self):
self.assertEqual(Country('US').name, 'UNITED STATES')
self.assertEqual(Country.fromname('UNITED STATES'), Country('US'))
self.assertEqual(Country.fromcode('UNITED STATES', 'name'), Country('US'))
self.assertRaises(CountryReverseError, lambda: Country.fromname('ZZZZZ'))
self.assertEqual(len(country_converters['name'].codes), 249)
class TestLanguage(TestCase, _Py26FixTestCase):
def test_languages(self):
self.assertEqual(len(LANGUAGES), 7874)
def test_wrong_language(self):
self.assertRaises(ValueError, lambda: Language('zzz'))
def test_unknown_language(self):
self.assertEqual(Language('zzzz', unknown='und'), Language('und'))
def test_converter_alpha2(self):
self.assertEqual(Language('eng').alpha2, 'en')
self.assertEqual(Language.fromalpha2('en'), Language('eng'))
self.assertEqual(Language.fromcode('en', 'alpha2'), Language('eng'))
self.assertRaises(LanguageReverseError, lambda: Language.fromalpha2('zz'))
self.assertRaises(LanguageConvertError, lambda: Language('aaa').alpha2)
self.assertEqual(len(language_converters['alpha2'].codes), 184)
def test_converter_alpha3b(self):
self.assertEqual(Language('fra').alpha3b, 'fre')
self.assertEqual(Language.fromalpha3b('fre'), Language('fra'))
self.assertEqual(Language.fromcode('fre', 'alpha3b'), Language('fra'))
self.assertRaises(LanguageReverseError, lambda: Language.fromalpha3b('zzz'))
self.assertRaises(LanguageConvertError, lambda: Language('aaa').alpha3b)
self.assertEqual(len(language_converters['alpha3b'].codes), 418)
def test_converter_alpha3t(self):
self.assertEqual(Language('fra').alpha3t, 'fra')
self.assertEqual(Language.fromalpha3t('fra'), Language('fra'))
self.assertEqual(Language.fromcode('fra', 'alpha3t'), Language('fra'))
self.assertRaises(LanguageReverseError, lambda: Language.fromalpha3t('zzz'))
self.assertRaises(LanguageConvertError, lambda: Language('aaa').alpha3t)
self.assertEqual(len(language_converters['alpha3t'].codes), 418)
def test_converter_name(self):
self.assertEqual(Language('eng').name, 'English')
self.assertEqual(Language.fromname('English'), Language('eng'))
self.assertEqual(Language.fromcode('English', 'name'), Language('eng'))
self.assertRaises(LanguageReverseError, lambda: Language.fromname('Zzzzzzzzz'))
self.assertEqual(len(language_converters['name'].codes), 7874)
def test_converter_scope(self):
self.assertEqual(language_converters['scope'].codes, set(['I', 'S', 'M']))
self.assertEqual(Language('eng').scope, 'individual')
self.assertEqual(Language('und').scope, 'special')
def test_converter_type(self):
self.assertEqual(language_converters['type'].codes, set(['A', 'C', 'E', 'H', 'L', 'S']))
self.assertEqual(Language('eng').type, 'living')
self.assertEqual(Language('und').type, 'special')
def test_converter_opensubtitles(self):
self.assertEqual(Language('fra').opensubtitles, Language('fra').alpha3b)
self.assertEqual(Language('por', 'BR').opensubtitles, 'pob')
self.assertEqual(Language.fromopensubtitles('fre'), Language('fra'))
self.assertEqual(Language.fromopensubtitles('pob'), Language('por', 'BR'))
self.assertEqual(Language.fromopensubtitles('pb'), Language('por', 'BR'))
# Montenegrin is not recognized as an ISO language (yet?) but for now it is
# unofficially accepted as Serbian from Montenegro
self.assertEqual(Language.fromopensubtitles('mne'), Language('srp', 'ME'))
self.assertEqual(Language.fromcode('pob', 'opensubtitles'), Language('por', 'BR'))
self.assertRaises(LanguageReverseError, lambda: Language.fromopensubtitles('zzz'))
self.assertRaises(LanguageConvertError, lambda: Language('aaa').opensubtitles)
self.assertEqual(len(language_converters['opensubtitles'].codes), 607)
# test with all the LANGUAGES from the opensubtitles api
# downloaded from: http://www.opensubtitles.org/addons/export_languages.php
f = resource_stream('babelfish', 'data/opensubtitles_languages.txt')
f.readline()
for l in f:
idlang, alpha2, _, upload_enabled, web_enabled = l.decode('utf-8').strip().split('\t')
if not int(upload_enabled) and not int(web_enabled):
# do not test LANGUAGES that are too esoteric / not widely available
continue
self.assertEqual(Language.fromopensubtitles(idlang).opensubtitles, idlang)
if alpha2:
self.assertEqual(Language.fromopensubtitles(idlang), Language.fromopensubtitles(alpha2))
f.close()
def test_converter_opensubtitles_codes(self):
for code in language_converters['opensubtitles'].from_opensubtitles.keys():
self.assertIn(code, language_converters['opensubtitles'].codes)
def test_fromietf_country_script(self):
language = Language.fromietf('fra-FR-Latn')
self.assertEqual(language.alpha3, 'fra')
self.assertEqual(language.country, Country('FR'))
self.assertEqual(language.script, Script('Latn'))
def test_fromietf_country_no_script(self):
language = Language.fromietf('fra-FR')
self.assertEqual(language.alpha3, 'fra')
self.assertEqual(language.country, Country('FR'))
self.assertIsNone(language.script)
def test_fromietf_no_country_no_script(self):
language = Language.fromietf('fra-FR')
self.assertEqual(language.alpha3, 'fra')
self.assertEqual(language.country, Country('FR'))
self.assertIsNone(language.script)
def test_fromietf_no_country_script(self):
language = Language.fromietf('fra-Latn')
self.assertEqual(language.alpha3, 'fra')
self.assertIsNone(language.country)
self.assertEqual(language.script, Script('Latn'))
def test_fromietf_alpha2_language(self):
language = Language.fromietf('fr-Latn')
self.assertEqual(language.alpha3, 'fra')
self.assertIsNone(language.country)
self.assertEqual(language.script, Script('Latn'))
def test_fromietf_wrong_language(self):
self.assertRaises(ValueError, lambda: Language.fromietf('xyz-FR'))
def test_fromietf_wrong_country(self):
self.assertRaises(ValueError, lambda: Language.fromietf('fra-YZ'))
def test_fromietf_wrong_script(self):
self.assertRaises(ValueError, lambda: Language.fromietf('fra-FR-Wxyz'))
def test_eq(self):
self.assertEqual(Language('eng'), Language('eng'))
def test_ne(self):
self.assertNotEqual(Language('fra'), Language('eng'))
self.assertIsNotNone(Language('fra'))
def test_nonzero(self):
self.assertFalse(bool(Language('und')))
self.assertTrue(bool(Language('eng')))
def test_language_hasattr(self):
self.assertTrue(hasattr(Language('fra'), 'alpha3'))
self.assertTrue(hasattr(Language('fra'), 'alpha2'))
self.assertFalse(hasattr(Language('bej'), 'alpha2'))
def test_country_hasattr(self):
self.assertTrue(hasattr(Country('US'), 'name'))
self.assertTrue(hasattr(Country('FR'), 'alpha2'))
self.assertFalse(hasattr(Country('BE'), 'none'))
def test_country(self):
self.assertEqual(Language('por', 'BR').country, Country('BR'))
self.assertEqual(Language('eng', Country('US')).country, Country('US'))
def test_eq_with_country(self):
self.assertEqual(Language('eng', 'US'), Language('eng', Country('US')))
def test_ne_with_country(self):
self.assertNotEqual(Language('eng', 'US'), Language('eng', Country('GB')))
def test_script(self):
self.assertEqual(Language('srp', script='Latn').script, Script('Latn'))
self.assertEqual(Language('srp', script=Script('Cyrl')).script, Script('Cyrl'))
def test_eq_with_script(self):
self.assertEqual(Language('srp', script='Latn'), Language('srp', script=Script('Latn')))
def test_ne_with_script(self):
self.assertNotEqual(Language('srp', script='Latn'), Language('srp', script=Script('Cyrl')))
def test_eq_with_country_and_script(self):
self.assertEqual(Language('srp', 'SR', 'Latn'), Language('srp', Country('SR'), Script('Latn')))
def test_ne_with_country_and_script(self):
self.assertNotEqual(Language('srp', 'SR', 'Latn'), Language('srp', Country('SR'), Script('Cyrl')))
def test_hash(self):
self.assertEqual(hash(Language('fra')), hash('fr'))
self.assertEqual(hash(Language('ace')), hash('ace'))
self.assertEqual(hash(Language('por', 'BR')), hash('pt-BR'))
self.assertEqual(hash(Language('srp', script='Cyrl')), hash('sr-Cyrl'))
self.assertEqual(hash(Language('eng', 'US', 'Latn')), hash('en-US-Latn'))
def test_pickle(self):
for lang in [Language('fra'),
Language('eng', 'US'),
Language('srp', script='Latn'),
Language('eng', 'US', 'Latn')]:
self.assertEqual(pickle.loads(pickle.dumps(lang)), lang)
def test_str(self):
self.assertEqual(Language.fromietf(str(Language('eng', 'US', 'Latn'))), Language('eng', 'US', 'Latn'))
self.assertEqual(Language.fromietf(str(Language('fra', 'FR'))), Language('fra', 'FR'))
self.assertEqual(Language.fromietf(str(Language('bel'))), Language('bel'))
def test_register_converter(self):
class TestConverter(LanguageReverseConverter):
def __init__(self):
self.to_test = {'fra': 'test1', 'eng': 'test2'}
self.from_test = {'test1': 'fra', 'test2': 'eng'}
def convert(self, alpha3, country=None, script=None):
if alpha3 not in self.to_test:
raise LanguageConvertError(alpha3, country, script)
return self.to_test[alpha3]
def reverse(self, test):
if test not in self.from_test:
raise LanguageReverseError(test)
return (self.from_test[test], None)
language = Language('fra')
self.assertFalse(hasattr(language, 'test'))
language_converters['test'] = TestConverter()
self.assertTrue(hasattr(language, 'test'))
self.assertIn('test', language_converters)
self.assertEqual(Language('fra').test, 'test1')
self.assertEqual(Language.fromtest('test2').alpha3, 'eng')
del language_converters['test']
self.assertNotIn('test', language_converters)
self.assertRaises(KeyError, lambda: Language.fromtest('test1'))
self.assertRaises(AttributeError, lambda: Language('fra').test)
def suite():
suite = TestSuite()
suite.addTest(TestLoader().loadTestsFromTestCase(TestScript))
suite.addTest(TestLoader().loadTestsFromTestCase(TestCountry))
suite.addTest(TestLoader().loadTestsFromTestCase(TestLanguage))
return suite
if __name__ == '__main__':
TextTestRunner().run(suite())

@ -11,10 +11,12 @@ from werkzeug.wrappers import Response as ResponseBase
from flask_restful.utils import http_status_message, unpack, OrderedDict
from flask_restful.representations.json import output_json
import sys
from flask.helpers import _endpoint_from_view_func
from types import MethodType
import operator
from collections import Mapping
try:
from collections.abc import Mapping
except ImportError:
from collections import Mapping
__all__ = ('Api', 'Resource', 'marshal', 'marshal_with', 'marshal_with_field', 'abort')
@ -58,7 +60,7 @@ class Api(object):
to handle 404 errors throughout your app
:param serve_challenge_on_401: Whether to serve a challenge response to
clients on receiving 401. This usually leads to a username/password
popup in web browers.
popup in web browsers.
:param url_part_order: A string that controls the order that the pieces
of the url are concatenated when the full url is constructed. 'b'
is the blueprint (or blueprint registration) prefix, 'a' is the api
@ -153,7 +155,7 @@ class Api(object):
rule = blueprint_setup.url_prefix + rule
options.setdefault('subdomain', blueprint_setup.subdomain)
if endpoint is None:
endpoint = _endpoint_from_view_func(view_func)
endpoint = view_func.__name__
defaults = blueprint_setup.url_defaults
if 'defaults' in options:
defaults = dict(defaults, **options.pop('defaults'))
@ -287,6 +289,13 @@ class Api(object):
headers = Headers()
if isinstance(e, HTTPException):
if e.response is not None:
# If HTTPException is initialized with a response, then return e.get_response().
# This prevents specified error response from being overridden.
# eg. HTTPException(response=Response("Hello World"))
resp = e.get_response()
return resp
code = e.code
default_data = {
'message': getattr(e, 'description', http_status_message(code))

@ -1,3 +1,3 @@
#!/usr/bin/env python
__version__ = '0.3.7'
__version__ = '0.3.9'

@ -1,6 +1,4 @@
from datetime import datetime
from calendar import timegm
import pytz
from decimal import Decimal as MyDecimal, ROUND_HALF_EVEN
from email.utils import formatdate
import six
@ -9,8 +7,7 @@ try:
except ImportError:
# python3
from urllib.parse import urlparse, urlunparse
from flask_restful import inputs, marshal
from flask_restful import marshal
from flask import url_for, request
__all__ = ["String", "FormattedString", "Url", "DateTime", "Float",

@ -269,7 +269,7 @@ def datetime_from_rfc822(datetime_str):
def datetime_from_iso8601(datetime_str):
"""Turns an ISO8601 formatted date into a datetime object.
"""Turns an ISO8601 formatted datetime into a datetime object.
Example::

@ -1,6 +1,9 @@
from copy import deepcopy
import collections
try:
from collections.abc import MutableSequence
except ImportError:
from collections import MutableSequence
from flask import current_app, request
from werkzeug.datastructures import MultiDict, FileStorage
from werkzeug import exceptions
@ -146,7 +149,7 @@ class Argument(object):
except TypeError:
try:
if self.type is decimal.Decimal:
return self.type(str(value), self.name)
return self.type(str(value))
else:
return self.type(value, self.name)
except TypeError:
@ -194,7 +197,7 @@ class Argument(object):
values = source.getlist(name)
else:
values = source.get(name)
if not (isinstance(values, collections.MutableSequence) and self.action == 'append'):
if not (isinstance(values, MutableSequence) and self.action == 'append'):
values = [values]
for value in values:

@ -1,9 +1,9 @@
import sys
try:
from collections import OrderedDict
from collections.abc import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from collections import OrderedDict
from werkzeug.http import HTTP_STATUS_CODES

@ -68,7 +68,7 @@ See: http://python-future.org
Credits
-------
:Author: Ed Schofield
:Author: Ed Schofield, Jordan M. Adler, et al
:Sponsor: Python Charmers Pty Ltd, Australia, and Python Charmers Pte
Ltd, Singapore. http://pythoncharmers.com
:Others: See docs/credits.rst or http://python-future.org/credits.html
@ -76,7 +76,7 @@ Credits
Licensing
---------
Copyright 2013-2018 Python Charmers Pty Ltd, Australia.
Copyright 2013-2019 Python Charmers Pty Ltd, Australia.
The software is distributed under an MIT licence. See LICENSE.txt.
"""
@ -84,10 +84,10 @@ The software is distributed under an MIT licence. See LICENSE.txt.
__title__ = 'future'
__author__ = 'Ed Schofield'
__license__ = 'MIT'
__copyright__ = 'Copyright 2013-2018 Python Charmers Pty Ltd'
__copyright__ = 'Copyright 2013-2019 Python Charmers Pty Ltd'
__ver_major__ = 0
__ver_minor__ = 17
__ver_patch__ = 0
__ver_minor__ = 18
__ver_patch__ = 2
__ver_sub__ = ''
__version__ = "%d.%d.%d%s" % (__ver_major__, __ver_minor__,
__ver_patch__, __ver_sub__)

@ -10,7 +10,7 @@ __future_module__ = True
from future.standard_library import import_top_level_modules
if sys.version_info[0] == 3:
if sys.version_info[0] >= 3:
import_top_level_modules()

@ -800,7 +800,7 @@ class Message(object):
# There was no Content-Type header, and we don't know what type
# to set it to, so raise an exception.
raise errors.HeaderParseError('No Content-Type header found')
newparams = []
newparams = list()
foundp = False
for pk, pv in params:
if pk.lower() == 'boundary':
@ -814,10 +814,10 @@ class Message(object):
# instead???
newparams.append(('boundary', '"%s"' % boundary))
# Replace the existing Content-Type header with the new value
newheaders = []
newheaders = list()
for h, v in self._headers:
if h.lower() == 'content-type':
parts = []
parts = list()
for k, v in newparams:
if v == '':
parts.append(k)

@ -79,11 +79,15 @@ from future.backports.misc import create_connection as socket_create_connection
import io
import os
import socket
import collections
from future.backports.urllib.parse import urlsplit
import warnings
from array import array
if PY2:
from collections import Iterable
else:
from collections.abc import Iterable
__all__ = ["HTTPResponse", "HTTPConnection",
"HTTPException", "NotConnected", "UnknownProtocol",
"UnknownTransferEncoding", "UnimplementedFileMode",
@ -696,9 +700,19 @@ class HTTPResponse(io.RawIOBase):
while total_bytes < len(b):
if MAXAMOUNT < len(mvb):
temp_mvb = mvb[0:MAXAMOUNT]
n = self.fp.readinto(temp_mvb)
if PY2:
data = self.fp.read(len(temp_mvb))
n = len(data)
temp_mvb[:n] = data
else:
n = self.fp.readinto(temp_mvb)
else:
n = self.fp.readinto(mvb)
if PY2:
data = self.fp.read(len(mvb))
n = len(data)
mvb[:n] = data
else:
n = self.fp.readinto(mvb)
if not n:
raise IncompleteRead(bytes(mvb[0:total_bytes]), len(b))
mvb = mvb[n:]
@ -892,7 +906,7 @@ class HTTPConnection(object):
try:
self.sock.sendall(data)
except TypeError:
if isinstance(data, collections.Iterable):
if isinstance(data, Iterable):
for d in data:
self.sock.sendall(d)
else:

@ -33,7 +33,7 @@ from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from future.builtins import filter, int, map, open, str
from future.utils import as_native_str
from future.utils import as_native_str, PY2
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
@ -41,7 +41,8 @@ __all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
import copy
import datetime
import re
re.ASCII = 0
if PY2:
re.ASCII = 0
import time
from future.backports.urllib.parse import urlparse, urlsplit, quote
from future.backports.http.client import HTTP_PORT

@ -138,7 +138,8 @@ from future.utils import PY2, as_native_str
# Import our required modules
#
import re
re.ASCII = 0 # for py2 compatibility
if PY2:
re.ASCII = 0 # for py2 compatibility
import string
__all__ = ["CookieError", "BaseCookie", "SimpleCookie"]

@ -16,7 +16,6 @@ from __future__ import absolute_import
import subprocess
from math import ceil as oldceil
from collections import Mapping, MutableMapping
from operator import itemgetter as _itemgetter, eq as _eq
import sys
@ -25,7 +24,12 @@ from _weakref import proxy as _proxy
from itertools import repeat as _repeat, chain as _chain, starmap as _starmap
from socket import getaddrinfo, SOCK_STREAM, error, socket
from future.utils import iteritems, itervalues, PY26, PY3
from future.utils import iteritems, itervalues, PY2, PY26, PY3
if PY2:
from collections import Mapping, MutableMapping
else:
from collections.abc import Mapping, MutableMapping
def ceil(x):

@ -109,11 +109,17 @@ import re
import socket
import sys
import time
import collections
import tempfile
import contextlib
import warnings
from future.utils import PY2
if PY2:
from collections import Iterable
else:
from collections.abc import Iterable
# check for SSL
try:
import ssl
@ -1221,7 +1227,7 @@ class AbstractHTTPHandler(BaseHandler):
mv = memoryview(data)
size = len(mv) * mv.itemsize
except TypeError:
if isinstance(data, collections.Iterable):
if isinstance(data, Iterable):
raise ValueError("Content-Length should be specified "
"for iterable data of type %r %r" % (type(data),
data))

@ -11,7 +11,7 @@ from future.builtins.iterators import (filter, map, zip)
# The isinstance import is no longer needed. We provide it only for
# backward-compatibility with future v0.8.2. It will be removed in future v1.0.
from future.builtins.misc import (ascii, chr, hex, input, isinstance, next,
oct, open, pow, round, super)
oct, open, pow, round, super, max, min)
from future.utils import PY3
if PY3:
@ -43,7 +43,7 @@ if not utils.PY3:
__all__ = ['filter', 'map', 'zip',
'ascii', 'chr', 'hex', 'input', 'next', 'oct', 'open', 'pow',
'round', 'super',
'bytes', 'dict', 'int', 'list', 'object', 'range', 'str',
'bytes', 'dict', 'int', 'list', 'object', 'range', 'str', 'max', 'min'
]
else:

@ -13,6 +13,8 @@ The builtin functions are:
- ``open`` (equivalent to io.open on Py2)
- ``super`` (backport of Py3's magic zero-argument super() function
- ``round`` (new "Banker's Rounding" behaviour from Py3)
- ``max`` (new default option from Py3.4)
- ``min`` (new default option from Py3.4)
``isinstance`` is also currently exported for backwards compatibility
with v0.8.2, although this has been deprecated since v0.9.
@ -59,6 +61,8 @@ if utils.PY2:
from future.builtins.newnext import newnext as next
from future.builtins.newround import newround as round
from future.builtins.newsuper import newsuper as super
from future.builtins.new_min_max import newmax as max
from future.builtins.new_min_max import newmin as min
from future.types.newint import newint
_SENTINEL = object()
@ -89,11 +93,12 @@ if utils.PY2:
else:
return _builtin_pow(x+0j, y, z)
# ``future`` doesn't support Py3.0/3.1. If we ever did, we'd add this:
# callable = __builtin__.callable
__all__ = ['ascii', 'chr', 'hex', 'input', 'isinstance', 'next', 'oct',
'open', 'pow', 'round', 'super']
'open', 'pow', 'round', 'super', 'max', 'min']
else:
import builtins
@ -109,8 +114,14 @@ else:
pow = builtins.pow
round = builtins.round
super = builtins.super
__all__ = []
if utils.PY34_PLUS:
max = builtins.max
min = builtins.min
__all__ = []
else:
from future.builtins.new_min_max import newmax as max
from future.builtins.new_min_max import newmin as min
__all__ = ['min', 'max']
# The callable() function was removed from Py3.0 and 3.1 and
# reintroduced into Py3.2+. ``future`` doesn't support Py3.0/3.1. If we ever

@ -0,0 +1,59 @@
import itertools
from future import utils
if utils.PY2:
from __builtin__ import max as _builtin_max, min as _builtin_min
else:
from builtins import max as _builtin_max, min as _builtin_min
_SENTINEL = object()
def newmin(*args, **kwargs):
return new_min_max(_builtin_min, *args, **kwargs)
def newmax(*args, **kwargs):
return new_min_max(_builtin_max, *args, **kwargs)
def new_min_max(_builtin_func, *args, **kwargs):
"""
To support the argument "default" introduced in python 3.4 for min and max
:param _builtin_func: builtin min or builtin max
:param args:
:param kwargs:
:return: returns the min or max based on the arguments passed
"""
for key, _ in kwargs.items():
if key not in set(['key', 'default']):
raise TypeError('Illegal argument %s', key)
if len(args) == 0:
raise TypeError
if len(args) != 1 and kwargs.get('default', _SENTINEL) is not _SENTINEL:
raise TypeError
if len(args) == 1:
iterator = iter(args[0])
try:
first = next(iterator)
except StopIteration:
if kwargs.get('default', _SENTINEL) is not _SENTINEL:
return kwargs.get('default')
else:
raise ValueError('{}() arg is an empty sequence'.format(_builtin_func.__name__))
else:
iterator = itertools.chain([first], iterator)
if kwargs.get('key') is not None:
return _builtin_func(iterator, key=kwargs.get('key'))
else:
return _builtin_func(iterator)
if len(args) > 1:
if kwargs.get('key') is not None:
return _builtin_func(args, key=kwargs.get('key'))
else:
return _builtin_func(args)

@ -38,11 +38,14 @@ def newround(number, ndigits=None):
if 'numpy' in repr(type(number)):
number = float(number)
if not PY26:
d = Decimal.from_float(number).quantize(exponent,
rounding=ROUND_HALF_EVEN)
if isinstance(number, Decimal):
d = number
else:
d = from_float_26(number).quantize(exponent, rounding=ROUND_HALF_EVEN)
if not PY26:
d = Decimal.from_float(number).quantize(exponent,
rounding=ROUND_HALF_EVEN)
else:
d = from_float_26(number).quantize(exponent, rounding=ROUND_HALF_EVEN)
if return_int:
return int(d)

@ -4,5 +4,5 @@ import sys
__future_module__ = True
from future.standard_library import import_top_level_modules
if sys.version_info[0] == 3:
if sys.version_info[0] >= 3:
import_top_level_modules()

@ -2,7 +2,11 @@ from __future__ import absolute_import
from future.utils import PY3
if PY3:
from copyreg import *
import copyreg, sys
# A "*" import uses Python 3's copyreg.__all__ which does not include
# all public names in the API surface for copyreg, this avoids that
# problem by just making our module _be_ a reference to the actual module.
sys.modules['future.moves.copyreg'] = copyreg
else:
__future_module__ = True
from copy_reg import *

@ -11,19 +11,8 @@ if PY3:
proxy_bypass,
quote,
request_host,
splitattr,
splithost,
splitpasswd,
splitport,
splitquery,
splittag,
splittype,
splituser,
splitvalue,
thishost,
to_bytes,
unquote,
unwrap,
url2pathname,
urlcleanup,
urljoin,
@ -32,6 +21,18 @@ if PY3:
urlretrieve,
urlsplit,
urlunparse)
from urllib.parse import (splitattr,
splithost,
splitpasswd,
splitport,
splitquery,
splittag,
splittype,
splituser,
splitvalue,
to_bytes,
unwrap)
else:
__future_module__ = True
with suspend_hooks():

@ -272,7 +272,11 @@ class CodeHandler(unittest.TestCase):
else:
headers = ''
self.compare(output, headers + reformat_code(expected),
reformatted = reformat_code(expected)
if headers in reformatted:
headers = ''
self.compare(output, headers + reformatted,
ignore_imports=ignore_imports)
def unchanged(self, code, **kwargs):
@ -338,6 +342,10 @@ class CodeHandler(unittest.TestCase):
'----\n%s\n----' % f.read(),
)
ErrorClass = (FuturizeError if 'futurize' in script else PasteurizeError)
if not hasattr(e, 'output'):
# The attribute CalledProcessError.output doesn't exist on Py2.6
e.output = None
raise ErrorClass(msg, e.returncode, e.cmd, output=e.output)
return output

@ -5,15 +5,19 @@ Why do this? Without it, the Python 2 bytes object is a very, very
different beast to the Python 3 bytes object.
"""
from collections import Iterable
from numbers import Integral
import string
import copy
from future.utils import istext, isbytes, PY3, with_metaclass
from future.utils import istext, isbytes, PY2, PY3, with_metaclass
from future.types import no, issubset
from future.types.newobject import newobject
if PY2:
from collections import Iterable
else:
from collections.abc import Iterable
_builtin_bytes = bytes

@ -8,7 +8,6 @@ They are very similar. The most notable difference is:
from __future__ import division
import struct
import collections
from future.types.newbytes import newbytes
from future.types.newobject import newobject
@ -17,6 +16,9 @@ from future.utils import PY3, isint, istext, isbytes, with_metaclass, native
if PY3:
long = int
from collections.abc import Iterable
else:
from collections import Iterable
class BaseNewInt(type):
@ -356,7 +358,7 @@ class newint(with_metaclass(BaseNewInt, long)):
raise TypeError("cannot convert unicode objects to bytes")
# mybytes can also be passed as a sequence of integers on Py3.
# Test for this:
elif isinstance(mybytes, collections.Iterable):
elif isinstance(mybytes, Iterable):
mybytes = newbytes(mybytes)
b = mybytes if byteorder == 'big' else mybytes[::-1]
if len(b) == 0:

@ -1,14 +1,16 @@
"""
A pretty lame implementation of a memoryview object for Python 2.6.
"""
from collections import Iterable
from numbers import Integral
import string
from future.utils import istext, isbytes, PY3, with_metaclass
from future.utils import istext, isbytes, PY2, with_metaclass
from future.types import no, issubset
if PY2:
from collections import Iterable
else:
from collections.abc import Iterable
# class BaseNewBytes(type):
# def __instancecheck__(cls, instance):

@ -112,5 +112,6 @@ class newobject(object):
"""
return object(self)
__slots__ = []
__all__ = ['newobject']

@ -19,7 +19,12 @@ From Dan Crosta's README:
"""
from __future__ import absolute_import
from collections import Sequence, Iterator
from future.utils import PY2
if PY2:
from collections import Sequence, Iterator
else:
from collections.abc import Sequence, Iterator
from itertools import islice
from future.backports.misc import count # with step parameter on Py2.6

@ -40,7 +40,6 @@ representations of your objects portably across Py3 and Py2, use the
"""
from collections import Iterable
from numbers import Number
from future.utils import PY3, istext, with_metaclass, isnewbytes
@ -51,6 +50,9 @@ from future.types.newobject import newobject
if PY3:
# We'll probably never use newstr on Py3 anyway...
unicode = str
from collections.abc import Iterable
else:
from collections import Iterable
class BaseNewStr(type):
@ -105,6 +107,7 @@ class newstr(with_metaclass(BaseNewStr, unicode)):
"""
Without the u prefix
"""
value = super(newstr, self).__repr__()
# assert value[0] == u'u'
return value[1:]
@ -290,7 +293,14 @@ class newstr(with_metaclass(BaseNewStr, unicode)):
isinstance(other, bytes) and not isnewbytes(other)):
return super(newstr, self).__eq__(other)
else:
return False
return NotImplemented
def __hash__(self):
if (isinstance(self, unicode) or
isinstance(self, bytes) and not isnewbytes(self)):
return super(newstr, self).__hash__()
else:
raise NotImplementedError()
def __ne__(self, other):
if (isinstance(other, unicode) or

@ -18,8 +18,10 @@ This module exports useful functions for 2/3 compatible code:
* types:
* text_type: unicode in Python 2, str in Python 3
* binary_type: str in Python 2, bytes in Python 3
* string_types: basestring in Python 2, str in Python 3
* binary_type: str in Python 2, bytes in Python 3
* integer_types: (int, long) in Python 2, int in Python 3
* class_types: (type, types.ClassType) in Python 2, type in Python 3
* bchr(c):
Take an integer and make a 1-character byte string
@ -55,7 +57,8 @@ import copy
import inspect
PY3 = sys.version_info[0] == 3
PY3 = sys.version_info[0] >= 3
PY34_PLUS = sys.version_info[0:2] >= (3, 4)
PY35_PLUS = sys.version_info[0:2] >= (3, 5)
PY36_PLUS = sys.version_info[0:2] >= (3, 6)
PY2 = sys.version_info[0] == 2
@ -405,12 +408,34 @@ if PY3:
allows re-raising exceptions with the cls value and traceback on
Python 2 and 3.
"""
if value is not None and isinstance(tp, Exception):
raise TypeError("instance exception may not have a separate value")
if value is not None:
exc = tp(value)
else:
if isinstance(tp, BaseException):
# If the first object is an instance, the type of the exception
# is the class of the instance, the instance itself is the value,
# and the second object must be None.
if value is not None:
raise TypeError("instance exception may not have a separate value")
exc = tp
elif isinstance(tp, type) and not issubclass(tp, BaseException):
# If the first object is a class, it becomes the type of the
# exception.
raise TypeError("class must derive from BaseException, not %s" % tp.__name__)
else:
# The second object is used to determine the exception value: If it
# is an instance of the class, the instance becomes the exception
# value. If the second object is a tuple, it is used as the argument
# list for the class constructor; if it is None, an empty argument
# list is used, and any other object is treated as a single argument
# to the constructor. The instance so created by calling the
# constructor is used as the exception value.
if isinstance(value, tp):
exc = value
elif isinstance(value, tuple):
exc = tp(*value)
elif value is None:
exc = tp()
else:
exc = tp(value)
if exc.__traceback__ is not tb:
raise exc.with_traceback(tb)
raise exc
@ -443,12 +468,14 @@ else:
e.__suppress_context__ = False
if isinstance(cause, type) and issubclass(cause, Exception):
e.__cause__ = cause()
e.__cause__.__traceback__ = sys.exc_info()[2]
e.__suppress_context__ = True
elif cause is None:
e.__cause__ = None
e.__suppress_context__ = True
elif isinstance(cause, BaseException):
e.__cause__ = cause
object.__setattr__(e.__cause__, '__traceback__', sys.exc_info()[2])
e.__suppress_context__ = True
else:
raise TypeError("exception causes must derive from BaseException")
@ -552,15 +579,14 @@ def isbytes(obj):
def isnewbytes(obj):
"""
Equivalent to the result of ``isinstance(obj, newbytes)`` were
``__instancecheck__`` not overridden on the newbytes subclass. In
other words, it is REALLY a newbytes instance, not a Py2 native str
Equivalent to the result of ``type(obj) == type(newbytes)``
in other words, it is REALLY a newbytes instance, not a Py2 native str
object?
Note that this does not cover subclasses of newbytes, and it is not
equivalent to ininstance(obj, newbytes)
"""
# TODO: generalize this so that it works with subclasses of newbytes
# Import is here to avoid circular imports:
from future.types.newbytes import newbytes
return type(obj) == newbytes
return type(obj).__name__ == 'newbytes'
def isint(obj):
@ -726,16 +752,16 @@ else:
__all__ = ['PY2', 'PY26', 'PY3', 'PYPY',
'as_native_str', 'bind_method', 'bord', 'bstr',
'bytes_to_native_str', 'encode_filename', 'ensure_new_type',
'exec_', 'get_next', 'getexception', 'implements_iterator',
'is_new_style', 'isbytes', 'isidentifier', 'isint',
'isnewbytes', 'istext', 'iteritems', 'iterkeys', 'itervalues',
'lfilter', 'listitems', 'listvalues', 'lmap', 'lrange',
'lzip', 'native', 'native_bytes', 'native_str',
'as_native_str', 'binary_type', 'bind_method', 'bord', 'bstr',
'bytes_to_native_str', 'class_types', 'encode_filename',
'ensure_new_type', 'exec_', 'get_next', 'getexception',
'implements_iterator', 'integer_types', 'is_new_style', 'isbytes',
'isidentifier', 'isint', 'isnewbytes', 'istext', 'iteritems',
'iterkeys', 'itervalues', 'lfilter', 'listitems', 'listvalues',
'lmap', 'lrange', 'lzip', 'native', 'native_bytes', 'native_str',
'native_str_to_bytes', 'old_div',
'python_2_unicode_compatible', 'raise_',
'raise_with_traceback', 'reraise', 'text_to_native_str',
'tobytes', 'viewitems', 'viewkeys', 'viewvalues',
'with_metaclass'
]
'raise_with_traceback', 'reraise', 'string_types',
'text_to_native_str', 'text_type', 'tobytes', 'viewitems',
'viewkeys', 'viewvalues', 'with_metaclass'
]

@ -32,4 +32,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
# this has to be at the top level, see how setup.py parses this
#: Distribution version number.
__version__ = "1.0.1"
__version__ = "1.1"

@ -136,6 +136,7 @@ def normaliseCharList(charList):
i += j
return rv
# We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)
@ -254,7 +255,7 @@ class InfosetFilter(object):
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
warnings.warn("Coercing non-XML name", DataLossWarning)
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
@ -262,7 +263,7 @@ class InfosetFilter(object):
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
warnings.warn("Coercing non-XML name", DataLossWarning)
warnings.warn("Coercing non-XML name: %s" % name, DataLossWarning)
replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput

@ -1,10 +1,11 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type, binary_type
from six import text_type
from six.moves import http_client, urllib
import codecs
import re
from io import BytesIO, StringIO
import webencodings
@ -12,13 +13,6 @@ from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
from . import _utils
from io import StringIO
try:
from io import BytesIO
except ImportError:
BytesIO = StringIO
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
@ -40,13 +34,13 @@ if _utils.supports_lone_surrogates:
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF}
ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
@ -367,7 +361,7 @@ class HTMLUnicodeInputStream(object):
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
if char is not None:
if char is not EOF:
if self.chunkOffset == 0:
# unget is called quite rarely, so it's a good idea to do
# more work here if it saves a bit of work in the frequently
@ -449,7 +443,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
try:
stream.seek(stream.tell())
except: # pylint:disable=bare-except
except Exception:
stream = BufferedStream(stream)
return stream
@ -461,7 +455,7 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
if charEncoding[0] is not None:
return charEncoding
# If we've been overriden, we've been overriden
# If we've been overridden, we've been overridden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
@ -664,9 +658,7 @@ class EncodingBytes(bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
p = self.position
data = self[p:p + len(bytes)]
rv = data.startswith(bytes)
rv = self.startswith(bytes, self.position)
if rv:
self.position += len(bytes)
return rv
@ -674,15 +666,11 @@ class EncodingBytes(bytes):
def jumpTo(self, bytes):
"""Look for the next sequence of bytes matching a given sequence. If
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
# XXX: This is ugly, but I can't see a nicer way to fix this.
if self._position == -1:
self._position = 0
self._position += (newPosition + len(bytes) - 1)
return True
else:
try:
self._position = self.index(bytes, self.position) + len(bytes) - 1
except ValueError:
raise StopIteration
return True
class EncodingParser(object):
@ -694,6 +682,9 @@ class EncodingParser(object):
self.encoding = None
def getEncoding(self):
if b"<meta" not in self.data:
return None
methodDispatch = (
(b"<!--", self.handleComment),
(b"<meta", self.handleMeta),
@ -703,6 +694,10 @@ class EncodingParser(object):
(b"<", self.handlePossibleStartTag))
for _ in self.data:
keepParsing = True
try:
self.data.jumpTo(b"<")
except StopIteration:
break
for key, method in methodDispatch:
if self.data.matchBytes(key):
try:
@ -908,7 +903,7 @@ class ContentAttrParser(object):
def lookupEncoding(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if isinstance(encoding, binary_type):
if isinstance(encoding, bytes):
try:
encoding = encoding.decode("ascii")
except UnicodeDecodeError:

@ -2,7 +2,8 @@ from __future__ import absolute_import, division, unicode_literals
from six import unichr as chr
from collections import deque
from collections import deque, OrderedDict
from sys import version_info
from .constants import spaceCharacters
from .constants import entities
@ -17,6 +18,11 @@ from ._trie import Trie
entitiesTrie = Trie(entities)
if version_info >= (3, 7):
attributeMap = dict
else:
attributeMap = OrderedDict
class HTMLTokenizer(object):
""" This class takes care of tokenizing HTML.
@ -228,6 +234,14 @@ class HTMLTokenizer(object):
# Add token to the queue to be yielded
if (token["type"] in tagTokenTypes):
token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
data = attributeMap(raw)
if len(raw) > len(data):
# we had some duplicated attribute, fix so first wins
data.update(raw[::-1])
token["data"] = data
if token["type"] == tokenTypes["EndTag"]:
if token["data"]:
self.tokenQueue.append({"type": tokenTypes["ParseError"],

@ -1,14 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
from .py import Trie as PyTrie
from .py import Trie
Trie = PyTrie
# pylint:disable=wrong-import-position
try:
from .datrie import Trie as DATrie
except ImportError:
pass
else:
Trie = DATrie
# pylint:enable=wrong-import-position
__all__ = ["Trie"]

@ -1,6 +1,9 @@
from __future__ import absolute_import, division, unicode_literals
from collections import Mapping
try:
from collections.abc import Mapping
except ImportError: # Python 2.7
from collections import Mapping
class Trie(Mapping):

@ -1,44 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from datrie import Trie as DATrie
from six import text_type
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
def __init__(self, data):
chars = set()
for key in data.keys():
if not isinstance(key, text_type):
raise TypeError("All keys must be strings")
for char in key:
chars.add(char)
self._data = DATrie("".join(chars))
for key, value in data.items():
self._data[key] = value
def __contains__(self, key):
return key in self._data
def __len__(self):
return len(self._data)
def __iter__(self):
raise NotImplementedError()
def __getitem__(self, key):
return self._data[key]
def keys(self, prefix=None):
return self._data.keys(prefix)
def has_keys_with_prefix(self, prefix):
return self._data.has_keys_with_prefix(prefix)
def longest_prefix(self, prefix):
return self._data.longest_prefix(prefix)
def longest_prefix_item(self, prefix):
return self._data.longest_prefix_item(prefix)

@ -2,12 +2,20 @@ from __future__ import absolute_import, division, unicode_literals
from types import ModuleType
from six import text_type
try:
import xml.etree.cElementTree as default_etree
from collections.abc import Mapping
except ImportError:
from collections import Mapping
from six import text_type, PY3
if PY3:
import xml.etree.ElementTree as default_etree
else:
try:
import xml.etree.cElementTree as default_etree
except ImportError:
import xml.etree.ElementTree as default_etree
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
@ -27,7 +35,7 @@ try:
# We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"') # pylint:disable=eval-used
assert isinstance(_x, text_type)
except: # pylint:disable=bare-except
except Exception:
supports_lone_surrogates = False
else:
supports_lone_surrogates = True
@ -47,9 +55,6 @@ class MethodDispatcher(dict):
"""
def __init__(self, items=()):
# Using _dictEntries instead of directly assigning to self is about
# twice as fast. Please do careful performance testing before changing
# anything here.
_dictEntries = []
for name, value in items:
if isinstance(name, (list, tuple, frozenset, set)):
@ -64,6 +69,36 @@ class MethodDispatcher(dict):
def __getitem__(self, key):
return dict.get(self, key, self.default)
def __get__(self, instance, owner=None):
return BoundMethodDispatcher(instance, self)
class BoundMethodDispatcher(Mapping):
"""Wraps a MethodDispatcher, binding its return values to `instance`"""
def __init__(self, instance, dispatcher):
self.instance = instance
self.dispatcher = dispatcher
def __getitem__(self, key):
# see https://docs.python.org/3/reference/datamodel.html#object.__get__
# on a function, __get__ is used to bind a function to an instance as a bound method
return self.dispatcher[key].__get__(self.instance)
def get(self, key, default):
if key in self.dispatcher:
return self[key]
else:
return default
def __iter__(self):
return iter(self.dispatcher)
def __len__(self):
return len(self.dispatcher)
def __contains__(self, key):
return key in self.dispatcher
# Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds

@ -519,8 +519,8 @@ adjustForeignAttributes = {
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
}
unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
adjustForeignAttributes.items()])
unadjustForeignAttributes = {(ns, local): qname for qname, (prefix, local, ns) in
adjustForeignAttributes.items()}
spaceCharacters = frozenset([
"\t",
@ -544,8 +544,7 @@ asciiLetters = frozenset(string.ascii_letters)
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)
asciiUpper2Lower = dict([(ord(c), ord(c.lower()))
for c in string.ascii_uppercase])
asciiUpper2Lower = {ord(c): ord(c.lower()) for c in string.ascii_uppercase}
# Heading elements need to be ordered
headingElements = (
@ -2934,7 +2933,7 @@ tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]])
prefixes = dict([(v, k) for k, v in namespaces.items()])
prefixes = {v: k for k, v in namespaces.items()}
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"

@ -1,6 +1,15 @@
"""Deprecated from html5lib 1.1.
See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
is recommended as a replacement. Please let us know in the aforementioned issue
if Bleach is unsuitable for your needs.
"""
from __future__ import absolute_import, division, unicode_literals
import re
import warnings
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
@ -11,6 +20,14 @@ from ..constants import namespaces, prefixes
__all__ = ["Filter"]
_deprecation_msg = (
"html5lib's sanitizer is deprecated; see " +
"https://github.com/html5lib/html5lib-python/issues/443 and please let " +
"us know if Bleach is unsuitable for your needs"
)
warnings.warn(_deprecation_msg, DeprecationWarning)
allowed_elements = frozenset((
(namespaces['html'], 'a'),
(namespaces['html'], 'abbr'),
@ -750,6 +767,9 @@ class Filter(base.Filter):
"""
super(Filter, self).__init__(source)
warnings.warn(_deprecation_msg, DeprecationWarning)
self.allowed_elements = allowed_elements
self.allowed_attributes = allowed_attributes
self.allowed_css_properties = allowed_css_properties

@ -2,7 +2,6 @@ from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass, viewkeys
import types
from collections import OrderedDict
from . import _inputstream
from . import _tokenizer
@ -119,8 +118,8 @@ class HTMLParser(object):
self.tree = tree(namespaceHTMLElements)
self.errors = []
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()])
self.phases = {name: cls(self, self.tree) for name, cls in
getPhases(debug).items()}
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
@ -202,7 +201,7 @@ class HTMLParser(object):
DoctypeToken = tokenTypes["Doctype"]
ParseErrorToken = tokenTypes["ParseError"]
for token in self.normalizedTokens():
for token in self.tokenizer:
prev_token = None
new_token = token
while new_token is not None:
@ -260,10 +259,6 @@ class HTMLParser(object):
if reprocess:
assert self.phase not in phases
def normalizedTokens(self):
for token in self.tokenizer:
yield self.normalizeToken(token)
def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree
@ -325,17 +320,6 @@ class HTMLParser(object):
if self.strict:
raise ParseError(E[errorcode] % datavars)
def normalizeToken(self, token):
# HTML5 specific normalizations to the token stream
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
token["data"] = OrderedDict(raw)
if len(raw) > len(token["data"]):
# we had some duplicated attribute, fix so first wins
token["data"].update(raw[::-1])
return token
def adjustMathMLAttributes(self, token):
adjust_attributes(token, adjustMathMLAttributes)
@ -413,16 +397,12 @@ class HTMLParser(object):
def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = dict((value, key) for key, value in
tokenTypes.items())
type_names = {value: key for key, value in tokenTypes.items()}
def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
token = args[0]
try:
info = {"type": type_names[token['type']]}
except:
raise
info = {"type": type_names[token['type']]}
if token['type'] in tagTokenTypes:
info["name"] = token['name']
@ -446,10 +426,13 @@ def getPhases(debug):
class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing
"""
__slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
def __init__(self, parser, tree):
self.parser = parser
self.tree = tree
self.__startTagCache = {}
self.__endTagCache = {}
def processEOF(self):
raise NotImplementedError
@ -469,7 +452,21 @@ def getPhases(debug):
self.tree.insertText(token["data"])
def processStartTag(self, token):
return self.startTagHandler[token["name"]](token)
# Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
# (CPython 2.7, 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
if name in self.__startTagCache:
func = self.__startTagCache[name]
else:
func = self.__startTagCache[name] = self.startTagHandler[name]
# bound the cache size in case we get loads of unknown tags
while len(self.__startTagCache) > len(self.startTagHandler) * 1.1:
# this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
self.__startTagCache.pop(next(iter(self.__startTagCache)))
return func(token)
def startTagHtml(self, token):
if not self.parser.firstStartTag and token["name"] == "html":
@ -482,9 +479,25 @@ def getPhases(debug):
self.parser.firstStartTag = False
def processEndTag(self, token):
return self.endTagHandler[token["name"]](token)
# Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
# (CPython 2.7, 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
if name in self.__endTagCache:
func = self.__endTagCache[name]
else:
func = self.__endTagCache[name] = self.endTagHandler[name]
# bound the cache size in case we get loads of unknown tags
while len(self.__endTagCache) > len(self.endTagHandler) * 1.1:
# this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
self.__endTagCache.pop(next(iter(self.__endTagCache)))
return func(token)
class InitialPhase(Phase):
__slots__ = tuple()
def processSpaceCharacters(self, token):
pass
@ -613,6 +626,8 @@ def getPhases(debug):
return True
class BeforeHtmlPhase(Phase):
__slots__ = tuple()
# helper methods
def insertHtmlElement(self):
self.tree.insertRoot(impliedTagToken("html", "StartTag"))
@ -648,19 +663,7 @@ def getPhases(debug):
return token
class BeforeHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
def processEOF(self):
self.startTagHead(impliedTagToken("head", "StartTag"))
@ -693,28 +696,19 @@ def getPhases(debug):
self.parser.parseError("end-tag-after-implied-root",
{"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), endTagImplyHead)
])
endTagHandler.default = endTagOther
class InHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("title", self.startTagTitle),
(("noframes", "style"), self.startTagNoFramesStyle),
("noscript", self.startTagNoscript),
("script", self.startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
self.startTagBaseLinkCommand),
("meta", self.startTagMeta),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("head", self.endTagHead),
(("br", "html", "body"), self.endTagHtmlBodyBr)
])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
# the real thing
def processEOF(self):
@ -796,22 +790,27 @@ def getPhases(debug):
def anythingElse(self):
self.endTagHead(impliedTagToken("head"))
class InHeadNoscriptPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("title", startTagTitle),
(("noframes", "style"), startTagNoFramesStyle),
("noscript", startTagNoscript),
("script", startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
startTagBaseLinkCommand),
("meta", startTagMeta),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("head", endTagHead),
(("br", "html", "body"), endTagHtmlBodyBr)
])
endTagHandler.default = endTagOther
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
(("head", "noscript"), self.startTagHeadNoscript),
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("noscript", self.endTagNoscript),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
class InHeadNoscriptPhase(Phase):
__slots__ = tuple()
def processEOF(self):
self.parser.parseError("eof-in-head-noscript")
@ -860,23 +859,21 @@ def getPhases(debug):
# Caller must raise parse error first!
self.endTagNoscript(impliedTagToken("noscript"))
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
(("head", "noscript"), startTagHeadNoscript),
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("noscript", endTagNoscript),
("br", endTagBr),
])
endTagHandler.default = endTagOther
class AfterHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
"style", "title"),
self.startTagFromHead),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
self.endTagHtmlBodyBr)])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
def processEOF(self):
self.anythingElse()
@ -927,80 +924,30 @@ def getPhases(debug):
self.parser.phase = self.parser.phases["inBody"]
self.parser.framesetOK = True
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("body", startTagBody),
("frameset", startTagFrameset),
(("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
"style", "title"),
startTagFromHead),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
endTagHtmlBodyBr)])
endTagHandler.default = endTagOther
class InBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
# the really-really-really-very crazy mode
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
__slots__ = ("processSpaceCharacters",)
def __init__(self, *args, **kwargs):
super(InBodyPhase, self).__init__(*args, **kwargs)
# Set this to the default handler
self.processSpaceCharacters = self.processSpaceCharactersNonPre
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"),
self.startTagProcessInHead),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details",
"dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"),
self.startTagCloseP),
(headingElements, self.startTagHeading),
(("pre", "listing"), self.startTagPreListing),
("form", self.startTagForm),
(("li", "dd", "dt"), self.startTagListItem),
("plaintext", self.startTagPlaintext),
("a", self.startTagA),
(("b", "big", "code", "em", "font", "i", "s", "small", "strike",
"strong", "tt", "u"), self.startTagFormatting),
("nobr", self.startTagNobr),
("button", self.startTagButton),
(("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
("xmp", self.startTagXmp),
("table", self.startTagTable),
(("area", "br", "embed", "img", "keygen", "wbr"),
self.startTagVoidFormatting),
(("param", "source", "track"), self.startTagParamSource),
("input", self.startTagInput),
("hr", self.startTagHr),
("image", self.startTagImage),
("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame),
("noscript", self.startTagNoscript),
(("noembed", "noframes"), self.startTagRawtext),
("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt),
(("math"), self.startTagMath),
(("svg"), self.startTagSvg),
(("caption", "col", "colgroup", "frame", "head",
"tbody", "td", "tfoot", "th", "thead",
"tr"), self.startTagMisplaced)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("body", self.endTagBody),
("html", self.endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center",
"details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
"section", "summary", "ul"), self.endTagBlock),
("form", self.endTagForm),
("p", self.endTagP),
(("dd", "dt", "li"), self.endTagListItem),
(headingElements, self.endTagHeading),
(("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
"strike", "strong", "tt", "u"), self.endTagFormatting),
(("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def isMatchingFormattingElement(self, node1, node2):
return (node1.name == node2.name and
node1.namespace == node2.namespace and
@ -1650,14 +1597,73 @@ def getPhases(debug):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
break
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"),
startTagProcessInHead),
("body", startTagBody),
("frameset", startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details",
"dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"),
startTagCloseP),
(headingElements, startTagHeading),
(("pre", "listing"), startTagPreListing),
("form", startTagForm),
(("li", "dd", "dt"), startTagListItem),
("plaintext", startTagPlaintext),
("a", startTagA),
(("b", "big", "code", "em", "font", "i", "s", "small", "strike",
"strong", "tt", "u"), startTagFormatting),
("nobr", startTagNobr),
("button", startTagButton),
(("applet", "marquee", "object"), startTagAppletMarqueeObject),
("xmp", startTagXmp),
("table", startTagTable),
(("area", "br", "embed", "img", "keygen", "wbr"),
startTagVoidFormatting),
(("param", "source", "track"), startTagParamSource),
("input", startTagInput),
("hr", startTagHr),
("image", startTagImage),
("isindex", startTagIsIndex),
("textarea", startTagTextarea),
("iframe", startTagIFrame),
("noscript", startTagNoscript),
(("noembed", "noframes"), startTagRawtext),
("select", startTagSelect),
(("rp", "rt"), startTagRpRt),
(("option", "optgroup"), startTagOpt),
(("math"), startTagMath),
(("svg"), startTagSvg),
(("caption", "col", "colgroup", "frame", "head",
"tbody", "td", "tfoot", "th", "thead",
"tr"), startTagMisplaced)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("body", endTagBody),
("html", endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center",
"details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
"section", "summary", "ul"), endTagBlock),
("form", endTagForm),
("p", endTagP),
(("dd", "dt", "li"), endTagListItem),
(headingElements, endTagHeading),
(("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
"strike", "strong", "tt", "u"), endTagFormatting),
(("applet", "marquee", "object"), endTagAppletMarqueeObject),
("br", endTagBr),
])
endTagHandler.default = endTagOther
class TextPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("script", self.endTagScript)])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
def processCharacters(self, token):
self.tree.insertText(token["data"])
@ -1683,30 +1689,15 @@ def getPhases(debug):
self.tree.openElements.pop()
self.parser.phase = self.parser.originalPhase
startTagHandler = _utils.MethodDispatcher([])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("script", endTagScript)])
endTagHandler.default = endTagOther
class InTablePhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("caption", self.startTagCaption),
("colgroup", self.startTagColgroup),
("col", self.startTagCol),
(("tbody", "tfoot", "thead"), self.startTagRowGroup),
(("td", "th", "tr"), self.startTagImplyTbody),
("table", self.startTagTable),
(("style", "script"), self.startTagStyleScript),
("input", self.startTagInput),
("form", self.startTagForm)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
# helper methods
def clearStackToTableContext(self):
@ -1828,9 +1819,32 @@ def getPhases(debug):
self.parser.phases["inBody"].processEndTag(token)
self.tree.insertFromTable = False
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("caption", startTagCaption),
("colgroup", startTagColgroup),
("col", startTagCol),
(("tbody", "tfoot", "thead"), startTagRowGroup),
(("td", "th", "tr"), startTagImplyTbody),
("table", startTagTable),
(("style", "script"), startTagStyleScript),
("input", startTagInput),
("form", startTagForm)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("table", endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), endTagIgnore)
])
endTagHandler.default = endTagOther
class InTableTextPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
__slots__ = ("originalPhase", "characterTokens")
def __init__(self, *args, **kwargs):
super(InTableTextPhase, self).__init__(*args, **kwargs)
self.originalPhase = None
self.characterTokens = []
@ -1875,23 +1889,7 @@ def getPhases(debug):
class InCaptionPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableElement)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("caption", self.endTagCaption),
("table", self.endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
def ignoreEndTagCaption(self):
return not self.tree.elementInScope("caption", variant="table")
@ -1944,23 +1942,24 @@ def getPhases(debug):
def endTagOther(self, token):
return self.parser.phases["inBody"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), startTagTableElement)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("caption", endTagCaption),
("table", endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
"thead", "tr"), endTagIgnore)
])
endTagHandler.default = endTagOther
class InColumnGroupPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("col", self.startTagCol)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("colgroup", self.endTagColgroup),
("col", self.endTagCol)
])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
def ignoreEndTagColgroup(self):
return self.tree.openElements[-1].name == "html"
@ -2010,26 +2009,21 @@ def getPhases(debug):
if not ignoreEndTag:
return token
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("col", startTagCol)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("colgroup", endTagColgroup),
("col", endTagCol)
])
endTagHandler.default = endTagOther
class InTableBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("tr", self.startTagTr),
(("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th",
"tr"), self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
# helper methods
def clearStackToTableBodyContext(self):
@ -2108,26 +2102,26 @@ def getPhases(debug):
def endTagOther(self, token):
return self.parser.phases["inTable"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("tr", startTagTr),
(("td", "th"), startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
startTagTableOther)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("tbody", "tfoot", "thead"), endTagTableRowGroup),
("table", endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th",
"tr"), endTagIgnore)
])
endTagHandler.default = endTagOther
class InRowPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead",
"tr"), self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("tr", self.endTagTr),
("table", self.endTagTable),
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
(("body", "caption", "col", "colgroup", "html", "td", "th"),
self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
# helper methods (XXX unify this with other table helper methods)
def clearStackToTableRowContext(self):
@ -2197,23 +2191,26 @@ def getPhases(debug):
def endTagOther(self, token):
return self.parser.phases["inTable"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("td", "th"), startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead",
"tr"), startTagTableOther)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("tr", endTagTr),
("table", endTagTable),
(("tbody", "tfoot", "thead"), endTagTableRowGroup),
(("body", "caption", "col", "colgroup", "html", "td", "th"),
endTagIgnore)
])
endTagHandler.default = endTagOther
class InCellPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
(("td", "th"), self.endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
# helper
def closeCell(self):
@ -2273,26 +2270,22 @@ def getPhases(debug):
def endTagOther(self, token):
return self.parser.phases["inBody"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), startTagTableOther)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("td", "th"), endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
])
endTagHandler.default = endTagOther
class InSelectPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("option", self.startTagOption),
("optgroup", self.startTagOptgroup),
("select", self.startTagSelect),
(("input", "keygen", "textarea"), self.startTagInput),
("script", self.startTagScript)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("option", self.endTagOption),
("optgroup", self.endTagOptgroup),
("select", self.endTagSelect)
])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
def processEOF(self):
@ -2373,21 +2366,25 @@ def getPhases(debug):
self.parser.parseError("unexpected-end-tag-in-select",
{"name": token["name"]})
class InSelectInTablePhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.startTagTable)
])
self.startTagHandler.default = self.startTagOther
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("option", startTagOption),
("optgroup", startTagOptgroup),
("select", startTagSelect),
(("input", "keygen", "textarea"), startTagInput),
("script", startTagScript)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("option", endTagOption),
("optgroup", endTagOptgroup),
("select", endTagSelect)
])
endTagHandler.default = endTagOther
self.endTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.endTagTable)
])
self.endTagHandler.default = self.endTagOther
class InSelectInTablePhase(Phase):
__slots__ = tuple()
def processEOF(self):
self.parser.phases["inSelect"].processEOF()
@ -2412,7 +2409,21 @@ def getPhases(debug):
def endTagOther(self, token):
return self.parser.phases["inSelect"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
startTagTable)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
endTagTable)
])
endTagHandler.default = endTagOther
class InForeignContentPhase(Phase):
__slots__ = tuple()
breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
"center", "code", "dd", "div", "dl", "dt",
"em", "embed", "h1", "h2", "h3",
@ -2422,9 +2433,6 @@ def getPhases(debug):
"span", "strong", "strike", "sub", "sup",
"table", "tt", "u", "ul", "var"])
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
def adjustSVGTagNames(self, token):
replacements = {"altglyph": "altGlyph",
"altglyphdef": "altGlyphDef",
@ -2478,7 +2486,7 @@ def getPhases(debug):
currentNode = self.tree.openElements[-1]
if (token["name"] in self.breakoutElements or
(token["name"] == "font" and
set(token["data"].keys()) & set(["color", "face", "size"]))):
set(token["data"].keys()) & {"color", "face", "size"})):
self.parser.parseError("unexpected-html-element-in-foreign-content",
{"name": token["name"]})
while (self.tree.openElements[-1].namespace !=
@ -2528,16 +2536,7 @@ def getPhases(debug):
return new_token
class AfterBodyPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler.default = self.endTagOther
__slots__ = tuple()
def processEOF(self):
# Stop parsing
@ -2574,23 +2573,17 @@ def getPhases(debug):
self.parser.phase = self.parser.phases["inBody"]
return token
class InFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml)
])
startTagHandler.default = startTagOther
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("frameset", self.startTagFrameset),
("frame", self.startTagFrame),
("noframes", self.startTagNoframes)
])
self.startTagHandler.default = self.startTagOther
endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
endTagHandler.default = endTagOther
self.endTagHandler = _utils.MethodDispatcher([
("frameset", self.endTagFrameset)
])
self.endTagHandler.default = self.endTagOther
class InFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
__slots__ = tuple()
def processEOF(self):
if self.tree.openElements[-1].name != "html":
@ -2631,21 +2624,22 @@ def getPhases(debug):
self.parser.parseError("unexpected-end-tag-in-frameset",
{"name": token["name"]})
class AfterFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#after3
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("frameset", startTagFrameset),
("frame", startTagFrame),
("noframes", startTagNoframes)
])
startTagHandler.default = startTagOther
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("noframes", self.startTagNoframes)
])
self.startTagHandler.default = self.startTagOther
endTagHandler = _utils.MethodDispatcher([
("frameset", endTagFrameset)
])
endTagHandler.default = endTagOther
self.endTagHandler = _utils.MethodDispatcher([
("html", self.endTagHtml)
])
self.endTagHandler.default = self.endTagOther
class AfterFramesetPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#after3
__slots__ = tuple()
def processEOF(self):
# Stop parsing
@ -2668,14 +2662,19 @@ def getPhases(debug):
self.parser.parseError("unexpected-end-tag-after-frameset",
{"name": token["name"]})
class AfterAfterBodyPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("noframes", startTagNoframes)
])
startTagHandler.default = startTagOther
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml)
])
self.startTagHandler.default = self.startTagOther
endTagHandler = _utils.MethodDispatcher([
("html", endTagHtml)
])
endTagHandler.default = endTagOther
class AfterAfterBodyPhase(Phase):
__slots__ = tuple()
def processEOF(self):
pass
@ -2706,15 +2705,13 @@ def getPhases(debug):
self.parser.phase = self.parser.phases["inBody"]
return token
class AfterAfterFramesetPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml)
])
startTagHandler.default = startTagOther
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("noframes", self.startTagNoFrames)
])
self.startTagHandler.default = self.startTagOther
class AfterAfterFramesetPhase(Phase):
__slots__ = tuple()
def processEOF(self):
pass
@ -2741,6 +2738,13 @@ def getPhases(debug):
def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("noframes", startTagNoFrames)
])
startTagHandler.default = startTagOther
# pylint:enable=unused-argument
return {
@ -2774,8 +2778,8 @@ def getPhases(debug):
def adjust_attributes(token, replacements):
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
if needs_adjustment:
token['data'] = OrderedDict((replacements.get(k, k), v)
for k, v in token['data'].items())
token['data'] = type(token['data'])((replacements.get(k, k), v)
for k, v in token['data'].items())
def impliedTagToken(name, type="EndTag", attributes=None,

@ -274,7 +274,7 @@ class HTMLSerializer(object):
if token["systemId"]:
if token["systemId"].find('"') >= 0:
if token["systemId"].find("'") >= 0:
self.serializeError("System identifer contains both single and double quote characters")
self.serializeError("System identifier contains both single and double quote characters")
quote_char = "'"
else:
quote_char = '"'

@ -0,0 +1,433 @@
[
{
"name": "IE_Comments",
"input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
"output": ""
},
{
"name": "IE_Comments_2",
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
"output": "&lt;script&gt;alert('XSS');&lt;/script&gt;"
},
{
"name": "allow_colons_in_path_component",
"input": "<a href=\"./this:that\">foo</a>",
"output": "<a href='./this:that'>foo</a>"
},
{
"name": "background_attribute",
"input": "<div background=\"javascript:alert('XSS')\"></div>",
"output": "<div></div>"
},
{
"name": "bgsound",
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
"output": "&lt;bgsound src=\"javascript:alert('XSS');\"&gt;&lt;/bgsound&gt;"
},
{
"name": "div_background_image_unicode_encoded",
"input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
"output": "<div style=''>foo</div>"
},
{
"name": "div_expression",
"input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
"output": "<div style=''>foo</div>"
},
{
"name": "double_open_angle_brackets",
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
"output": ""
},
{
"name": "double_open_angle_brackets_2",
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
"output": ""
},
{
"name": "grave_accents",
"input": "<img src=`javascript:alert('XSS')` />",
"output": "<img/>"
},
{
"name": "img_dynsrc_lowsrc",
"input": "<img dynsrc=\"javascript:alert('XSS')\" />",
"output": "<img/>"
},
{
"name": "img_vbscript",
"input": "<img src='vbscript:msgbox(\"XSS\")' />",
"output": "<img/>"
},
{
"name": "input_image",
"input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
"output": "<input type='image'/>"
},
{
"name": "link_stylesheets",
"input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
"output": "&lt;link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"&gt;"
},
{
"name": "link_stylesheets_2",
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
"output": "&lt;link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"&gt;"
},
{
"name": "list_style_image",
"input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
"output": "<li style=''>foo</li>"
},
{
"name": "no_closing_script_tags",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
},
{
"name": "non_alpha_non_digit",
"input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
},
{
"name": "non_alpha_non_digit_2",
"input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
"output": "<a>foo</a>"
},
{
"name": "non_alpha_non_digit_3",
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
"output": "<img src='http://ha.ckers.org/xss.js'/>"
},
{
"name": "non_alpha_non_digit_II",
"input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
"output": "<a>foo</a>"
},
{
"name": "non_alpha_non_digit_III",
"input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
"output": "<a>foo</a>"
},
{
"name": "platypus",
"input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
"output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
},
{
"name": "protocol_resolution_in_script_tag",
"input": "<script src=//ha.ckers.org/.j></script>",
"output": "&lt;script src=\"//ha.ckers.org/.j\"&gt;&lt;/script&gt;"
},
{
"name": "should_allow_anchors",
"input": "<a href='foo' onclick='bar'><script>baz</script></a>",
"output": "<a href='foo'>&lt;script&gt;baz&lt;/script&gt;</a>"
},
{
"name": "should_allow_image_alt_attribute",
"input": "<img alt='foo' onclick='bar' />",
"output": "<img alt='foo'/>"
},
{
"name": "should_allow_image_height_attribute",
"input": "<img height='foo' onclick='bar' />",
"output": "<img height='foo'/>"
},
{
"name": "should_allow_image_src_attribute",
"input": "<img src='foo' onclick='bar' />",
"output": "<img src='foo'/>"
},
{
"name": "should_allow_image_width_attribute",
"input": "<img width='foo' onclick='bar' />",
"output": "<img width='foo'/>"
},
{
"name": "should_handle_blank_text",
"input": "",
"output": ""
},
{
"name": "should_handle_malformed_image_tags",
"input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
"output": "<img/>&lt;script&gt;alert(\"XSS\")&lt;/script&gt;\"&gt;"
},
{
"name": "should_handle_non_html",
"input": "abc",
"output": "abc"
},
{
"name": "should_not_fall_for_ridiculous_hack",
"input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_0",
"input": "<img src=\"javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_1",
"input": "<img src=javascript:alert('XSS') />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_10",
"input": "<img src=\"jav&#x0A;ascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_11",
"input": "<img src=\"jav&#x0D;ascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_12",
"input": "<img src=\" &#14; javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_13",
"input": "<img src=\"&#x20;javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_14",
"input": "<img src=\"&#xA0;javascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_2",
"input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_3",
"input": "<img src='javascript:alert(&quot;XSS&quot;)' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_4",
"input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_5",
"input": "<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_6",
"input": "<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_7",
"input": "<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_8",
"input": "<img src=\"jav\tascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_not_fall_for_xss_image_hack_9",
"input": "<img src=\"jav&#x09;ascript:alert('XSS');\" />",
"output": "<img/>"
},
{
"name": "should_sanitize_half_open_scripts",
"input": "<img src=\"javascript:alert('XSS')\"",
"output": ""
},
{
"name": "should_sanitize_invalid_script_tag",
"input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
},
{
"name": "should_sanitize_script_tag_with_multiple_open_brackets",
"input": "<<script>alert(\"XSS\");//<</script>",
"output": "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;"
},
{
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
"output": ""
},
{
"name": "should_sanitize_tag_broken_up_by_null",
"input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
"output": "&lt;scr\ufffdipt&gt;alert(\"XSS\")&lt;/scr\ufffdipt&gt;"
},
{
"name": "should_sanitize_unclosed_script",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
},
{
"name": "should_strip_href_attribute_in_a_with_bad_protocols",
"input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
"output": "<a title='1'>boo</a>"
},
{
"name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
"input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
"output": "<a title='1'>boo</a>"
},
{
"name": "should_strip_src_attribute_in_img_with_bad_protocols",
"input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
"output": "<img title='1'/>boo"
},
{
"name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
"input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
"output": "<img title='1'/>boo"
},
{
"name": "xml_base",
"input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
"output": "<div>foo</div>"
},
{
"name": "xul",
"input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
"output": "<p style=''>fubar</p>"
},
{
"name": "quotes_in_attributes",
"input": "<img src='foo' title='\"foo\" bar' />",
"output": "<img src='foo' title='\"foo\" bar'/>"
},
{
"name": "uri_refs_in_svg_attributes",
"input": "<svg><rect fill='url(#foo)' />",
"output": "<svg><rect fill='url(#foo)'></rect></svg>"
},
{
"name": "absolute_uri_refs_in_svg_attributes",
"input": "<svg><rect fill='url(http://bad.com/) #fff' />",
"output": "<svg><rect fill=' #fff'></rect></svg>"
},
{
"name": "uri_ref_with_space_in svg_attribute",
"input": "<svg><rect fill='url(\n#foo)' />",
"output": "<svg><rect fill='url(\n#foo)'></rect></svg>"
},
{
"name": "absolute_uri_ref_with_space_in svg_attribute",
"input": "<svg><rect fill=\"url(\nhttp://bad.com/)\" />",
"output": "<svg><rect fill=' '></rect></svg>"
},
{
"name": "allow_html5_image_tag",
"input": "<image src='foo' />",
"output": "<img src='foo'/>"
},
{
"name": "style_attr_end_with_nothing",
"input": "<div style=\"color: blue\" />",
"output": "<div style='color: blue;'></div>"
},
{
"name": "style_attr_end_with_space",
"input": "<div style=\"color: blue \" />",
"output": "<div style='color: blue ;'></div>"
},
{
"name": "style_attr_end_with_semicolon",
"input": "<div style=\"color: blue;\" />",
"output": "<div style='color: blue;'></div>"
},
{
"name": "style_attr_end_with_semicolon_space",
"input": "<div style=\"color: blue; \" />",
"output": "<div style='color: blue;'></div>"
},
{
"name": "attributes_with_embedded_quotes",
"input": "<img src=doesntexist.jpg\"'onerror=\"alert(1) />",
"output": "<img src='doesntexist.jpg\"&#39;onerror=\"alert(1)'/>"
},
{
"name": "attributes_with_embedded_quotes_II",
"input": "<img src=notthere.jpg\"\"onerror=\"alert(2) />",
"output": "<img src='notthere.jpg\"\"onerror=\"alert(2)'/>"
}
]

@ -27,14 +27,15 @@ class SanitizerTest(pytest.Item):
expected = self.test["output"]
parsed = parseFragment(input)
serialized = serialize(parsed,
sanitize=True,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char="'",
alphabetical_attributes=True)
with pytest.deprecated_call():
serialized = serialize(parsed,
sanitize=True,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char="'",
alphabetical_attributes=True)
errorMsg = "\n".join(["\n\nInput:", input,
"\nExpected:", expected,
"\nReceived:", serialized])

@ -0,0 +1,395 @@
{
"tests": [
{
"expected": [
"<span title='test \"with\" &amp;quot;'>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "test \"with\" &quot;"
}
]
]
],
"description": "proper attribute value escaping"
},
{
"expected": [
"<span title=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo"
}
]
]
],
"description": "proper attribute value non-quoting"
},
{
"expected": [
"<span title=\"foo<bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo<bar"
}
]
]
],
"description": "proper attribute value non-quoting (with <)"
},
{
"expected": [
"<span title=\"foo=bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo=bar"
}
]
]
],
"description": "proper attribute value quoting (with =)"
},
{
"expected": [
"<span title=\"foo>bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo>bar"
}
]
]
],
"description": "proper attribute value quoting (with >)"
},
{
"expected": [
"<span title='foo\"bar'>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\"bar"
}
]
]
],
"description": "proper attribute value quoting (with \")"
},
{
"expected": [
"<span title=\"foo'bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo'bar"
}
]
]
],
"description": "proper attribute value quoting (with ')"
},
{
"expected": [
"<span title=\"foo'bar&quot;baz\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo'bar\"baz"
}
]
]
],
"description": "proper attribute value quoting (with both \" and ')"
},
{
"expected": [
"<span title=\"foo bar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo bar"
}
]
]
],
"description": "proper attribute value quoting (with space)"
},
{
"expected": [
"<span title=\"foo\tbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\tbar"
}
]
]
],
"description": "proper attribute value quoting (with tab)"
},
{
"expected": [
"<span title=\"foo\nbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\nbar"
}
]
]
],
"description": "proper attribute value quoting (with LF)"
},
{
"expected": [
"<span title=\"foo\rbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\rbar"
}
]
]
],
"description": "proper attribute value quoting (with CR)"
},
{
"expected": [
"<span title=\"foo\u000bbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\u000bbar"
}
]
]
],
"description": "proper attribute value non-quoting (with linetab)"
},
{
"expected": [
"<span title=\"foo\fbar\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "foo\fbar"
}
]
]
],
"description": "proper attribute value quoting (with form feed)"
},
{
"expected": [
"<img>"
],
"input": [
[
"EmptyTag",
"img",
{}
]
],
"description": "void element (as EmptyTag token)"
},
{
"expected": [
"<!DOCTYPE foo>"
],
"input": [
[
"Doctype",
"foo"
]
],
"description": "doctype in error"
},
{
"expected": [
"a&lt;b&gt;c&amp;d"
],
"input": [
[
"Characters",
"a<b>c&d"
]
],
"description": "character data",
"options": {
"encoding": "utf-8"
}
},
{
"expected": [
"<script>a<b>c&d"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"script",
{}
],
[
"Characters",
"a<b>c&d"
]
],
"description": "rcdata"
},
{
"expected": [
"<!DOCTYPE HTML>"
],
"input": [
[
"Doctype",
"HTML"
]
],
"description": "doctype"
},
{
"expected": [
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"
],
"input": [
[
"Doctype",
"HTML",
"-//W3C//DTD HTML 4.01//EN",
"http://www.w3.org/TR/html4/strict.dtd"
]
],
"description": "HTML 4.01 DOCTYPE"
},
{
"expected": [
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">"
],
"input": [
[
"Doctype",
"HTML",
"-//W3C//DTD HTML 4.01//EN"
]
],
"description": "HTML 4.01 DOCTYPE without system identifier"
},
{
"expected": [
"<!DOCTYPE html SYSTEM \"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd\">"
],
"input": [
[
"Doctype",
"html",
"",
"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
]
],
"description": "IBM DOCTYPE without public identifier"
}
]
}

@ -0,0 +1,350 @@
{
"tests": [
{
"expected": [
""
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "no encoding",
"options": {
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "empytag head",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8><title>foo</title>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"title",
{}
],
[
"Characters",
"foo"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"title"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/title",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/meta-charset",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8><meta charset=utf-8>",
"<head><meta charset=utf-8><meta charset=ascii>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/ two meta-charset",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta charset=utf-8><meta content=noindex name=robots>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "name",
"value": "robots"
},
{
"namespace": null,
"name": "content",
"value": "noindex"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/robots",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta content=noindex name=robots><meta charset=utf-8>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "name",
"value": "robots"
},
{
"namespace": null,
"name": "content",
"value": "noindex"
}
]
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "charset",
"value": "ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/robots & charset",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "http-equiv",
"value": "content-type"
},
{
"namespace": null,
"name": "content",
"value": "text/html; charset=ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/ charset in http-equiv content-type",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
},
{
"expected": [
"<meta content=noindex name=robots><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"head",
{}
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "name",
"value": "robots"
},
{
"namespace": null,
"name": "content",
"value": "noindex"
}
]
],
[
"EmptyTag",
"meta",
[
{
"namespace": null,
"name": "http-equiv",
"value": "content-type"
},
{
"namespace": null,
"name": "content",
"value": "text/html; charset=ascii"
}
]
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"head"
]
],
"description": "head w/robots & charset in http-equiv content-type",
"options": {
"encoding": "utf-8",
"inject_meta_charset": true
}
}
]
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,334 @@
{
"tests": [
{
"expected": [
"<span title='test &#39;with&#39; quote_char'>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
[
{
"namespace": null,
"name": "title",
"value": "test 'with' quote_char"
}
]
]
],
"description": "quote_char=\"'\"",
"options": {
"quote_char": "'"
}
},
{
"expected": [
"<button disabled>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"button",
[
{
"namespace": null,
"name": "disabled",
"value": "disabled"
}
]
]
],
"description": "quote_attr_values='always'",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div itemscope>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "itemscope",
"value": "itemscope"
}
]
]
],
"description": "quote_attr_values='always' with itemscope",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div irrelevant>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "irrelevant",
"value": "irrelevant"
}
]
]
],
"description": "quote_attr_values='always' with irrelevant",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=\"foo\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='always'",
"options": {
"quote_attr_values": "always"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='legacy'",
"options": {
"quote_attr_values": "legacy"
}
},
{
"expected": [
"<div class=foo>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "class",
"value": "foo"
}
]
]
],
"description": "non-minimized quote_attr_values='spec'",
"options": {
"quote_attr_values": "spec"
}
},
{
"expected": [
"<img />"
],
"input": [
[
"EmptyTag",
"img",
{}
]
],
"description": "use_trailing_solidus=true with void element",
"options": {
"use_trailing_solidus": true
}
},
{
"expected": [
"<div>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
{}
]
],
"description": "use_trailing_solidus=true with non-void element",
"options": {
"use_trailing_solidus": true
}
},
{
"expected": [
"<div itemscope=itemscope>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "itemscope",
"value": "itemscope"
}
]
]
],
"description": "minimize_boolean_attributes=false",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<div irrelevant=irrelevant>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "irrelevant",
"value": "irrelevant"
}
]
]
],
"description": "minimize_boolean_attributes=false",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<div itemscope=\"\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "itemscope",
"value": ""
}
]
]
],
"description": "minimize_boolean_attributes=false with empty value",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<div irrelevant=\"\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"div",
[
{
"namespace": null,
"name": "irrelevant",
"value": ""
}
]
]
],
"description": "minimize_boolean_attributes=false with empty value",
"options": {
"minimize_boolean_attributes": false
}
},
{
"expected": [
"<a title=\"a&lt;b>c&amp;d\">"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"a",
[
{
"namespace": null,
"name": "title",
"value": "a<b>c&d"
}
]
]
],
"description": "escape less than signs in attribute values",
"options": {
"escape_lt_in_attrs": true
}
},
{
"expected": [
"<script>a&lt;b&gt;c&amp;d"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"script",
{}
],
[
"Characters",
"a<b>c&d"
]
],
"description": "rcdata",
"options": {
"escape_rcdata": true
}
}
]
}

@ -0,0 +1,198 @@
{
"tests": [
{
"expected": [
" foo"
],
"input": [
[
"Characters",
"\t\r\n\f foo"
]
],
"description": "bare text with leading spaces",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"foo "
],
"input": [
[
"Characters",
"foo \t\r\n\f"
]
],
"description": "bare text with trailing spaces",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"foo bar"
],
"input": [
[
"Characters",
"foo \t\r\n\f bar"
]
],
"description": "bare text with inner spaces",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<pre>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</pre>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"pre",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"pre"
]
],
"description": "text within <pre>",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<pre>\t\r\n\f fo<span>o \t\r\n\f b</span>ar \t\r\n\f</pre>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"pre",
{}
],
[
"Characters",
"\t\r\n\f fo"
],
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"span",
{}
],
[
"Characters",
"o \t\r\n\f b"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"span"
],
[
"Characters",
"ar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"pre"
]
],
"description": "text within <pre>, with inner markup",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<textarea>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</textarea>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"textarea",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"textarea"
]
],
"description": "text within <textarea>",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<script>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</script>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"script",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"script"
]
],
"description": "text within <script>",
"options": {
"strip_whitespace": true
}
},
{
"expected": [
"<style>\t\r\n\f foo \t\r\n\f bar \t\r\n\f</style>"
],
"input": [
[
"StartTag",
"http://www.w3.org/1999/xhtml",
"style",
{}
],
[
"Characters",
"\t\r\n\f foo \t\r\n\f bar \t\r\n\f"
],
[
"EndTag",
"http://www.w3.org/1999/xhtml",
"style"
]
],
"description": "text within <style>",
"options": {
"strip_whitespace": true
}
}
]
}

@ -143,11 +143,12 @@ def convert(stripChars):
return "\n".join(rv)
return convertData
convertExpected = convert(2)
def errorMessage(input, expected, actual):
msg = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n" %
msg = ("Input:\n%s\nExpected:\n%s\nReceived\n%s\n" %
(repr(input), repr(expected), repr(actual)))
if sys.version_info[0] == 2:
msg = msg.encode("ascii", "backslashreplace")

@ -75,7 +75,15 @@ def test_parser_args_raises(kwargs):
assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
def runParserEncodingTest(data, encoding):
def param_encoding():
for filename in get_data_files("encoding"):
tests = _TestData(filename, b"data", encoding=None)
for test in tests:
yield test[b'data'], test[b'encoding']
@pytest.mark.parametrize("data, encoding", param_encoding())
def test_parser_encoding(data, encoding):
p = HTMLParser()
assert p.documentEncoding is None
p.parse(data, useChardet=False)
@ -84,7 +92,8 @@ def runParserEncodingTest(data, encoding):
assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
def runPreScanEncodingTest(data, encoding):
@pytest.mark.parametrize("data, encoding", param_encoding())
def test_prescan_encoding(data, encoding):
stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
encoding = encoding.lower().decode("ascii")
@ -95,14 +104,6 @@ def runPreScanEncodingTest(data, encoding):
assert encoding == stream.charEncoding[0].name, errorMessage(data, encoding, stream.charEncoding[0].name)
def test_encoding():
for filename in get_data_files("encoding"):
tests = _TestData(filename, b"data", encoding=None)
for test in tests:
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
# pylint:disable=wrong-import-position
try:
import chardet # noqa

@ -28,10 +28,10 @@ def test_errorMessage():
# Assertions!
if six.PY2:
assert b"Input:\n1\nExpected:\n2\nRecieved\n3\n" == r
assert b"Input:\n1\nExpected:\n2\nReceived\n3\n" == r
else:
assert six.PY3
assert "Input:\n1\nExpected:\n2\nRecieved\n3\n" == r
assert "Input:\n1\nExpected:\n2\nReceived\n3\n" == r
assert input.__repr__.call_count == 1
assert expected.__repr__.call_count == 1

@ -1,12 +1,12 @@
from __future__ import absolute_import, division, unicode_literals
from six import PY2, text_type, unichr
from six import PY2, text_type
import io
from . import support # noqa
from html5lib.constants import namespaces, tokenTypes
from html5lib.constants import namespaces
from html5lib import parse, parseFragment, HTMLParser
@ -53,42 +53,6 @@ def test_unicode_file():
assert parse(io.StringIO("a")) is not None
def test_maintain_attribute_order():
# This is here because we impl it in parser and not tokenizer
p = HTMLParser()
# generate loads to maximize the chance a hash-based mutation will occur
attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
token = {'name': 'html',
'selfClosing': False,
'selfClosingAcknowledged': False,
'type': tokenTypes["StartTag"],
'data': attrs}
out = p.normalizeToken(token)
attr_order = list(out["data"].keys())
assert attr_order == [x for x, i in attrs]
def test_duplicate_attribute():
# This is here because we impl it in parser and not tokenizer
doc = parse('<p class=a class=b>')
el = doc[1][0]
assert el.get("class") == "a"
def test_maintain_duplicate_attribute_order():
# This is here because we impl it in parser and not tokenizer
p = HTMLParser()
attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
token = {'name': 'html',
'selfClosing': False,
'selfClosingAcknowledged': False,
'type': tokenTypes["StartTag"],
'data': attrs + [('a', len(attrs))]}
out = p.normalizeToken(token)
attr_order = list(out["data"].keys())
assert attr_order == [x for x, i in attrs]
def test_debug_log():
parser = HTMLParser(debug=True)
parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

@ -1,31 +1,22 @@
from __future__ import absolute_import, division, unicode_literals
import pytest
from html5lib import constants, parseFragment, serialize
from html5lib.filters import sanitizer
def runSanitizerTest(_, expected, input):
parsed = parseFragment(expected)
expected = serialize(parsed,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char='"',
alphabetical_attributes=True)
assert expected == sanitize_html(input)
def sanitize_html(stream):
parsed = parseFragment(stream)
serialized = serialize(parsed,
sanitize=True,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char='"',
alphabetical_attributes=True)
with pytest.deprecated_call():
serialized = serialize(parsed,
sanitize=True,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char='"',
alphabetical_attributes=True)
return serialized
@ -59,7 +50,7 @@ def test_data_uri_disallowed_type():
assert expected == sanitized
def test_sanitizer():
def param_sanitizer():
for ns, tag_name in sanitizer.allowed_elements:
if ns != constants.namespaces["html"]:
continue
@ -67,19 +58,19 @@ def test_sanitizer():
'tfoot', 'th', 'thead', 'tr', 'select']:
continue # TODO
if tag_name == 'image':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
yield ("test_should_allow_%s_tag" % tag_name,
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
elif tag_name == 'br':
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
yield ("test_should_allow_%s_tag" % tag_name,
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
elif tag_name in constants.voidElements:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
yield ("test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
else:
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
yield ("test_should_allow_%s_tag" % tag_name,
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
@ -93,7 +84,7 @@ def test_sanitizer():
attribute_value = 'foo'
if attribute_name in sanitizer.attr_val_is_uri:
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0]
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
yield ("test_should_allow_%s_attribute" % attribute_name,
"<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
"<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value))
@ -101,7 +92,7 @@ def test_sanitizer():
rest_of_uri = '//sub.domain.tld/path/object.ext'
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
yield ("test_should_allow_uppercase_%s_uris" % protocol,
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
@ -110,11 +101,26 @@ def test_sanitizer():
if protocol == 'data':
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
protocol = protocol.upper()
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
yield ("test_should_allow_uppercase_%s_uris" % protocol,
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
@pytest.mark.parametrize("expected, input",
(pytest.param(expected, input, id=id)
for id, expected, input in param_sanitizer()))
def test_sanitizer(expected, input):
parsed = parseFragment(expected)
expected = serialize(parsed,
omit_optional_tags=False,
use_trailing_solidus=True,
space_before_trailing_solidus=False,
quote_attr_values="always",
quote_char='"',
alphabetical_attributes=True)
assert expected == sanitize_html(input)
def test_lowercase_color_codes_in_style():
sanitized = sanitize_html("<p style=\"border: 1px solid #a2a2a2;\"></p>")
expected = '<p style=\"border: 1px solid #a2a2a2;\"></p>'

@ -80,7 +80,7 @@ class JsonWalker(TreeWalker):
def serialize_html(input, options):
options = dict([(str(k), v) for k, v in options.items()])
options = {str(k): v for k, v in options.items()}
encoding = options.get("encoding", None)
if "encoding" in options:
del options["encoding"]
@ -89,19 +89,6 @@ def serialize_html(input, options):
return serializer.render(stream, encoding)
def runSerializerTest(input, expected, options):
encoding = options.get("encoding", None)
if encoding:
expected = list(map(lambda x: x.encode(encoding), expected))
result = serialize_html(input, options)
if len(expected) == 1:
assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], result, str(options))
elif result not in expected:
assert False, "Expected: %s, Received: %s" % (expected, result)
def throwsWithLatin1(input):
with pytest.raises(UnicodeEncodeError):
serialize_html(input, {"encoding": "iso-8859-1"})
@ -120,13 +107,13 @@ def testDoctypeSystemId():
def testCdataCharacters():
runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
["<style>&amacr;"], {"encoding": "iso-8859-1"})
test_serializer([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
["<style>&amacr;"], {"encoding": "iso-8859-1"})
def testCharacters():
runSerializerTest([["Characters", "\u0101"]],
["&amacr;"], {"encoding": "iso-8859-1"})
test_serializer([["Characters", "\u0101"]],
["&amacr;"], {"encoding": "iso-8859-1"})
def testStartTagName():
@ -138,9 +125,9 @@ def testAttributeName():
def testAttributeValue():
runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "span",
[{"namespace": None, "name": "potato", "value": "\u0101"}]]],
["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})
test_serializer([["StartTag", "http://www.w3.org/1999/xhtml", "span",
[{"namespace": None, "name": "potato", "value": "\u0101"}]]],
["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})
def testEndTagName():
@ -165,7 +152,7 @@ def testSpecQuoteAttribute(c):
else:
output_ = ['<span foo="%s">' % c]
options_ = {"quote_attr_values": "spec"}
runSerializerTest(input_, output_, options_)
test_serializer(input_, output_, options_)
@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
@ -184,7 +171,7 @@ def testLegacyQuoteAttribute(c):
else:
output_ = ['<span foo="%s">' % c]
options_ = {"quote_attr_values": "legacy"}
runSerializerTest(input_, output_, options_)
test_serializer(input_, output_, options_)
@pytest.fixture
@ -217,9 +204,23 @@ def testEntityNoResolve(lxml_parser):
assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
def test_serializer():
def param_serializer():
for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
with open(filename) as fp:
tests = json.load(fp)
for test in tests['tests']:
yield runSerializerTest, test["input"], test["expected"], test.get("options", {})
yield test["input"], test["expected"], test.get("options", {})
@pytest.mark.parametrize("input, expected, options", param_serializer())
def test_serializer(input, expected, options):
encoding = options.get("encoding", None)
if encoding:
expected = list(map(lambda x: x.encode(encoding), expected))
result = serialize_html(input, options)
if len(expected) == 1:
assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], result, str(options))
elif result not in expected:
assert False, "Expected: %s, Received: %s" % (expected, result)

@ -308,9 +308,11 @@ def test_invalid_codepoints(inp, num):
("'\\uD800\\uD800\\uD800'", 3),
("'a\\uD800a\\uD800a\\uD800a'", 3),
("'\\uDFFF\\uDBFF'", 2),
pytest.mark.skipif(sys.maxunicode == 0xFFFF,
("'\\uDBFF\\uDFFF'", 2),
reason="narrow Python")])
pytest.param(
"'\\uDBFF\\uDFFF'", 2,
marks=pytest.mark.skipif(
sys.maxunicode == 0xFFFF,
reason="narrow Python"))])
def test_invalid_codepoints_surrogates(inp, num):
inp = eval(inp) # pylint:disable=eval-used
fp = StringIO(inp)

@ -0,0 +1,66 @@
from __future__ import absolute_import, division, unicode_literals
import io
from six import unichr, text_type
from html5lib._tokenizer import HTMLTokenizer
from html5lib.constants import tokenTypes
def ignore_parse_errors(toks):
for tok in toks:
if tok['type'] != tokenTypes['ParseError']:
yield tok
def test_maintain_attribute_order():
# generate loads to maximize the chance a hash-based mutation will occur
attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + ">")
toks = HTMLTokenizer(stream)
out = list(ignore_parse_errors(toks))
assert len(out) == 1
assert out[0]['type'] == tokenTypes['StartTag']
attrs_tok = out[0]['data']
assert len(attrs_tok) == len(attrs)
for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
assert in_name == out_name
assert in_value == out_value
def test_duplicate_attribute():
stream = io.StringIO("<span a=1 a=2 a=3>")
toks = HTMLTokenizer(stream)
out = list(ignore_parse_errors(toks))
assert len(out) == 1
assert out[0]['type'] == tokenTypes['StartTag']
attrs_tok = out[0]['data']
assert len(attrs_tok) == 1
assert list(attrs_tok.items()) == [('a', '1')]
def test_maintain_duplicate_attribute_order():
# generate loads to maximize the chance a hash-based mutation will occur
attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
stream = io.StringIO("<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + " a=100>")
toks = HTMLTokenizer(stream)
out = list(ignore_parse_errors(toks))
assert len(out) == 1
assert out[0]['type'] == tokenTypes['StartTag']
attrs_tok = out[0]['data']
assert len(attrs_tok) == len(attrs)
for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_tok.items()):
assert in_name == out_name
assert in_value == out_value

@ -1,7 +1,9 @@
from __future__ import absolute_import, division, unicode_literals
import itertools
import sys
from six import unichr, text_type
import pytest
try:
@ -61,24 +63,7 @@ def set_attribute_on_first_child(docfrag, name, value, treeName):
setter['ElementTree'](docfrag)(name, value)
def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
"""tests what happens when we add attributes to the intext"""
treeName, treeClass = tree
if treeClass is None:
pytest.skip("Treebuilder not loaded")
parser = html5parser.HTMLParser(tree=treeClass["builder"])
document = parser.parseFragment(intext)
for nom, val in attrs_to_add:
set_attribute_on_first_child(document, nom, val, treeName)
document = treeClass.get("adapter", lambda x: x)(document)
output = treewalkers.pprint(treeClass["walker"](document))
output = attrlist.sub(sortattrs, output)
if output not in expected:
raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
def test_treewalker_six_mix():
def param_treewalker_six_mix():
"""Str/Unicode mix. If str attrs added to tree"""
# On Python 2.x string literals are of type str. Unless, like this
@ -99,7 +84,25 @@ def test_treewalker_six_mix():
for tree in sorted(treeTypes.items()):
for intext, attrs, expected in sm_tests:
yield runTreewalkerEditTest, intext, expected, attrs, tree
yield intext, expected, attrs, tree
@pytest.mark.parametrize("intext, expected, attrs_to_add, tree", param_treewalker_six_mix())
def test_treewalker_six_mix(intext, expected, attrs_to_add, tree):
"""tests what happens when we add attributes to the intext"""
treeName, treeClass = tree
if treeClass is None:
pytest.skip("Treebuilder not loaded")
parser = html5parser.HTMLParser(tree=treeClass["builder"])
document = parser.parseFragment(intext)
for nom, val in attrs_to_add:
set_attribute_on_first_child(document, nom, val, treeName)
document = treeClass.get("adapter", lambda x: x)(document)
output = treewalkers.pprint(treeClass["walker"](document))
output = attrlist.sub(sortattrs, output)
if output not in expected:
raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
@pytest.mark.parametrize("tree,char", itertools.product(sorted(treeTypes.items()), ["x", "\u1234"]))
@ -134,3 +137,69 @@ def test_lxml_xml():
output = Lint(walker(lxmltree))
assert list(output) == expected
@pytest.mark.parametrize("treeName",
[pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
pytest.mark.skipif(
treeName != "lxml" or
sys.version_info < (3, 7), reason="dict order undef")])
for treeName in sorted(treeTypes.keys())])
def test_maintain_attribute_order(treeName):
treeAPIs = treeTypes[treeName]
if treeAPIs is None:
pytest.skip("Treebuilder not loaded")
# generate loads to maximize the chance a hash-based mutation will occur
attrs = [(unichr(x), text_type(i)) for i, x in enumerate(range(ord('a'), ord('z')))]
data = "<span " + " ".join("%s='%s'" % (x, i) for x, i in attrs) + ">"
parser = html5parser.HTMLParser(tree=treeAPIs["builder"])
document = parser.parseFragment(data)
document = treeAPIs.get("adapter", lambda x: x)(document)
output = list(Lint(treeAPIs["walker"](document)))
assert len(output) == 2
assert output[0]['type'] == 'StartTag'
assert output[1]['type'] == "EndTag"
attrs_out = output[0]['data']
assert len(attrs) == len(attrs_out)
for (in_name, in_value), (out_name, out_value) in zip(attrs, attrs_out.items()):
assert (None, in_name) == out_name
assert in_value == out_value
@pytest.mark.parametrize("treeName",
[pytest.param(treeName, marks=[getattr(pytest.mark, treeName),
pytest.mark.skipif(
treeName != "lxml" or
sys.version_info < (3, 7), reason="dict order undef")])
for treeName in sorted(treeTypes.keys())])
def test_maintain_attribute_order_adjusted(treeName):
treeAPIs = treeTypes[treeName]
if treeAPIs is None:
pytest.skip("Treebuilder not loaded")
# generate loads to maximize the chance a hash-based mutation will occur
data = "<svg a=1 refx=2 b=3 xml:lang=4 c=5>"
parser = html5parser.HTMLParser(tree=treeAPIs["builder"])
document = parser.parseFragment(data)
document = treeAPIs.get("adapter", lambda x: x)(document)
output = list(Lint(treeAPIs["walker"](document)))
assert len(output) == 2
assert output[0]['type'] == 'StartTag'
assert output[1]['type'] == "EndTag"
attrs_out = output[0]['data']
assert list(attrs_out.items()) == [((None, 'a'), '1'),
((None, 'refX'), '2'),
((None, 'b'), '3'),
(('http://www.w3.org/XML/1998/namespace', 'lang'), '4'),
((None, 'c'), '5')]

@ -1,34 +0,0 @@
Credits
=======
The ``html5lib`` test data is maintained by:
- James Graham
- Geoffrey Sneddon
Contributors
------------
- Adam Barth
- Andi Sidwell
- Anne van Kesteren
- David Flanagan
- Edward Z. Yang
- Geoffrey Sneddon
- Henri Sivonen
- Ian Hickson
- Jacques Distler
- James Graham
- Lachlan Hunt
- lantis63
- Mark Pilgrim
- Mats Palmgren
- Ms2ger
- Nolan Waite
- Philip Taylor
- Rafael Weinstein
- Ryan King
- Sam Ruby
- Simon Pieters
- Thomas Broyer

@ -1,21 +0,0 @@
Copyright (c) 2006-2013 James Graham, Geoffrey Sneddon, and
other contributors
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

@ -1,51 +0,0 @@
老子《道德經》 第一~四十章
老子道經
第一章
道可道,非常道。名可名,非常名。無,名天地之始﹔有,名萬物之母。
故常無,欲以觀其妙;常有,欲以觀其徼。此兩者,同出而異名,同謂之
玄。玄之又玄,眾妙之門。
第二章
天下皆知美之為美,斯惡矣﹔皆知善之為善,斯不善矣。故有無相生,難
易相成,長短相形,高下相傾,音聲相和,前後相隨。是以聖人處「無為
」之事,行「不言」之教。萬物作焉而不辭,生而不有,為而不恃,功成
而弗居。夫唯弗居,是以不去。
第三章
不尚賢,使民不爭﹔不貴難得之貨,使民不為盜﹔不見可欲,使民心不亂
。是以「聖人」之治,虛其心,實其腹,弱其志,強其骨。常使民無知無
欲。使夫智者不敢為也。為「無為」,則無不治。
第四章
「道」沖,而用之或不盈。淵兮,似萬物之宗﹔挫其銳,解其紛,和其光
,同其塵﹔湛兮似或存。吾不知誰之子?象帝之先。
第五章
天地不仁,以萬物為芻狗﹔聖人不仁,以百姓為芻狗。天地之間,其猶橐
蘥乎?虛而不屈,動而愈出。多言數窮,不如守中。
第六章
谷神不死,是謂玄牝。玄牝之門,是謂天地根。綿綿若存,用之不勤。
第七章
天長地久。天地所以能長且久者,以其不自生,故能長久。是以聖人後其
身而身先,外其身而身存。非以其無私邪?故能成其私。
第八章
上善若水。水善利萬物而不爭。處眾人之所惡,故幾於道。居善地,心善
淵,與善仁,言善信,政善治,事善能,動善時。夫唯不爭,故無尤。
第九章
持而盈之,不如其已﹔揣而銳之,不可長保。金玉滿堂,莫之能守﹔富貴
而驕,自遺其咎。功遂身退,天之道。

@ -1,10 +0,0 @@
#data
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=euc-jp">
<!--京-->
<title>Yahoo! JAPAN</title>
<meta name="description" content="日本最大級のポータルサイト。検索、オークション、ニュース、メール、コミュニティ、ショッピング、など80以上のサービスを展開。あなたの生活をより豊かにする「ライフ・エンジン」を目指していきます。">
<style type="text/css" media="all">
#encoding
euc-jp

File diff suppressed because one or more lines are too long

@ -1,115 +0,0 @@
#data
<meta
#encoding
windows-1252
#data
<
#encoding
windows-1252
#data
<!
#encoding
windows-1252
#data
<meta charset = "
#encoding
windows-1252
#data
<meta charset=euc-jp
#encoding
windows-1252
#data
<meta <meta charset='euc-jp'>
#encoding
euc-jp
#data
<meta charset = 'euc-jp'>
#encoding
euc-jp
#data
<!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
#encoding
utf-8
#data
<!-- -->
<meta http-equiv="Content-Type" content="text/html; charset=utf
#encoding
windows-1252
#data
<meta http-equiv="Content-Type<meta charset="utf-8">
#encoding
windows-1252
#data
<meta http-equiv="Content-Type" content="text/html; charset='utf-8'">
#encoding
utf-8
#data
<meta http-equiv="Content-Type" content="text/html; charset='utf-8">
#encoding
windows-1252
#data
<meta
#encoding
windows-1252
#data
<meta charset =
#encoding
windows-1252
#data
<meta charset= utf-8
>
#encoding
utf-8
#data
<meta content = "text/html;
#encoding
windows-1252
#data
<meta charset="UTF-16">
#encoding
utf-8
#data
<meta charset="UTF-16LE">
#encoding
utf-8
#data
<meta charset="UTF-16BE">
#encoding
utf-8
#data
<html a=ñ>
<meta charset="utf-8">
#encoding
utf-8
#data
<html ñ>
<meta charset="utf-8">
#encoding
utf-8
#data
<html>ñ
<meta charset="utf-8">
#encoding
utf-8

@ -1,125 +0,0 @@
{"tests": [
{"description": "proper attribute value escaping",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "test \"with\" &quot;"}]]],
"expected": ["<span title='test \"with\" &amp;quot;'>"]
},
{"description": "proper attribute value non-quoting",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo"}]]],
"expected": ["<span title=foo>"],
"xhtml": ["<span title=\"foo\">"]
},
{"description": "proper attribute value non-quoting (with <)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo<bar"}]]],
"expected": ["<span title=foo<bar>"],
"xhtml": ["<span title=\"foo&lt;bar\">"]
},
{"description": "proper attribute value quoting (with =)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo=bar"}]]],
"expected": ["<span title=\"foo=bar\">"]
},
{"description": "proper attribute value quoting (with >)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo>bar"}]]],
"expected": ["<span title=\"foo>bar\">"]
},
{"description": "proper attribute value quoting (with \")",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\"bar"}]]],
"expected": ["<span title='foo\"bar'>"]
},
{"description": "proper attribute value quoting (with ')",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo'bar"}]]],
"expected": ["<span title=\"foo'bar\">"]
},
{"description": "proper attribute value quoting (with both \" and ')",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo'bar\"baz"}]]],
"expected": ["<span title=\"foo'bar&quot;baz\">"]
},
{"description": "proper attribute value quoting (with space)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo bar"}]]],
"expected": ["<span title=\"foo bar\">"]
},
{"description": "proper attribute value quoting (with tab)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\tbar"}]]],
"expected": ["<span title=\"foo\tbar\">"]
},
{"description": "proper attribute value quoting (with LF)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\nbar"}]]],
"expected": ["<span title=\"foo\nbar\">"]
},
{"description": "proper attribute value quoting (with CR)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\rbar"}]]],
"expected": ["<span title=\"foo\rbar\">"]
},
{"description": "proper attribute value non-quoting (with linetab)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\u000Bbar"}]]],
"expected": ["<span title=foo\u000Bbar>"],
"xhtml": ["<span title=\"foo\u000Bbar\">"]
},
{"description": "proper attribute value quoting (with form feed)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\u000Cbar"}]]],
"expected": ["<span title=\"foo\u000Cbar\">"]
},
{"description": "void element (as EmptyTag token)",
"input": [["EmptyTag", "img", {}]],
"expected": ["<img>"],
"xhtml": ["<img />"]
},
{"description": "void element (as StartTag token)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "img", {}]],
"expected": ["<img>"],
"xhtml": ["<img />"]
},
{"description": "doctype in error",
"input": [["Doctype", "foo"]],
"expected": ["<!DOCTYPE foo>"]
},
{"description": "character data",
"options": {"encoding":"utf-8"},
"input": [["Characters", "a<b>c&d"]],
"expected": ["a&lt;b&gt;c&amp;d"]
},
{"description": "rcdata",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a<b>c&d"],
"xhtml": ["<script>a&lt;b&gt;c&amp;d"]
},
{"description": "doctype",
"input": [["Doctype", "HTML"]],
"expected": ["<!DOCTYPE HTML>"]
},
{"description": "HTML 4.01 DOCTYPE",
"input": [["Doctype", "HTML", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd"]],
"expected": ["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"]
},
{"description": "HTML 4.01 DOCTYPE without system identifer",
"input": [["Doctype", "HTML", "-//W3C//DTD HTML 4.01//EN"]],
"expected": ["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">"]
},
{"description": "IBM DOCTYPE without public identifer",
"input": [["Doctype", "html", "", "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"]],
"expected": ["<!DOCTYPE html SYSTEM \"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd\">"]
}
]}

@ -1,66 +0,0 @@
{"tests": [
{"description": "no encoding",
"options": {"inject_meta_charset": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": [""],
"xhtml": ["<head></head>"]
},
{"description": "empytag head",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8>"],
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/title",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["StartTag", "http://www.w3.org/1999/xhtml","title",{}], ["Characters", "foo"],["EndTag", "http://www.w3.org/1999/xhtml", "title"], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8><title>foo</title>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><title>foo</title></head>"]
},
{"description": "head w/meta-charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8>"],
"xhtml": ["<head><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/ two meta-charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8><meta charset=utf-8>", "<head><meta charset=utf-8><meta charset=ascii>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><meta charset=\"utf-8\" /></head>", "<head><meta charset=\"utf-8\" /><meta charset=\"ascii\" /></head>"]
},
{"description": "head w/robots",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta charset=utf-8><meta content=noindex name=robots>"],
"xhtml": ["<head><meta charset=\"utf-8\" /><meta content=\"noindex\" name=\"robots\" /></head>"]
},
{"description": "head w/robots & charset",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta content=noindex name=robots><meta charset=utf-8>"],
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta charset=\"utf-8\" /></head>"]
},
{"description": "head w/ charset in http-equiv content-type",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "http-equiv", "value": "content-type"}, {"namespace": null, "name": "content", "value": "text/html; charset=ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
"xhtml": ["<head><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
},
{"description": "head w/robots & charset in http-equiv content-type",
"options": {"inject_meta_charset": true, "encoding":"utf-8"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EmptyTag","meta",[{"namespace": null, "name": "http-equiv", "value": "content-type"}, {"namespace": null, "name": "content", "value": "text/html; charset=ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": ["<meta content=noindex name=robots><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
"xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
}
]}

@ -1,965 +0,0 @@
{"tests": [
{"description": "html start-tag followed by text, with attributes",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", [{"namespace": null, "name": "lang", "value": "en"}]], ["Characters", "foo"]],
"expected": ["<html lang=en>foo"]
},
{"description": "html start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Comment", "foo"]],
"expected": ["<html><!--foo-->"]
},
{"description": "html start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Characters", " foo"]],
"expected": ["<html> foo"]
},
{"description": "html start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "html start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "html start-tag followed by end-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "html start-tag at EOF (shouldn't ever happen?!)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}]],
"expected": [""]
},
{"description": "html end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Comment", "foo"]],
"expected": ["</html><!--foo-->"]
},
{"description": "html end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Characters", " foo"]],
"expected": ["</html> foo"]
},
{"description": "html end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "html end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "html end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "html end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"]],
"expected": [""]
},
{"description": "head start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Comment", "foo"]],
"expected": ["<head><!--foo-->"]
},
{"description": "head start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Characters", " foo"]],
"expected": ["<head> foo"]
},
{"description": "head start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Characters", "foo"]],
"expected": ["<head>foo"]
},
{"description": "head start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "head start-tag followed by end-tag (shouldn't ever happen?!)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["<head></foo>", "</foo>"]
},
{"description": "empty head element",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": [""]
},
{"description": "head start-tag followed by empty-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "head start-tag at EOF (shouldn't ever happen?!)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}]],
"expected": ["<head>", ""]
},
{"description": "head end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Comment", "foo"]],
"expected": ["</head><!--foo-->"]
},
{"description": "head end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Characters", " foo"]],
"expected": ["</head> foo"]
},
{"description": "head end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "head end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "head end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "head end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
"expected": [""]
},
{"description": "body start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Comment", "foo"]],
"expected": ["<body><!--foo-->"]
},
{"description": "body start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Characters", " foo"]],
"expected": ["<body> foo"]
},
{"description": "body start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "body start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "body start-tag followed by end-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "body start-tag at EOF (shouldn't ever happen?!)",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}]],
"expected": [""]
},
{"description": "body end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Comment", "foo"]],
"expected": ["</body><!--foo-->"]
},
{"description": "body end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Characters", " foo"]],
"expected": ["</body> foo"]
},
{"description": "body end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "body end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "body end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "body end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"]],
"expected": [""]
},
{"description": "li end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Comment", "foo"]],
"expected": ["</li><!--foo-->"]
},
{"description": "li end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Characters", " foo"]],
"expected": ["</li> foo"]
},
{"description": "li end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Characters", "foo"]],
"expected": ["</li>foo"]
},
{"description": "li end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</li><foo>"]
},
{"description": "li end-tag followed by li start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["StartTag", "http://www.w3.org/1999/xhtml", "li", {}]],
"expected": ["<li>"]
},
{"description": "li end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "li end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"]],
"expected": [""]
},
{"description": "dt end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Comment", "foo"]],
"expected": ["</dt><!--foo-->"]
},
{"description": "dt end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Characters", " foo"]],
"expected": ["</dt> foo"]
},
{"description": "dt end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Characters", "foo"]],
"expected": ["</dt>foo"]
},
{"description": "dt end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</dt><foo>"]
},
{"description": "dt end-tag followed by dt start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "dt", {}]],
"expected": ["<dt>"]
},
{"description": "dt end-tag followed by dd start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "dd", {}]],
"expected": ["<dd>"]
},
{"description": "dt end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</dt></foo>"]
},
{"description": "dt end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"]],
"expected": ["</dt>"]
},
{"description": "dd end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Comment", "foo"]],
"expected": ["</dd><!--foo-->"]
},
{"description": "dd end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Characters", " foo"]],
"expected": ["</dd> foo"]
},
{"description": "dd end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Characters", "foo"]],
"expected": ["</dd>foo"]
},
{"description": "dd end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</dd><foo>"]
},
{"description": "dd end-tag followed by dd start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "dd", {}]],
"expected": ["<dd>"]
},
{"description": "dd end-tag followed by dt start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "dt", {}]],
"expected": ["<dt>"]
},
{"description": "dd end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "dd end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"]],
"expected": [""]
},
{"description": "p end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Comment", "foo"]],
"expected": ["</p><!--foo-->"]
},
{"description": "p end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Characters", " foo"]],
"expected": ["</p> foo"]
},
{"description": "p end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Characters", "foo"]],
"expected": ["</p>foo"]
},
{"description": "p end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</p><foo>"]
},
{"description": "p end-tag followed by address start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "address", {}]],
"expected": ["<address>"]
},
{"description": "p end-tag followed by article start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "article", {}]],
"expected": ["<article>"]
},
{"description": "p end-tag followed by aside start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "aside", {}]],
"expected": ["<aside>"]
},
{"description": "p end-tag followed by blockquote start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "blockquote", {}]],
"expected": ["<blockquote>"]
},
{"description": "p end-tag followed by datagrid start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "datagrid", {}]],
"expected": ["<datagrid>"]
},
{"description": "p end-tag followed by dialog start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dialog", {}]],
"expected": ["<dialog>"]
},
{"description": "p end-tag followed by dir start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dir", {}]],
"expected": ["<dir>"]
},
{"description": "p end-tag followed by div start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "div", {}]],
"expected": ["<div>"]
},
{"description": "p end-tag followed by dl start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dl", {}]],
"expected": ["<dl>"]
},
{"description": "p end-tag followed by fieldset start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "fieldset", {}]],
"expected": ["<fieldset>"]
},
{"description": "p end-tag followed by footer start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "footer", {}]],
"expected": ["<footer>"]
},
{"description": "p end-tag followed by form start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "form", {}]],
"expected": ["<form>"]
},
{"description": "p end-tag followed by h1 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h1", {}]],
"expected": ["<h1>"]
},
{"description": "p end-tag followed by h2 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h2", {}]],
"expected": ["<h2>"]
},
{"description": "p end-tag followed by h3 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h3", {}]],
"expected": ["<h3>"]
},
{"description": "p end-tag followed by h4 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h4", {}]],
"expected": ["<h4>"]
},
{"description": "p end-tag followed by h5 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h5", {}]],
"expected": ["<h5>"]
},
{"description": "p end-tag followed by h6 start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h6", {}]],
"expected": ["<h6>"]
},
{"description": "p end-tag followed by header start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "header", {}]],
"expected": ["<header>"]
},
{"description": "p end-tag followed by hr empty-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["EmptyTag", "hr", {}]],
"expected": ["<hr>"]
},
{"description": "p end-tag followed by menu start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "menu", {}]],
"expected": ["<menu>"]
},
{"description": "p end-tag followed by nav start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "nav", {}]],
"expected": ["<nav>"]
},
{"description": "p end-tag followed by ol start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "ol", {}]],
"expected": ["<ol>"]
},
{"description": "p end-tag followed by p start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "p", {}]],
"expected": ["<p>"]
},
{"description": "p end-tag followed by pre start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}]],
"expected": ["<pre>"]
},
{"description": "p end-tag followed by section start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "section", {}]],
"expected": ["<section>"]
},
{"description": "p end-tag followed by table start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "table", {}]],
"expected": ["<table>"]
},
{"description": "p end-tag followed by ul start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "ul", {}]],
"expected": ["<ul>"]
},
{"description": "p end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "p end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"]],
"expected": [""]
},
{"description": "optgroup end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Comment", "foo"]],
"expected": ["</optgroup><!--foo-->"]
},
{"description": "optgroup end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Characters", " foo"]],
"expected": ["</optgroup> foo"]
},
{"description": "optgroup end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Characters", "foo"]],
"expected": ["</optgroup>foo"]
},
{"description": "optgroup end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</optgroup><foo>"]
},
{"description": "optgroup end-tag followed by optgroup start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "optgroup", {}]],
"expected": ["<optgroup>"]
},
{"description": "optgroup end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "optgroup end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"]],
"expected": [""]
},
{"description": "option end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Comment", "foo"]],
"expected": ["</option><!--foo-->"]
},
{"description": "option end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Characters", " foo"]],
"expected": ["</option> foo"]
},
{"description": "option end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Characters", "foo"]],
"expected": ["</option>foo"]
},
{"description": "option end-tag followed by optgroup start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "optgroup", {}]],
"expected": ["<optgroup>"]
},
{"description": "option end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</option><foo>"]
},
{"description": "option end-tag followed by option start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "option", {}]],
"expected": ["<option>"]
},
{"description": "option end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "option end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"]],
"expected": [""]
},
{"description": "colgroup start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Comment", "foo"]],
"expected": ["<colgroup><!--foo-->"]
},
{"description": "colgroup start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Characters", " foo"]],
"expected": ["<colgroup> foo"]
},
{"description": "colgroup start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Characters", "foo"]],
"expected": ["<colgroup>foo"]
},
{"description": "colgroup start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<colgroup><foo>"]
},
{"description": "first colgroup in a table with a col child",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "table", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["EmptyTag", "col", {}]],
"expected": ["<table><col>"]
},
{"description": "colgroup with a col child, following another colgroup",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "col", {}]],
"expected": ["</colgroup><col>", "<colgroup><col>"]
},
{"description": "colgroup start-tag followed by end-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["<colgroup></foo>"]
},
{"description": "colgroup start-tag at EOF",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}]],
"expected": ["<colgroup>"]
},
{"description": "colgroup end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Comment", "foo"]],
"expected": ["</colgroup><!--foo-->"]
},
{"description": "colgroup end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Characters", " foo"]],
"expected": ["</colgroup> foo"]
},
{"description": "colgroup end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Characters", "foo"]],
"expected": ["foo"]
},
{"description": "colgroup end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<foo>"]
},
{"description": "colgroup end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "colgroup end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"]],
"expected": [""]
},
{"description": "thead end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Comment", "foo"]],
"expected": ["</thead><!--foo-->"]
},
{"description": "thead end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Characters", " foo"]],
"expected": ["</thead> foo"]
},
{"description": "thead end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Characters", "foo"]],
"expected": ["</thead>foo"]
},
{"description": "thead end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</thead><foo>"]
},
{"description": "thead end-tag followed by tbody start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
"expected": ["<tbody>"]
},
{"description": "thead end-tag followed by tfoot start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tfoot", {}]],
"expected": ["<tfoot>"]
},
{"description": "thead end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</thead></foo>"]
},
{"description": "thead end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"]],
"expected": ["</thead>"]
},
{"description": "tbody start-tag followed by comment",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Comment", "foo"]],
"expected": ["<tbody><!--foo-->"]
},
{"description": "tbody start-tag followed by space character",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Characters", " foo"]],
"expected": ["<tbody> foo"]
},
{"description": "tbody start-tag followed by text",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Characters", "foo"]],
"expected": ["<tbody>foo"]
},
{"description": "tbody start-tag followed by start-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["<tbody><foo>"]
},
{"description": "first tbody in a table with a tr child",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "table", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<table><tr>"]
},
{"description": "tbody with a tr child, following another tbody",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<tbody><tr>", "</tbody><tr>"]
},
{"description": "tbody with a tr child, following a thead",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<tbody><tr>", "</thead><tr>"]
},
{"description": "tbody with a tr child, following a tfoot",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<tbody><tr>", "</tfoot><tr>"]
},
{"description": "tbody start-tag followed by end-tag",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["<tbody></foo>"]
},
{"description": "tbody start-tag at EOF",
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
"expected": ["<tbody>"]
},
{"description": "tbody end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Comment", "foo"]],
"expected": ["</tbody><!--foo-->"]
},
{"description": "tbody end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Characters", " foo"]],
"expected": ["</tbody> foo"]
},
{"description": "tbody end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Characters", "foo"]],
"expected": ["</tbody>foo"]
},
{"description": "tbody end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</tbody><foo>"]
},
{"description": "tbody end-tag followed by tbody start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
"expected": ["<tbody>", "</tbody>"]
},
{"description": "tbody end-tag followed by tfoot start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tfoot", {}]],
"expected": ["<tfoot>"]
},
{"description": "tbody end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "tbody end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"]],
"expected": [""]
},
{"description": "tfoot end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Comment", "foo"]],
"expected": ["</tfoot><!--foo-->"]
},
{"description": "tfoot end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Characters", " foo"]],
"expected": ["</tfoot> foo"]
},
{"description": "tfoot end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Characters", "foo"]],
"expected": ["</tfoot>foo"]
},
{"description": "tfoot end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</tfoot><foo>"]
},
{"description": "tfoot end-tag followed by tbody start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
"expected": ["<tbody>", "</tfoot>"]
},
{"description": "tfoot end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "tfoot end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"]],
"expected": [""]
},
{"description": "tr end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Comment", "foo"]],
"expected": ["</tr><!--foo-->"]
},
{"description": "tr end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Characters", " foo"]],
"expected": ["</tr> foo"]
},
{"description": "tr end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Characters", "foo"]],
"expected": ["</tr>foo"]
},
{"description": "tr end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</tr><foo>"]
},
{"description": "tr end-tag followed by tr start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
"expected": ["<tr>", "</tr>"]
},
{"description": "tr end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "tr end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"]],
"expected": [""]
},
{"description": "td end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Comment", "foo"]],
"expected": ["</td><!--foo-->"]
},
{"description": "td end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Characters", " foo"]],
"expected": ["</td> foo"]
},
{"description": "td end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Characters", "foo"]],
"expected": ["</td>foo"]
},
{"description": "td end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</td><foo>"]
},
{"description": "td end-tag followed by td start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "td", {}]],
"expected": ["<td>", "</td>"]
},
{"description": "td end-tag followed by th start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "th", {}]],
"expected": ["<th>", "</td>"]
},
{"description": "td end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "td end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"]],
"expected": [""]
},
{"description": "th end-tag followed by comment",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Comment", "foo"]],
"expected": ["</th><!--foo-->"]
},
{"description": "th end-tag followed by space character",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Characters", " foo"]],
"expected": ["</th> foo"]
},
{"description": "th end-tag followed by text",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Characters", "foo"]],
"expected": ["</th>foo"]
},
{"description": "th end-tag followed by start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
"expected": ["</th><foo>"]
},
{"description": "th end-tag followed by th start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "th", {}]],
"expected": ["<th>", "</th>"]
},
{"description": "th end-tag followed by td start-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "td", {}]],
"expected": ["<td>", "</th>"]
},
{"description": "th end-tag followed by end-tag",
"input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
"expected": ["</foo>"]
},
{"description": "th end-tag at EOF",
"input": [["EndTag", "http://www.w3.org/1999/xhtml" , "th"]],
"expected": [""]
}
]}

@ -1,60 +0,0 @@
{"tests":[
{"description": "quote_char=\"'\"",
"options": {"quote_char": "'"},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "test 'with' quote_char"}]]],
"expected": ["<span title='test &#39;with&#39; quote_char'>"]
},
{"description": "quote_attr_values=true",
"options": {"quote_attr_values": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "button", [{"namespace": null, "name": "disabled", "value" :"disabled"}]]],
"expected": ["<button disabled>"],
"xhtml": ["<button disabled=\"disabled\">"]
},
{"description": "quote_attr_values=true with irrelevant",
"options": {"quote_attr_values": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :"irrelevant"}]]],
"expected": ["<div irrelevant>"],
"xhtml": ["<div irrelevant=\"irrelevant\">"]
},
{"description": "use_trailing_solidus=true with void element",
"options": {"use_trailing_solidus": true},
"input": [["EmptyTag", "img", {}]],
"expected": ["<img />"]
},
{"description": "use_trailing_solidus=true with non-void element",
"options": {"use_trailing_solidus": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", {}]],
"expected": ["<div>"]
},
{"description": "minimize_boolean_attributes=false",
"options": {"minimize_boolean_attributes": false},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :"irrelevant"}]]],
"expected": ["<div irrelevant=irrelevant>"],
"xhtml": ["<div irrelevant=\"irrelevant\">"]
},
{"description": "minimize_boolean_attributes=false with empty value",
"options": {"minimize_boolean_attributes": false},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :""}]]],
"expected": ["<div irrelevant=\"\">"]
},
{"description": "escape less than signs in attribute values",
"options": {"escape_lt_in_attrs": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "a", [{"namespace": null, "name": "title", "value": "a<b>c&d"}]]],
"expected": ["<a title=\"a&lt;b>c&amp;d\">"]
},
{"description": "rcdata",
"options": {"escape_rcdata": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "a<b>c&d"]],
"expected": ["<script>a&lt;b&gt;c&amp;d"]
}
]}

@ -1,51 +0,0 @@
{"tests": [
{"description": "bare text with leading spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "\t\r\n\u000C foo"]],
"expected": [" foo"]
},
{"description": "bare text with trailing spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "foo \t\r\n\u000C"]],
"expected": ["foo "]
},
{"description": "bare text with inner spaces",
"options": {"strip_whitespace": true},
"input": [["Characters", "foo \t\r\n\u000C bar"]],
"expected": ["foo bar"]
},
{"description": "text within <pre>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "pre"]],
"expected": ["<pre>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</pre>"]
},
{"description": "text within <pre>, with inner markup",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}], ["Characters", "\t\r\n\u000C fo"], ["StartTag", "http://www.w3.org/1999/xhtml", "span", {}], ["Characters", "o \t\r\n\u000C b"], ["EndTag", "http://www.w3.org/1999/xhtml", "span"], ["Characters", "ar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "pre"]],
"expected": ["<pre>\t\r\n\u000C fo<span>o \t\r\n\u000C b</span>ar \t\r\n\u000C</pre>"]
},
{"description": "text within <textarea>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "textarea", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "textarea"]],
"expected": ["<textarea>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</textarea>"]
},
{"description": "text within <script>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "script"]],
"expected": ["<script>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</script>"]
},
{"description": "text within <style>",
"options": {"strip_whitespace": true},
"input": [["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "style"]],
"expected": ["<style>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</style>"]
}
]}

@ -1,104 +0,0 @@
Tokenizer tests
===============
The test format is [JSON](http://www.json.org/). This has the advantage
that the syntax allows backward-compatible extensions to the tests and
the disadvantage that it is relatively verbose.
Basic Structure
---------------
{"tests": [
    {"description": "Test description",
    "input": "input_string",
    "output": [expected_output_tokens],
    "initialStates": [initial_states],
    "lastStartTag": last_start_tag,
    "ignoreErrorOrder": ignore_error_order
    }
]}
Multiple tests per file are allowed simply by adding more objects to the
"tests" list.
`description`, `input` and `output` are always present. The other values
are optional.
### Test set-up
`test.input` is a string containing the characters to pass to the
tokenizer. Specifically, it represents the characters of the **input
stream**, and so implementations are expected to perform the processing
described in the spec's **Preprocessing the input stream** section
before feeding the result to the tokenizer.
If `test.doubleEscaped` is present and `true`, then `test.input` is not
quite as described above. Instead, it must first be subjected to another
round of unescaping (i.e., in addition to any unescaping involved in the
JSON import), and the result of *that* represents the characters of the
input stream. Currently, the only unescaping required by this option is
to convert each sequence of the form \\uHHHH (where H is a hex digit)
into the corresponding Unicode code point. (Note that this option also
affects the interpretation of `test.output`.)
`test.initialStates` is a list of strings, each being the name of a
tokenizer state. The test should be run once for each string, using it
to set the tokenizer's initial state for that run. If
`test.initialStates` is omitted, it defaults to `["data state"]`.
`test.lastStartTag` is a lowercase string that should be used as "the
tag name of the last start tag to have been emitted from this
tokenizer", referenced in the spec's definition of **appropriate end tag
token**. If it is omitted, it is treated as if "no start tag has been
emitted from this tokenizer".
### Test results
`test.output` is a list of tokens, ordered with the first produced by
the tokenizer the first (leftmost) in the list. The list must mach the
**complete** list of tokens that the tokenizer should produce. Valid
tokens are:
["DOCTYPE", name, public_id, system_id, correctness]
["StartTag", name, {attributes}*, true*]
["StartTag", name, {attributes}]
["EndTag", name]
["Comment", data]
["Character", data]
"ParseError"
`public_id` and `system_id` are either strings or `null`. `correctness`
is either `true` or `false`; `true` corresponds to the force-quirks flag
being false, and vice-versa.
When the self-closing flag is set, the `StartTag` array has `true` as
its fourth entry. When the flag is not set, the array has only three
entries for backwards compatibility.
All adjacent character tokens are coalesced into a single
`["Character", data]` token.
If `test.doubleEscaped` is present and `true`, then every string within
`test.output` must be further unescaped (as described above) before
comparing with the tokenizer's output.
`test.ignoreErrorOrder` is a boolean value indicating that the order of
`ParseError` tokens relative to other tokens in the output stream is
unimportant, and implementations should ignore such differences between
their output and `expected_output_tokens`. (This is used for errors
emitted by the input stream preprocessing stage, since it is useful to
test that code but it is undefined when the errors occur). If it is
omitted, it defaults to `false`.
xmlViolation tests
------------------
`tokenizer/xmlViolation.test` differs from the above in a couple of
ways:
- The name of the single member of the top-level JSON object is
"xmlViolationTests" instead of "tests".
- Each test's expected output assumes that implementation is applying
the tweaks given in the spec's "Coercing an HTML DOM into an
infoset" section.

@ -1,81 +0,0 @@
{"tests": [
{"description":"PLAINTEXT content model flag",
"initialStates":["PLAINTEXT state"],
"lastStartTag":"plaintext",
"input":"<head>&body;",
"output":[["Character", "<head>&body;"]]},
{"description":"End tag closing RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp>",
"output":[["Character", "foo"], ["EndTag", "xmp"]]},
{"description":"End tag closing RCDATA or RAWTEXT (case-insensitivity)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xMp>",
"output":[["Character", "foo"], ["EndTag", "xmp"]]},
{"description":"End tag closing RCDATA or RAWTEXT (ending with space)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp ",
"output":[["Character", "foo"], "ParseError"]},
{"description":"End tag closing RCDATA or RAWTEXT (ending with EOF)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp",
"output":[["Character", "foo</xmp"]]},
{"description":"End tag closing RCDATA or RAWTEXT (ending with slash)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp/",
"output":[["Character", "foo"], "ParseError"]},
{"description":"End tag not closing RCDATA or RAWTEXT (ending with left-angle-bracket)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp<",
"output":[["Character", "foo</xmp<"]]},
{"description":"End tag with incorrect name in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</foo>bar</xmp>",
"output":[["Character", "</foo>bar"], ["EndTag", "xmp"]]},
{"description":"Partial end tags leading straight into partial end tags",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</xmp</xmp</xmp>",
"output":[["Character", "</xmp</xmp"], ["EndTag", "xmp"]]},
{"description":"End tag with incorrect name in RCDATA or RAWTEXT (starting like correct name)",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</foo>bar</xmpaar>",
"output":[["Character", "</foo>bar</xmpaar>"]]},
{"description":"End tag closing RCDATA or RAWTEXT, switching back to PCDATA",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo</xmp></baz>",
"output":[["Character", "foo"], ["EndTag", "xmp"], ["EndTag", "baz"]]},
{"description":"RAWTEXT w/ something looking like an entity",
"initialStates":["RAWTEXT state"],
"lastStartTag":"xmp",
"input":"&foo;",
"output":[["Character", "&foo;"]]},
{"description":"RCDATA w/ an entity",
"initialStates":["RCDATA state"],
"lastStartTag":"textarea",
"input":"&lt;",
"output":[["Character", "<"]]}
]}

@ -1,96 +0,0 @@
{
"tests": [
{
"description":"CR in bogus comment state",
"input":"<?\u000d",
"output":["ParseError", ["Comment", "?\u000a"]]
},
{
"description":"CRLF in bogus comment state",
"input":"<?\u000d\u000a",
"output":["ParseError", ["Comment", "?\u000a"]]
},
{
"description":"CRLFLF in bogus comment state",
"input":"<?\u000d\u000a\u000a",
"output":["ParseError", ["Comment", "?\u000a\u000a"]]
},
{
"description":"NUL in RCDATA and RAWTEXT",
"doubleEscaped":true,
"initialStates":["RCDATA state", "RAWTEXT state"],
"input":"\\u0000",
"output":["ParseError", ["Character", "\\uFFFD"]]
},
{
"description":"leading U+FEFF must pass through",
"doubleEscaped":true,
"input":"\\uFEFFfoo\\uFEFFbar",
"output":[["Character", "\\uFEFFfoo\\uFEFFbar"]]
},
{
"description":"Non BMP-charref in in RCDATA",
"initialStates":["RCDATA state"],
"input":"&NotEqualTilde;",
"output":[["Character", "\u2242\u0338"]]
},
{
"description":"Bad charref in in RCDATA",
"initialStates":["RCDATA state"],
"input":"&NotEqualTild;",
"output":["ParseError", ["Character", "&NotEqualTild;"]]
},
{
"description":"lowercase endtags in RCDATA and RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</XMP>",
"output":[["EndTag","xmp"]]
},
{
"description":"bad endtag in RCDATA and RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</ XMP>",
"output":[["Character","</ XMP>"]]
},
{
"description":"bad endtag in RCDATA and RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</xm>",
"output":[["Character","</xm>"]]
},
{
"description":"bad endtag in RCDATA and RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</xm ",
"output":[["Character","</xm "]]
},
{
"description":"bad endtag in RCDATA and RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"</xm/",
"output":[["Character","</xm/"]]
},
{
"description":"Non BMP-charref in attribute",
"input":"<p id=\"&NotEqualTilde;\">",
"output":[["StartTag", "p", {"id":"\u2242\u0338"}]]
},
{
"description":"--!NUL in comment ",
"doubleEscaped":true,
"input":"<!----!\\u0000-->",
"output":["ParseError", "ParseError", ["Comment", "--!\\uFFFD"]]
},
{
"description":"space EOF after doctype ",
"input":"<!DOCTYPE html ",
"output":["ParseError", ["DOCTYPE", "html", null, null , false]]
}
]
}

@ -1,283 +0,0 @@
{"tests": [
{"description": "Undefined named entity in attribute value ending in semicolon and whose name starts with a known entity name.",
"input":"<h a='&noti;'>",
"output": [["StartTag", "h", {"a": "&noti;"}]]},
{"description": "Entity name followed by the equals sign in an attribute value.",
"input":"<h a='&lang='>",
"output": [["StartTag", "h", {"a": "&lang="}]]},
{"description": "CR as numeric entity",
"input":"&#013;",
"output": ["ParseError", ["Character", "\r"]]},
{"description": "CR as hexadecimal numeric entity",
"input":"&#x00D;",
"output": ["ParseError", ["Character", "\r"]]},
{"description": "Windows-1252 EURO SIGN numeric entity.",
"input":"&#0128;",
"output": ["ParseError", ["Character", "\u20AC"]]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0129;",
"output": ["ParseError", ["Character", "\u0081"]]},
{"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK numeric entity.",
"input":"&#0130;",
"output": ["ParseError", ["Character", "\u201A"]]},
{"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK numeric entity.",
"input":"&#0131;",
"output": ["ParseError", ["Character", "\u0192"]]},
{"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK numeric entity.",
"input":"&#0132;",
"output": ["ParseError", ["Character", "\u201E"]]},
{"description": "Windows-1252 HORIZONTAL ELLIPSIS numeric entity.",
"input":"&#0133;",
"output": ["ParseError", ["Character", "\u2026"]]},
{"description": "Windows-1252 DAGGER numeric entity.",
"input":"&#0134;",
"output": ["ParseError", ["Character", "\u2020"]]},
{"description": "Windows-1252 DOUBLE DAGGER numeric entity.",
"input":"&#0135;",
"output": ["ParseError", ["Character", "\u2021"]]},
{"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT numeric entity.",
"input":"&#0136;",
"output": ["ParseError", ["Character", "\u02C6"]]},
{"description": "Windows-1252 PER MILLE SIGN numeric entity.",
"input":"&#0137;",
"output": ["ParseError", ["Character", "\u2030"]]},
{"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON numeric entity.",
"input":"&#0138;",
"output": ["ParseError", ["Character", "\u0160"]]},
{"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK numeric entity.",
"input":"&#0139;",
"output": ["ParseError", ["Character", "\u2039"]]},
{"description": "Windows-1252 LATIN CAPITAL LIGATURE OE numeric entity.",
"input":"&#0140;",
"output": ["ParseError", ["Character", "\u0152"]]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0141;",
"output": ["ParseError", ["Character", "\u008D"]]},
{"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON numeric entity.",
"input":"&#0142;",
"output": ["ParseError", ["Character", "\u017D"]]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0143;",
"output": ["ParseError", ["Character", "\u008F"]]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0144;",
"output": ["ParseError", ["Character", "\u0090"]]},
{"description": "Windows-1252 LEFT SINGLE QUOTATION MARK numeric entity.",
"input":"&#0145;",
"output": ["ParseError", ["Character", "\u2018"]]},
{"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK numeric entity.",
"input":"&#0146;",
"output": ["ParseError", ["Character", "\u2019"]]},
{"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK numeric entity.",
"input":"&#0147;",
"output": ["ParseError", ["Character", "\u201C"]]},
{"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK numeric entity.",
"input":"&#0148;",
"output": ["ParseError", ["Character", "\u201D"]]},
{"description": "Windows-1252 BULLET numeric entity.",
"input":"&#0149;",
"output": ["ParseError", ["Character", "\u2022"]]},
{"description": "Windows-1252 EN DASH numeric entity.",
"input":"&#0150;",
"output": ["ParseError", ["Character", "\u2013"]]},
{"description": "Windows-1252 EM DASH numeric entity.",
"input":"&#0151;",
"output": ["ParseError", ["Character", "\u2014"]]},
{"description": "Windows-1252 SMALL TILDE numeric entity.",
"input":"&#0152;",
"output": ["ParseError", ["Character", "\u02DC"]]},
{"description": "Windows-1252 TRADE MARK SIGN numeric entity.",
"input":"&#0153;",
"output": ["ParseError", ["Character", "\u2122"]]},
{"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON numeric entity.",
"input":"&#0154;",
"output": ["ParseError", ["Character", "\u0161"]]},
{"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK numeric entity.",
"input":"&#0155;",
"output": ["ParseError", ["Character", "\u203A"]]},
{"description": "Windows-1252 LATIN SMALL LIGATURE OE numeric entity.",
"input":"&#0156;",
"output": ["ParseError", ["Character", "\u0153"]]},
{"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
"input":"&#0157;",
"output": ["ParseError", ["Character", "\u009D"]]},
{"description": "Windows-1252 EURO SIGN hexadecimal numeric entity.",
"input":"&#x080;",
"output": ["ParseError", ["Character", "\u20AC"]]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x081;",
"output": ["ParseError", ["Character", "\u0081"]]},
{"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x082;",
"output": ["ParseError", ["Character", "\u201A"]]},
{"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK hexadecimal numeric entity.",
"input":"&#x083;",
"output": ["ParseError", ["Character", "\u0192"]]},
{"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x084;",
"output": ["ParseError", ["Character", "\u201E"]]},
{"description": "Windows-1252 HORIZONTAL ELLIPSIS hexadecimal numeric entity.",
"input":"&#x085;",
"output": ["ParseError", ["Character", "\u2026"]]},
{"description": "Windows-1252 DAGGER hexadecimal numeric entity.",
"input":"&#x086;",
"output": ["ParseError", ["Character", "\u2020"]]},
{"description": "Windows-1252 DOUBLE DAGGER hexadecimal numeric entity.",
"input":"&#x087;",
"output": ["ParseError", ["Character", "\u2021"]]},
{"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT hexadecimal numeric entity.",
"input":"&#x088;",
"output": ["ParseError", ["Character", "\u02C6"]]},
{"description": "Windows-1252 PER MILLE SIGN hexadecimal numeric entity.",
"input":"&#x089;",
"output": ["ParseError", ["Character", "\u2030"]]},
{"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON hexadecimal numeric entity.",
"input":"&#x08A;",
"output": ["ParseError", ["Character", "\u0160"]]},
{"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x08B;",
"output": ["ParseError", ["Character", "\u2039"]]},
{"description": "Windows-1252 LATIN CAPITAL LIGATURE OE hexadecimal numeric entity.",
"input":"&#x08C;",
"output": ["ParseError", ["Character", "\u0152"]]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x08D;",
"output": ["ParseError", ["Character", "\u008D"]]},
{"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON hexadecimal numeric entity.",
"input":"&#x08E;",
"output": ["ParseError", ["Character", "\u017D"]]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x08F;",
"output": ["ParseError", ["Character", "\u008F"]]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x090;",
"output": ["ParseError", ["Character", "\u0090"]]},
{"description": "Windows-1252 LEFT SINGLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x091;",
"output": ["ParseError", ["Character", "\u2018"]]},
{"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x092;",
"output": ["ParseError", ["Character", "\u2019"]]},
{"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x093;",
"output": ["ParseError", ["Character", "\u201C"]]},
{"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x094;",
"output": ["ParseError", ["Character", "\u201D"]]},
{"description": "Windows-1252 BULLET hexadecimal numeric entity.",
"input":"&#x095;",
"output": ["ParseError", ["Character", "\u2022"]]},
{"description": "Windows-1252 EN DASH hexadecimal numeric entity.",
"input":"&#x096;",
"output": ["ParseError", ["Character", "\u2013"]]},
{"description": "Windows-1252 EM DASH hexadecimal numeric entity.",
"input":"&#x097;",
"output": ["ParseError", ["Character", "\u2014"]]},
{"description": "Windows-1252 SMALL TILDE hexadecimal numeric entity.",
"input":"&#x098;",
"output": ["ParseError", ["Character", "\u02DC"]]},
{"description": "Windows-1252 TRADE MARK SIGN hexadecimal numeric entity.",
"input":"&#x099;",
"output": ["ParseError", ["Character", "\u2122"]]},
{"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON hexadecimal numeric entity.",
"input":"&#x09A;",
"output": ["ParseError", ["Character", "\u0161"]]},
{"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
"input":"&#x09B;",
"output": ["ParseError", ["Character", "\u203A"]]},
{"description": "Windows-1252 LATIN SMALL LIGATURE OE hexadecimal numeric entity.",
"input":"&#x09C;",
"output": ["ParseError", ["Character", "\u0153"]]},
{"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
"input":"&#x09D;",
"output": ["ParseError", ["Character", "\u009D"]]},
{"description": "Windows-1252 LATIN SMALL LETTER Z WITH CARON hexadecimal numeric entity.",
"input":"&#x09E;",
"output": ["ParseError", ["Character", "\u017E"]]},
{"description": "Windows-1252 LATIN CAPITAL LETTER Y WITH DIAERESIS hexadecimal numeric entity.",
"input":"&#x09F;",
"output": ["ParseError", ["Character", "\u0178"]]},
{"description": "Decimal numeric entity followed by hex character a.",
"input":"&#97a",
"output": ["ParseError", ["Character", "aa"]]},
{"description": "Decimal numeric entity followed by hex character A.",
"input":"&#97A",
"output": ["ParseError", ["Character", "aA"]]},
{"description": "Decimal numeric entity followed by hex character f.",
"input":"&#97f",
"output": ["ParseError", ["Character", "af"]]},
{"description": "Decimal numeric entity followed by hex character A.",
"input":"&#97F",
"output": ["ParseError", ["Character", "aF"]]}
]}

@ -1,33 +0,0 @@
{"tests": [
{"description":"Commented close tag in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!--</xmp>--></xmp>",
"output":[["Character", "foo<!--"], ["EndTag", "xmp"], ["Character", "-->"], ["EndTag", "xmp"]]},
{"description":"Bogus comment in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!-->baz</xmp>",
"output":[["Character", "foo<!-->baz"], ["EndTag", "xmp"]]},
{"description":"End tag surrounded by bogus comment in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!--></xmp><!-->baz</xmp>",
"output":[["Character", "foo<!-->"], ["EndTag", "xmp"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "xmp"]]},
{"description":"Commented entities in RCDATA",
"initialStates":["RCDATA state"],
"lastStartTag":"xmp",
"input":" &amp; <!-- &amp; --> &amp; </xmp>",
"output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},
{"description":"Incorrect comment ending sequences in RCDATA or RAWTEXT",
"initialStates":["RCDATA state", "RAWTEXT state"],
"lastStartTag":"xmp",
"input":"foo<!-- x --x>x-- >x--!>x--<></xmp>",
"output":[["Character", "foo<!-- x --x>x-- >x--!>x--<>"], ["EndTag", "xmp"]]}
]}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,7 +0,0 @@
{"tests": [
{"description":"<!---- >",
"input":"<!---- >",
"output":["ParseError", "ParseError", ["Comment","-- >"]]}
]}

@ -1,196 +0,0 @@
{"tests": [
{"description":"Correct Doctype lowercase",
"input":"<!DOCTYPE html>",
"output":[["DOCTYPE", "html", null, null, true]]},
{"description":"Correct Doctype uppercase",
"input":"<!DOCTYPE HTML>",
"output":[["DOCTYPE", "html", null, null, true]]},
{"description":"Correct Doctype mixed case",
"input":"<!DOCTYPE HtMl>",
"output":[["DOCTYPE", "html", null, null, true]]},
{"description":"Correct Doctype case with EOF",
"input":"<!DOCTYPE HtMl",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Truncated doctype start",
"input":"<!DOC>",
"output":["ParseError", ["Comment", "DOC"]]},
{"description":"Doctype in error",
"input":"<!DOCTYPE foo>",
"output":[["DOCTYPE", "foo", null, null, true]]},
{"description":"Single Start Tag",
"input":"<h>",
"output":[["StartTag", "h", {}]]},
{"description":"Empty end tag",
"input":"</>",
"output":["ParseError"]},
{"description":"Empty start tag",
"input":"<>",
"output":["ParseError", ["Character", "<>"]]},
{"description":"Start Tag w/attribute",
"input":"<h a='b'>",
"output":[["StartTag", "h", {"a":"b"}]]},
{"description":"Start Tag w/attribute no quotes",
"input":"<h a=b>",
"output":[["StartTag", "h", {"a":"b"}]]},
{"description":"Start/End Tag",
"input":"<h></h>",
"output":[["StartTag", "h", {}], ["EndTag", "h"]]},
{"description":"Two unclosed start tags",
"input":"<p>One<p>Two",
"output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]},
{"description":"End Tag w/attribute",
"input":"<h></h a='b'>",
"output":[["StartTag", "h", {}], "ParseError", ["EndTag", "h"]]},
{"description":"Multiple atts",
"input":"<h a='b' c='d'>",
"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
{"description":"Multiple atts no space",
"input":"<h a='b'c='d'>",
"output":["ParseError", ["StartTag", "h", {"a":"b", "c":"d"}]]},
{"description":"Repeated attr",
"input":"<h a='b' a='d'>",
"output":["ParseError", ["StartTag", "h", {"a":"b"}]]},
{"description":"Simple comment",
"input":"<!--comment-->",
"output":[["Comment", "comment"]]},
{"description":"Comment, Central dash no space",
"input":"<!----->",
"output":["ParseError", ["Comment", "-"]]},
{"description":"Comment, two central dashes",
"input":"<!-- --comment -->",
"output":["ParseError", ["Comment", " --comment "]]},
{"description":"Unfinished comment",
"input":"<!--comment",
"output":["ParseError", ["Comment", "comment"]]},
{"description":"Start of a comment",
"input":"<!-",
"output":["ParseError", ["Comment", "-"]]},
{"description":"Short comment",
"input":"<!-->",
"output":["ParseError", ["Comment", ""]]},
{"description":"Short comment two",
"input":"<!--->",
"output":["ParseError", ["Comment", ""]]},
{"description":"Short comment three",
"input":"<!---->",
"output":[["Comment", ""]]},
{"description":"Ampersand EOF",
"input":"&",
"output":[["Character", "&"]]},
{"description":"Ampersand ampersand EOF",
"input":"&&",
"output":[["Character", "&&"]]},
{"description":"Ampersand space EOF",
"input":"& ",
"output":[["Character", "& "]]},
{"description":"Unfinished entity",
"input":"&f",
"output":[["Character", "&f"]]},
{"description":"Ampersand, number sign",
"input":"&#",
"output":["ParseError", ["Character", "&#"]]},
{"description":"Unfinished numeric entity",
"input":"&#x",
"output":["ParseError", ["Character", "&#x"]]},
{"description":"Entity with trailing semicolon (1)",
"input":"I'm &not;it",
"output":[["Character","I'm \u00ACit"]]},
{"description":"Entity with trailing semicolon (2)",
"input":"I'm &notin;",
"output":[["Character","I'm \u2209"]]},
{"description":"Entity without trailing semicolon (1)",
"input":"I'm &notit",
"output":[["Character","I'm "], "ParseError", ["Character", "\u00ACit"]]},
{"description":"Entity without trailing semicolon (2)",
"input":"I'm &notin",
"output":[["Character","I'm "], "ParseError", ["Character", "\u00ACin"]]},
{"description":"Partial entity match at end of file",
"input":"I'm &no",
"output":[["Character","I'm &no"]]},
{"description":"Non-ASCII character reference name",
"input":"&\u00AC;",
"output":[["Character", "&\u00AC;"]]},
{"description":"ASCII decimal entity",
"input":"&#0036;",
"output":[["Character","$"]]},
{"description":"ASCII hexadecimal entity",
"input":"&#x3f;",
"output":[["Character","?"]]},
{"description":"Hexadecimal entity in attribute",
"input":"<h a='&#x3f;'></h>",
"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
{"description":"Entity in attribute without semicolon ending in x",
"input":"<h a='&notx'>",
"output":[["StartTag", "h", {"a":"&notx"}]]},
{"description":"Entity in attribute without semicolon ending in 1",
"input":"<h a='&not1'>",
"output":[["StartTag", "h", {"a":"&not1"}]]},
{"description":"Entity in attribute without semicolon ending in i",
"input":"<h a='&noti'>",
"output":[["StartTag", "h", {"a":"&noti"}]]},
{"description":"Entity in attribute without semicolon",
"input":"<h a='&COPY'>",
"output":["ParseError", ["StartTag", "h", {"a":"\u00A9"}]]},
{"description":"Unquoted attribute ending in ampersand",
"input":"<s o=& t>",
"output":[["StartTag","s",{"o":"&","t":""}]]},
{"description":"Unquoted attribute at end of tag with final character of &, with tag followed by characters",
"input":"<a a=a&>foo",
"output":[["StartTag", "a", {"a":"a&"}], ["Character", "foo"]]},
{"description":"plaintext element",
"input":"<plaintext>foobar",
"output":[["StartTag","plaintext",{}], ["Character","foobar"]]},
{"description":"Open angled bracket in unquoted attribute value state",
"input":"<a a=f<>",
"output":["ParseError", ["StartTag", "a", {"a":"f<"}]]}
]}

@ -1,179 +0,0 @@
{"tests": [
{"description":"DOCTYPE without name",
"input":"<!DOCTYPE>",
"output":["ParseError", "ParseError", ["DOCTYPE", null, null, null, false]]},
{"description":"DOCTYPE without space before name",
"input":"<!DOCTYPEhtml>",
"output":["ParseError", ["DOCTYPE", "html", null, null, true]]},
{"description":"Incorrect DOCTYPE without a space before name",
"input":"<!DOCTYPEfoo>",
"output":["ParseError", ["DOCTYPE", "foo", null, null, true]]},
{"description":"DOCTYPE with publicId",
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", null, true]]},
{"description":"DOCTYPE with EOF after PUBLIC",
"input":"<!DOCTYPE html PUBLIC",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"DOCTYPE with EOF after PUBLIC '",
"input":"<!DOCTYPE html PUBLIC '",
"output":["ParseError", ["DOCTYPE", "html", "", null, false]]},
{"description":"DOCTYPE with EOF after PUBLIC 'x",
"input":"<!DOCTYPE html PUBLIC 'x",
"output":["ParseError", ["DOCTYPE", "html", "x", null, false]]},
{"description":"DOCTYPE with systemId",
"input":"<!DOCTYPE html SYSTEM \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
{"description":"DOCTYPE with publicId and systemId",
"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\" \"-//W3C//DTD HTML Transitional 4.01//EN\">",
"output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
{"description":"DOCTYPE with > in double-quoted publicId",
"input":"<!DOCTYPE html PUBLIC \">x",
"output":["ParseError", ["DOCTYPE", "html", "", null, false], ["Character", "x"]]},
{"description":"DOCTYPE with > in single-quoted publicId",
"input":"<!DOCTYPE html PUBLIC '>x",
"output":["ParseError", ["DOCTYPE", "html", "", null, false], ["Character", "x"]]},
{"description":"DOCTYPE with > in double-quoted systemId",
"input":"<!DOCTYPE html PUBLIC \"foo\" \">x",
"output":["ParseError", ["DOCTYPE", "html", "foo", "", false], ["Character", "x"]]},
{"description":"DOCTYPE with > in single-quoted systemId",
"input":"<!DOCTYPE html PUBLIC 'foo' '>x",
"output":["ParseError", ["DOCTYPE", "html", "foo", "", false], ["Character", "x"]]},
{"description":"Incomplete doctype",
"input":"<!DOCTYPE html ",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Numeric entity representing the NUL character",
"input":"&#0000;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing the NUL character",
"input":"&#x0000;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#2225222;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
"input":"&#x1010FFFF;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity pair representing a surrogate pair",
"input":"&#xD869;&#xDED6;",
"output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
{"description":"Hexadecimal entity with mixed uppercase and lowercase",
"input":"&#xaBcD;",
"output":[["Character", "\uABCD"]]},
{"description":"Entity without a name",
"input":"&;",
"output":[["Character", "&;"]]},
{"description":"Unescaped ampersand in attribute value",
"input":"<h a='&'>",
"output":[["StartTag", "h", { "a":"&" }]]},
{"description":"StartTag containing <",
"input":"<a<b>",
"output":[["StartTag", "a<b", { }]]},
{"description":"Non-void element containing trailing /",
"input":"<h/>",
"output":[["StartTag","h",{},true]]},
{"description":"Void element with permitted slash",
"input":"<br/>",
"output":[["StartTag","br",{},true]]},
{"description":"Void element with permitted slash (with attribute)",
"input":"<br foo='bar'/>",
"output":[["StartTag","br",{"foo":"bar"},true]]},
{"description":"StartTag containing /",
"input":"<h/a='b'>",
"output":["ParseError", ["StartTag", "h", { "a":"b" }]]},
{"description":"Double-quoted attribute value",
"input":"<h a=\"b\">",
"output":[["StartTag", "h", { "a":"b" }]]},
{"description":"Unescaped </",
"input":"</",
"output":["ParseError", ["Character", "</"]]},
{"description":"Illegal end tag name",
"input":"</1>",
"output":["ParseError", ["Comment", "1"]]},
{"description":"Simili processing instruction",
"input":"<?namespace>",
"output":["ParseError", ["Comment", "?namespace"]]},
{"description":"A bogus comment stops at >, even if preceeded by two dashes",
"input":"<?foo-->",
"output":["ParseError", ["Comment", "?foo--"]]},
{"description":"Unescaped <",
"input":"foo < bar",
"output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]},
{"description":"Null Byte Replacement",
"input":"\u0000",
"output":["ParseError", ["Character", "\u0000"]]},
{"description":"Comment with dash",
"input":"<!---x",
"output":["ParseError", ["Comment", "-x"]]},
{"description":"Entity + newline",
"input":"\nx\n&gt;\n",
"output":[["Character","\nx\n>\n"]]},
{"description":"Start tag with no attributes but space before the greater-than sign",
"input":"<h >",
"output":[["StartTag", "h", {}]]},
{"description":"Empty attribute followed by uppercase attribute",
"input":"<h a B=''>",
"output":[["StartTag", "h", {"a":"", "b":""}]]},
{"description":"Double-quote after attribute name",
"input":"<h a \">",
"output":["ParseError", ["StartTag", "h", {"a":"", "\"":""}]]},
{"description":"Single-quote after attribute name",
"input":"<h a '>",
"output":["ParseError", ["StartTag", "h", {"a":"", "'":""}]]},
{"description":"Empty end tag with following characters",
"input":"a</>bc",
"output":[["Character", "a"], "ParseError", ["Character", "bc"]]},
{"description":"Empty end tag with following tag",
"input":"a</><b>c",
"output":[["Character", "a"], "ParseError", ["StartTag", "b", {}], ["Character", "c"]]},
{"description":"Empty end tag with following comment",
"input":"a</><!--b-->c",
"output":[["Character", "a"], "ParseError", ["Comment", "b"], ["Character", "c"]]},
{"description":"Empty end tag with following end tag",
"input":"a</></b>c",
"output":[["Character", "a"], "ParseError", ["EndTag", "b"], ["Character", "c"]]}
]}

File diff suppressed because it is too large Load Diff

@ -1,344 +0,0 @@
{"tests": [
{"description":"< in attribute name",
"input":"<z/0 <>",
"output":["ParseError", "ParseError", ["StartTag", "z", {"0": "", "<": ""}]]},
{"description":"< in attribute value",
"input":"<z x=<>",
"output":["ParseError", ["StartTag", "z", {"x": "<"}]]},
{"description":"= in unquoted attribute value",
"input":"<z z=z=z>",
"output":["ParseError", ["StartTag", "z", {"z": "z=z"}]]},
{"description":"= attribute",
"input":"<z =>",
"output":["ParseError", ["StartTag", "z", {"=": ""}]]},
{"description":"== attribute",
"input":"<z ==>",
"output":["ParseError", "ParseError", ["StartTag", "z", {"=": ""}]]},
{"description":"=== attribute",
"input":"<z ===>",
"output":["ParseError", "ParseError", ["StartTag", "z", {"=": "="}]]},
{"description":"==== attribute",
"input":"<z ====>",
"output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"=": "=="}]]},
{"description":"\" after ampersand in double-quoted attribute value",
"input":"<z z=\"&\">",
"output":[["StartTag", "z", {"z": "&"}]]},
{"description":"' after ampersand in double-quoted attribute value",
"input":"<z z=\"&'\">",
"output":[["StartTag", "z", {"z": "&'"}]]},
{"description":"' after ampersand in single-quoted attribute value",
"input":"<z z='&'>",
"output":[["StartTag", "z", {"z": "&"}]]},
{"description":"\" after ampersand in single-quoted attribute value",
"input":"<z z='&\"'>",
"output":[["StartTag", "z", {"z": "&\""}]]},
{"description":"Text after bogus character reference",
"input":"<z z='&xlink_xmlns;'>bar<z>",
"output":[["StartTag","z",{"z":"&xlink_xmlns;"}],["Character","bar"],["StartTag","z",{}]]},
{"description":"Text after hex character reference",
"input":"<z z='&#x0020; foo'>bar<z>",
"output":[["StartTag","z",{"z":" foo"}],["Character","bar"],["StartTag","z",{}]]},
{"description":"Attribute name starting with \"",
"input":"<foo \"='bar'>",
"output":["ParseError", ["StartTag", "foo", {"\"": "bar"}]]},
{"description":"Attribute name starting with '",
"input":"<foo '='bar'>",
"output":["ParseError", ["StartTag", "foo", {"'": "bar"}]]},
{"description":"Attribute name containing \"",
"input":"<foo a\"b='bar'>",
"output":["ParseError", ["StartTag", "foo", {"a\"b": "bar"}]]},
{"description":"Attribute name containing '",
"input":"<foo a'b='bar'>",
"output":["ParseError", ["StartTag", "foo", {"a'b": "bar"}]]},
{"description":"Unquoted attribute value containing '",
"input":"<foo a=b'c>",
"output":["ParseError", ["StartTag", "foo", {"a": "b'c"}]]},
{"description":"Unquoted attribute value containing \"",
"input":"<foo a=b\"c>",
"output":["ParseError", ["StartTag", "foo", {"a": "b\"c"}]]},
{"description":"Double-quoted attribute value not followed by whitespace",
"input":"<foo a=\"b\"c>",
"output":["ParseError", ["StartTag", "foo", {"a": "b", "c": ""}]]},
{"description":"Single-quoted attribute value not followed by whitespace",
"input":"<foo a='b'c>",
"output":["ParseError", ["StartTag", "foo", {"a": "b", "c": ""}]]},
{"description":"Quoted attribute followed by permitted /",
"input":"<br a='b'/>",
"output":[["StartTag","br",{"a":"b"},true]]},
{"description":"Quoted attribute followed by non-permitted /",
"input":"<bar a='b'/>",
"output":[["StartTag","bar",{"a":"b"},true]]},
{"description":"CR EOF after doctype name",
"input":"<!doctype html \r",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"CR EOF in tag name",
"input":"<z\r",
"output":["ParseError"]},
{"description":"Slash EOF in tag name",
"input":"<z/",
"output":["ParseError"]},
{"description":"Zero hex numeric entity",
"input":"&#x0",
"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
{"description":"Zero decimal numeric entity",
"input":"&#0",
"output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
{"description":"Zero-prefixed hex numeric entity",
"input":"&#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000041;",
"output":[["Character", "A"]]},
{"description":"Zero-prefixed decimal numeric entity",
"input":"&#000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000065;",
"output":[["Character", "A"]]},
{"description":"Empty hex numeric entities",
"input":"&#x &#X ",
"output":["ParseError", ["Character", "&#x "], "ParseError", ["Character", "&#X "]]},
{"description":"Empty decimal numeric entities",
"input":"&# &#; ",
"output":["ParseError", ["Character", "&# "], "ParseError", ["Character", "&#; "]]},
{"description":"Non-BMP numeric entity",
"input":"&#x10000;",
"output":[["Character", "\uD800\uDC00"]]},
{"description":"Maximum non-BMP numeric entity",
"input":"&#X10FFFF;",
"output":["ParseError", ["Character", "\uDBFF\uDFFF"]]},
{"description":"Above maximum numeric entity",
"input":"&#x110000;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"32-bit hex numeric entity",
"input":"&#x80000041;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"33-bit hex numeric entity",
"input":"&#x100000041;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"33-bit decimal numeric entity",
"input":"&#4294967361;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"65-bit hex numeric entity",
"input":"&#x10000000000000041;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"65-bit decimal numeric entity",
"input":"&#18446744073709551681;",
"output":["ParseError", ["Character", "\uFFFD"]]},
{"description":"Surrogate code point edge cases",
"input":"&#xD7FF;&#xD800;&#xD801;&#xDFFE;&#xDFFF;&#xE000;",
"output":[["Character", "\uD7FF"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD\uE000"]]},
{"description":"Uppercase start tag name",
"input":"<X>",
"output":[["StartTag", "x", {}]]},
{"description":"Uppercase end tag name",
"input":"</X>",
"output":[["EndTag", "x"]]},
{"description":"Uppercase attribute name",
"input":"<x X>",
"output":[["StartTag", "x", { "x":"" }]]},
{"description":"Tag/attribute name case edge values",
"input":"<x@AZ[`az{ @AZ[`az{>",
"output":[["StartTag", "x@az[`az{", { "@az[`az{":"" }]]},
{"description":"Duplicate different-case attributes",
"input":"<x x=1 x=2 X=3>",
"output":["ParseError", "ParseError", ["StartTag", "x", { "x":"1" }]]},
{"description":"Uppercase close tag attributes",
"input":"</x X>",
"output":["ParseError", ["EndTag", "x"]]},
{"description":"Duplicate close tag attributes",
"input":"</x x x>",
"output":["ParseError", "ParseError", ["EndTag", "x"]]},
{"description":"Permitted slash",
"input":"<br/>",
"output":[["StartTag","br",{},true]]},
{"description":"Non-permitted slash",
"input":"<xr/>",
"output":[["StartTag","xr",{},true]]},
{"description":"Permitted slash but in close tag",
"input":"</br/>",
"output":["ParseError", ["EndTag", "br"]]},
{"description":"Doctype public case-sensitivity (1)",
"input":"<!DoCtYpE HtMl PuBlIc \"AbC\" \"XyZ\">",
"output":[["DOCTYPE", "html", "AbC", "XyZ", true]]},
{"description":"Doctype public case-sensitivity (2)",
"input":"<!dOcTyPe hTmL pUbLiC \"aBc\" \"xYz\">",
"output":[["DOCTYPE", "html", "aBc", "xYz", true]]},
{"description":"Doctype system case-sensitivity (1)",
"input":"<!DoCtYpE HtMl SyStEm \"XyZ\">",
"output":[["DOCTYPE", "html", null, "XyZ", true]]},
{"description":"Doctype system case-sensitivity (2)",
"input":"<!dOcTyPe hTmL sYsTeM \"xYz\">",
"output":[["DOCTYPE", "html", null, "xYz", true]]},
{"description":"U+0000 in lookahead region after non-matching character",
"input":"<!doc>\u0000",
"output":["ParseError", ["Comment", "doc"], "ParseError", ["Character", "\u0000"]],
"ignoreErrorOrder":true},
{"description":"U+0000 in lookahead region",
"input":"<!doc\u0000",
"output":["ParseError", ["Comment", "doc\uFFFD"]],
"ignoreErrorOrder":true},
{"description":"U+0080 in lookahead region",
"input":"<!doc\u0080",
"output":["ParseError", "ParseError", ["Comment", "doc\u0080"]],
"ignoreErrorOrder":true},
{"description":"U+FDD1 in lookahead region",
"input":"<!doc\uFDD1",
"output":["ParseError", "ParseError", ["Comment", "doc\uFDD1"]],
"ignoreErrorOrder":true},
{"description":"U+1FFFF in lookahead region",
"input":"<!doc\uD83F\uDFFF",
"output":["ParseError", "ParseError", ["Comment", "doc\uD83F\uDFFF"]],
"ignoreErrorOrder":true},
{"description":"CR followed by non-LF",
"input":"\r?",
"output":[["Character", "\n?"]]},
{"description":"CR at EOF",
"input":"\r",
"output":[["Character", "\n"]]},
{"description":"LF at EOF",
"input":"\n",
"output":[["Character", "\n"]]},
{"description":"CR LF",
"input":"\r\n",
"output":[["Character", "\n"]]},
{"description":"CR CR",
"input":"\r\r",
"output":[["Character", "\n\n"]]},
{"description":"LF LF",
"input":"\n\n",
"output":[["Character", "\n\n"]]},
{"description":"LF CR",
"input":"\n\r",
"output":[["Character", "\n\n"]]},
{"description":"text CR CR CR text",
"input":"text\r\r\rtext",
"output":[["Character", "text\n\n\ntext"]]},
{"description":"Doctype publik",
"input":"<!DOCTYPE html PUBLIK \"AbC\" \"XyZ\">",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype publi",
"input":"<!DOCTYPE html PUBLI",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype sistem",
"input":"<!DOCTYPE html SISTEM \"AbC\">",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype sys",
"input":"<!DOCTYPE html SYS",
"output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
{"description":"Doctype html x>text",
"input":"<!DOCTYPE html x>text",
"output":["ParseError", ["DOCTYPE", "html", null, null, false], ["Character", "text"]]},
{"description":"Grave accent in unquoted attribute",
"input":"<a a=aa`>",
"output":["ParseError", ["StartTag", "a", {"a":"aa`"}]]},
{"description":"EOF in tag name state ",
"input":"<a",
"output":["ParseError"]},
{"description":"EOF in tag name state",
"input":"<a",
"output":["ParseError"]},
{"description":"EOF in before attribute name state",
"input":"<a ",
"output":["ParseError"]},
{"description":"EOF in attribute name state",
"input":"<a a",
"output":["ParseError"]},
{"description":"EOF in after attribute name state",
"input":"<a a ",
"output":["ParseError"]},
{"description":"EOF in before attribute value state",
"input":"<a a =",
"output":["ParseError"]},
{"description":"EOF in attribute value (double quoted) state",
"input":"<a a =\"a",
"output":["ParseError"]},
{"description":"EOF in attribute value (single quoted) state",
"input":"<a a ='a",
"output":["ParseError"]},
{"description":"EOF in attribute value (unquoted) state",
"input":"<a a =a",
"output":["ParseError"]},
{"description":"EOF in after attribute value state",
"input":"<a a ='a'",
"output":["ParseError"]}
]}

File diff suppressed because it is too large Load Diff

@ -1,31 +0,0 @@
{"tests" : [
{"description": "Invalid Unicode character U+DFFF",
"doubleEscaped":true,
"input": "\\uDFFF",
"output":["ParseError", ["Character", "\\uDFFF"]],
"ignoreErrorOrder":true},
{"description": "Invalid Unicode character U+D800",
"doubleEscaped":true,
"input": "\\uD800",
"output":["ParseError", ["Character", "\\uD800"]],
"ignoreErrorOrder":true},
{"description": "Invalid Unicode character U+DFFF with valid preceding character",
"doubleEscaped":true,
"input": "a\\uDFFF",
"output":[["Character", "a"], "ParseError", ["Character", "\\uDFFF"]],
"ignoreErrorOrder":true},
{"description": "Invalid Unicode character U+D800 with valid following character",
"doubleEscaped":true,
"input": "\\uD800a",
"output":["ParseError", ["Character", "\\uD800a"]],
"ignoreErrorOrder":true},
{"description":"CR followed by U+0000",
"input":"\r\u0000",
"output":[["Character", "\n"], "ParseError", ["Character", "\u0000"]],
"ignoreErrorOrder":true}
]
}

@ -1,22 +0,0 @@
{"xmlViolationTests": [
{"description":"Non-XML character",
"input":"a\uFFFFb",
"ignoreErrorOrder":true,
"output":["ParseError",["Character","a\uFFFDb"]]},
{"description":"Non-XML space",
"input":"a\u000Cb",
"ignoreErrorOrder":true,
"output":[["Character","a b"]]},
{"description":"Double hyphen in comment",
"input":"<!-- foo -- bar -->",
"output":["ParseError",["Comment"," foo - - bar "]]},
{"description":"FF between attributes",
"input":"<a b=''\u000Cc=''>",
"output":[["StartTag","a",{"b":"","c":""}]]}
]}

@ -1,104 +0,0 @@
Tree Construction Tests
=======================
Each file containing tree construction tests consists of any number of
tests separated by two newlines (LF) and a single newline before the end
of the file. For instance:
[TEST]LF
LF
[TEST]LF
LF
[TEST]LF
Where [TEST] is the following format:
Each test must begin with a string "\#data" followed by a newline (LF).
All subsequent lines until a line that says "\#errors" are the test data
and must be passed to the system being tested unchanged, except with the
final newline (on the last line) removed.
Then there must be a line that says "\#errors". It must be followed by
one line per parse error that a conformant checker would return. It
doesn't matter what those lines are, although they can't be
"\#document-fragment", "\#document", "\#script-off", "\#script-on", or
empty, the only thing that matters is that there be the right number
of parse errors.
Then there \*may\* be a line that says "\#document-fragment", which must
be followed by a newline (LF), followed by a string of characters that
indicates the context element, followed by a newline (LF). If the string
of characters starts with "svg ", the context element is in the SVG
namespace and the substring after "svg " is the local name. If the
string of characters starts with "math ", the context element is in the
MathML namespace and the substring after "math " is the local name.
Otherwise, the context element is in the HTML namespace and the string
is the local name. If this line is present the "\#data" must be parsed
using the HTML fragment parsing algorithm with the context element as
context.
Then there \*may\* be a line that says "\#script-off" or
"\#script-on". If a line that says "\#script-off" is present, the
parser must set the scripting flag to disabled. If a line that says
"\#script-on" is present, it must set it to enabled. Otherwise, the
test should be run in both modes.
Then there must be a line that says "\#document", which must be followed
by a dump of the tree of the parsed DOM. Each node must be represented
by a single line. Each line must start with "| ", followed by two spaces
per parent node that the node has before the root document node.
- Element nodes must be represented by a "`<`" then the *tag name
string* "`>`", and all the attributes must be given, sorted
lexicographically by UTF-16 code unit according to their *attribute
name string*, on subsequent lines, as if they were children of the
element node.
- Attribute nodes must have the *attribute name string*, then an "="
sign, then the attribute value in double quotes (").
- Text nodes must be the string, in double quotes. Newlines aren't
escaped.
- Comments must be "`<`" then "`!-- `" then the data then "` -->`".
- DOCTYPEs must be "`<!DOCTYPE `" then the name then if either of the
system id or public id is non-empty a space, public id in
double-quotes, another space an the system id in double-quotes, and
then in any case "`>`".
- Processing instructions must be "`<?`", then the target, then a
space, then the data and then "`>`". (The HTML parser cannot emit
processing instructions, but scripts can, and the WebVTT to DOM
rules can emit them.)
- Template contents are represented by the string "content" with the
children below it.
The *tag name string* is the local name prefixed by a namespace
designator. For the HTML namespace, the namespace designator is the
empty string, i.e. there's no prefix. For the SVG namespace, the
namespace designator is "svg ". For the MathML namespace, the namespace
designator is "math ".
The *attribute name string* is the local name prefixed by a namespace
designator. For no namespace, the namespace designator is the empty
string, i.e. there's no prefix. For the XLink namespace, the namespace
designator is "xlink ". For the XML namespace, the namespace designator
is "xml ". For the XMLNS namespace, the namespace designator is "xmlns
". Note the difference between "xlink:href" which is an attribute in no
namespace with the local name "xlink:href" and "xlink href" which is an
attribute in the xlink namespace with the local name "href".
If there is also a "\#document-fragment" the bit following "\#document"
must be a representation of the HTML fragment serialization for the
context element given by "\#document-fragment".
For example:
#data
<p>One<p>Two
#errors
3: Missing document type declaration
#document
| <html>
| <head>
| <body>
| <p>
| "One"
| <p>
| "Two"

@ -1,354 +0,0 @@
#data
<a><p></a></p>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,10): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| <p>
| <a>
#data
<a>1<p>2</a>3</p>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,12): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <p>
| <a>
| "2"
| "3"
#data
<a>1<button>2</a>3</button>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,17): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <button>
| <a>
| "2"
| "3"
#data
<a>1<b>2</a>3</b>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,12): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <b>
| "2"
| <b>
| "3"
#data
<a>1<div>2<div>3</a>4</div>5</div>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,20): adoption-agency-1.3
(1,20): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <div>
| <a>
| "2"
| <div>
| <a>
| "3"
| "4"
| "5"
#data
<table><a>1<p>2</a>3</p>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,10): unexpected-start-tag-implies-table-voodoo
(1,11): unexpected-character-implies-table-voodoo
(1,14): unexpected-start-tag-implies-table-voodoo
(1,15): unexpected-character-implies-table-voodoo
(1,19): unexpected-end-tag-implies-table-voodoo
(1,19): adoption-agency-1.3
(1,20): unexpected-character-implies-table-voodoo
(1,24): unexpected-end-tag-implies-table-voodoo
(1,24): eof-in-table
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <p>
| <a>
| "2"
| "3"
| <table>
#data
<b><b><a><p></a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,16): adoption-agency-1.3
(1,16): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <b>
| <b>
| <a>
| <p>
| <a>
#data
<b><a><b><p></a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,16): adoption-agency-1.3
(1,16): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <b>
| <a>
| <b>
| <b>
| <p>
| <a>
#data
<a><b><b><p></a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,16): adoption-agency-1.3
(1,16): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <a>
| <b>
| <b>
| <b>
| <b>
| <p>
| <a>
#data
<p>1<s id="A">2<b id="B">3</p>4</s>5</b>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,30): unexpected-end-tag
(1,35): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <p>
| "1"
| <s>
| id="A"
| "2"
| <b>
| id="B"
| "3"
| <s>
| id="A"
| <b>
| id="B"
| "4"
| <b>
| id="B"
| "5"
#data
<table><a>1<td>2</td>3</table>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,10): unexpected-start-tag-implies-table-voodoo
(1,11): unexpected-character-implies-table-voodoo
(1,15): unexpected-cell-in-table-body
(1,30): unexpected-implied-end-tag-in-table-view
#document
| <html>
| <head>
| <body>
| <a>
| "1"
| <a>
| "3"
| <table>
| <tbody>
| <tr>
| <td>
| "2"
#data
<table>A<td>B</td>C</table>
#errors
(1,7): expected-doctype-but-got-start-tag
(1,8): unexpected-character-implies-table-voodoo
(1,12): unexpected-cell-in-table-body
(1,22): unexpected-character-implies-table-voodoo
#document
| <html>
| <head>
| <body>
| "AC"
| <table>
| <tbody>
| <tr>
| <td>
| "B"
#data
<a><svg><tr><input></a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,23): unexpected-end-tag
(1,23): adoption-agency-1.3
#document
| <html>
| <head>
| <body>
| <a>
| <svg svg>
| <svg tr>
| <svg input>
#data
<div><a><b><div><div><div><div><div><div><div><div><div><div></a>
#errors
(1,5): expected-doctype-but-got-start-tag
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): adoption-agency-1.3
(1,65): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <div>
| <a>
| <b>
| <b>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <a>
| <div>
| <div>
#data
<div><a><b><u><i><code><div></a>
#errors
(1,5): expected-doctype-but-got-start-tag
(1,32): adoption-agency-1.3
(1,32): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <div>
| <a>
| <b>
| <u>
| <i>
| <code>
| <u>
| <i>
| <code>
| <div>
| <a>
#data
<b><b><b><b>x</b></b></b></b>y
#errors
(1,3): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <b>
| <b>
| <b>
| <b>
| "x"
| "y"
#data
<p><b><b><b><b><p>x
#errors
(1,3): expected-doctype-but-got-start-tag
(1,18): unexpected-end-tag
(1,19): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <p>
| <b>
| <b>
| <b>
| <b>
| <p>
| <b>
| <b>
| <b>
| "x"
#data
<b><em><foo><foob><fooc><aside></b></em>
#errors
(1,35): adoption-agency-1.3
(1,40): adoption-agency-1.3
(1,40): expected-closing-tag-but-got-eof
#document-fragment
div
#document
| <b>
| <em>
| <foo>
| <foob>
| <fooc>
| <aside>
| <b>

@ -1,39 +0,0 @@
#data
<b>1<i>2<p>3</b>4
#errors
(1,3): expected-doctype-but-got-start-tag
(1,16): adoption-agency-1.3
(1,17): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <b>
| "1"
| <i>
| "2"
| <i>
| <p>
| <b>
| "3"
| "4"
#data
<a><div><style></style><address><a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,35): unexpected-start-tag-implies-end-tag
(1,35): adoption-agency-1.3
(1,35): adoption-agency-1.3
(1,35): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <a>
| <div>
| <a>
| <style>
| <address>
| <a>
| <a>

@ -1,178 +0,0 @@
#data
FOO<!-- BAR -->BAZ
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -->
| "BAZ"
#data
FOO<!-- BAR --!>BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-bang-after-double-dash-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -->
| "BAZ"
#data
FOO<!-- BAR -- >BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-char-in-comment
(1,21): eof-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -- >BAZ -->
#data
FOO<!-- BAR -- <QUX> -- MUX -->BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-char-in-comment
(1,24): unexpected-char-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -- <QUX> -- MUX -->
| "BAZ"
#data
FOO<!-- BAR -- <QUX> -- MUX --!>BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-char-in-comment
(1,24): unexpected-char-in-comment
(1,31): unexpected-bang-after-double-dash-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -- <QUX> -- MUX -->
| "BAZ"
#data
FOO<!-- BAR -- <QUX> -- MUX -- >BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,15): unexpected-char-in-comment
(1,24): unexpected-char-in-comment
(1,31): unexpected-char-in-comment
(1,35): eof-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- BAR -- <QUX> -- MUX -- >BAZ -->
#data
FOO<!---->BAZ
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- -->
| "BAZ"
#data
FOO<!--->BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,9): incorrect-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- -->
| "BAZ"
#data
FOO<!-->BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,8): incorrect-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- -->
| "BAZ"
#data
<?xml version="1.0">Hi
#errors
(1,1): expected-tag-name-but-got-question-mark
(1,22): expected-doctype-but-got-chars
#document
| <!-- ?xml version="1.0" -->
| <html>
| <head>
| <body>
| "Hi"
#data
<?xml version="1.0">
#errors
(1,1): expected-tag-name-but-got-question-mark
(1,20): expected-doctype-but-got-eof
#document
| <!-- ?xml version="1.0" -->
| <html>
| <head>
| <body>
#data
<?xml version
#errors
(1,1): expected-tag-name-but-got-question-mark
(1,13): expected-doctype-but-got-eof
#document
| <!-- ?xml version -->
| <html>
| <head>
| <body>
#data
FOO<!----->BAZ
#errors
(1,3): expected-doctype-but-got-chars
(1,10): unexpected-dash-after-double-dash-in-comment
#document
| <html>
| <head>
| <body>
| "FOO"
| <!-- - -->
| "BAZ"
#data
<html><!-- comment --><title>Comment before head</title>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <!-- comment -->
| <head>
| <title>
| "Comment before head"
| <body>

@ -1,424 +0,0 @@
#data
<!DOCTYPE html>Hello
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| "Hello"
#data
<!dOctYpE HtMl>Hello
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPEhtml>Hello
#errors
(1,9): need-space-after-doctype
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE>Hello
#errors
(1,9): need-space-after-doctype
(1,10): expected-doctype-name-but-got-right-bracket
(1,10): unknown-doctype
#document
| <!DOCTYPE >
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE >Hello
#errors
(1,11): expected-doctype-name-but-got-right-bracket
(1,11): unknown-doctype
#document
| <!DOCTYPE >
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato>Hello
#errors
(1,17): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato >Hello
#errors
(1,18): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato taco>Hello
#errors
(1,17): expected-space-or-right-bracket-in-doctype
(1,22): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato taco "ddd>Hello
#errors
(1,17): expected-space-or-right-bracket-in-doctype
(1,27): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato sYstEM>Hello
#errors
(1,24): unexpected-char-in-doctype
(1,24): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato sYstEM >Hello
#errors
(1,28): unexpected-char-in-doctype
(1,28): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato sYstEM ggg>Hello
#errors
(1,34): unexpected-char-in-doctype
(1,37): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEM taco >Hello
#errors
(1,25): unexpected-char-in-doctype
(1,31): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEM 'taco"'>Hello
#errors
(1,32): unknown-doctype
#document
| <!DOCTYPE potato "" "taco"">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEM "taco">Hello
#errors
(1,31): unknown-doctype
#document
| <!DOCTYPE potato "" "taco">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEM "tai'co">Hello
#errors
(1,33): unknown-doctype
#document
| <!DOCTYPE potato "" "tai'co">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato SYSTEMtaco "ddd">Hello
#errors
(1,24): unexpected-char-in-doctype
(1,34): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato grass SYSTEM taco>Hello
#errors
(1,17): expected-space-or-right-bracket-in-doctype
(1,35): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato pUbLIc>Hello
#errors
(1,24): unexpected-end-of-doctype
(1,24): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato pUbLIc >Hello
#errors
(1,25): unexpected-end-of-doctype
(1,25): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato pUbLIcgoof>Hello
#errors
(1,24): unexpected-char-in-doctype
(1,28): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC goof>Hello
#errors
(1,25): unexpected-char-in-doctype
(1,29): unknown-doctype
#document
| <!DOCTYPE potato>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC "go'of">Hello
#errors
(1,32): unknown-doctype
#document
| <!DOCTYPE potato "go'of" "">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC 'go'of'>Hello
#errors
(1,29): unexpected-char-in-doctype
(1,32): unknown-doctype
#document
| <!DOCTYPE potato "go" "">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC 'go:hh of' >Hello
#errors
(1,38): unknown-doctype
#document
| <!DOCTYPE potato "go:hh of" "">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE potato PUBLIC "W3C-//dfdf" SYSTEM ggg>Hello
#errors
(1,38): unexpected-char-in-doctype
(1,48): unknown-doctype
#document
| <!DOCTYPE potato "W3C-//dfdf" "">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">Hello
#errors
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE ...>Hello
#errors
(1,14): unknown-doctype
#document
| <!DOCTYPE ...>
| <html>
| <head>
| <body>
| "Hello"
#data
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
#errors
(2,58): unknown-doctype
#document
| <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
#errors
(2,54): unknown-doctype
#document
| <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE root-element [SYSTEM OR PUBLIC FPI] "uri" [
<!-- internal declarations -->
]>
#errors
(1,23): expected-space-or-right-bracket-in-doctype
(2,30): unknown-doctype
#document
| <!DOCTYPE root-element>
| <html>
| <head>
| <body>
| "]>"
#data
<!DOCTYPE html PUBLIC
"-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
"http://www.wapforum.org/DTD/xhtml-mobile10.dtd">
#errors
(3,53): unknown-doctype
#document
| <!DOCTYPE html "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"><body><b>Mine!</b></body>
#errors
(1,63): unknown-doctype
#document
| <!DOCTYPE html "" "http://www.w3.org/DTD/HTML4-strict.dtd">
| <html>
| <head>
| <body>
| <b>
| "Mine!"
#data
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">
#errors
(1,50): unexpected-char-in-doctype
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'>
#errors
(1,50): unexpected-char-in-doctype
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML PUBLIC"-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'>
#errors
(1,21): unexpected-char-in-doctype
(1,49): unexpected-char-in-doctype
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>
#data
<!DOCTYPE HTML PUBLIC'-//W3C//DTD HTML 4.01//EN''http://www.w3.org/TR/html4/strict.dtd'>
#errors
(1,21): unexpected-char-in-doctype
(1,49): unexpected-char-in-doctype
#document
| <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
| <html>
| <head>
| <body>

@ -1,795 +0,0 @@
#data
FOO&gt;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO>BAR"
#data
FOO&gtBAR
#errors
(1,3): expected-doctype-but-got-chars
(1,6): named-entity-without-semicolon
#document
| <html>
| <head>
| <body>
| "FOO>BAR"
#data
FOO&gt BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,6): named-entity-without-semicolon
#document
| <html>
| <head>
| <body>
| "FOO> BAR"
#data
FOO&gt;;;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO>;;BAR"
#data
I'm &notit; I tell you
#errors
(1,4): expected-doctype-but-got-chars
(1,9): named-entity-without-semicolon
#document
| <html>
| <head>
| <body>
| "I'm ¬it; I tell you"
#data
I'm &notin; I tell you
#errors
(1,4): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "I'm ∉ I tell you"
#data
FOO& BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO& BAR"
#data
FOO&<BAR>
#errors
(1,3): expected-doctype-but-got-chars
(1,9): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| "FOO&"
| <bar>
#data
FOO&&&&gt;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO&&&>BAR"
#data
FOO&#41;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO)BAR"
#data
FOO&#x41;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOABAR"
#data
FOO&#X41;BAR
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOABAR"
#data
FOO&#BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,5): expected-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO&#BAR"
#data
FOO&#ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,5): expected-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO&#ZOO"
#data
FOO&#xBAR
#errors
(1,3): expected-doctype-but-got-chars
(1,7): expected-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOºR"
#data
FOO&#xZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,6): expected-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO&#xZOO"
#data
FOO&#XZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,6): expected-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO&#XZOO"
#data
FOO&#41BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,7): numeric-entity-without-semicolon
#document
| <html>
| <head>
| <body>
| "FOO)BAR"
#data
FOO&#x41BAR
#errors
(1,3): expected-doctype-but-got-chars
(1,10): numeric-entity-without-semicolon
#document
| <html>
| <head>
| <body>
| "FOO䆺R"
#data
FOO&#x41ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,8): numeric-entity-without-semicolon
#document
| <html>
| <head>
| <body>
| "FOOAZOO"
#data
FOO&#x0000;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#x0078;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOxZOO"
#data
FOO&#x0079;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOyZOO"
#data
FOO&#x0080;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO€ZOO"
#data
FOO&#x0081;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0082;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0083;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOƒZOO"
#data
FOO&#x0084;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO„ZOO"
#data
FOO&#x0085;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO…ZOO"
#data
FOO&#x0086;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO†ZOO"
#data
FOO&#x0087;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO‡ZOO"
#data
FOO&#x0088;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOˆZOO"
#data
FOO&#x0089;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO‰ZOO"
#data
FOO&#x008A;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOŠZOO"
#data
FOO&#x008B;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x008C;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOŒZOO"
#data
FOO&#x008D;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x008E;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOŽZOO"
#data
FOO&#x008F;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0090;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0091;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0092;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0093;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO“ZOO"
#data
FOO&#x0094;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO”ZOO"
#data
FOO&#x0095;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO•ZOO"
#data
FOO&#x0096;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x0097;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO—ZOO"
#data
FOO&#x0098;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO˜ZOO"
#data
FOO&#x0099;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO™ZOO"
#data
FOO&#x009A;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOšZOO"
#data
FOO&#x009B;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x009C;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOœZOO"
#data
FOO&#x009D;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x009E;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOžZOO"
#data
FOO&#x009F;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOOŸZOO"
#data
FOO&#x00A0;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO ZOO"
#data
FOO&#xD7FF;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO퟿ZOO"
#data
FOO&#xD800;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xD801;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xDFFE;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xDFFF;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,11): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xE000;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOOZOO"
#data
FOO&#x10FFFE;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO􏿾ZOO"
#data
FOO&#x1087D4;ZOO
#errors
(1,3): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "FOO􈟔ZOO"
#data
FOO&#x10FFFF;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO􏿿ZOO"
#data
FOO&#x110000;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#xFFFFFF;ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#11111111111
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
(1,13): eof-in-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>"
#data
FOO&#1111111111
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
(1,13): eof-in-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>"
#data
FOO&#111111111111
#errors
(1,3): expected-doctype-but-got-chars
(1,13): illegal-codepoint-for-numeric-entity
(1,13): eof-in-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>"
#data
FOO&#11111111111ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,16): numeric-entity-without-semicolon
(1,16): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#1111111111ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,15): numeric-entity-without-semicolon
(1,15): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"
#data
FOO&#111111111111ZOO
#errors
(1,3): expected-doctype-but-got-chars
(1,17): numeric-entity-without-semicolon
(1,17): illegal-codepoint-for-numeric-entity
#document
| <html>
| <head>
| <body>
| "FOO<4F>ZOO"

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save