from __future__ import absolute_import, division, unicode_literals import os import pytest from .support import get_data_files, test_dir, errorMessage, TestData as _TestData from html5lib import HTMLParser, _inputstream def test_basic_prescan_length(): data = "Caf\u00E9".encode('utf-8') pad = 1024 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 1024 # Sanity stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False) assert 'utf-8' == stream.charEncoding[0].name def test_parser_reparse(): data = "Caf\u00E9".encode('utf-8') pad = 10240 - len(data) + 1 data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") assert len(data) == 10240 # Sanity stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False) assert 'windows-1252' == stream.charEncoding[0].name p = HTMLParser(namespaceHTMLElements=False) doc = p.parse(data, useChardet=False) assert 'utf-8' == p.documentEncoding assert doc.find(".//title").text == "Caf\u00E9" @pytest.mark.parametrize("expected,data,kwargs", [ ("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}), ("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}), ("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}), ("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}), ("iso-8859-2", b"", {"transport_encoding": "iso-8859-2"}), ("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-3"}), ("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}), ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}), ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}), ("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}), ("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}), ("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}), ("windows-1252", b"", {"default_encoding": "totally-bogus-string"}), ("windows-1252", b"", {}), ]) def test_parser_args(expected, data, kwargs): stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs) assert expected == stream.charEncoding[0].name p = HTMLParser() p.parse(data, useChardet=False, **kwargs) assert expected == p.documentEncoding @pytest.mark.parametrize("kwargs", [ {"override_encoding": "iso-8859-2"}, {"override_encoding": None}, {"transport_encoding": "iso-8859-2"}, {"transport_encoding": None}, {"same_origin_parent_encoding": "iso-8859-2"}, {"same_origin_parent_encoding": None}, {"likely_encoding": "iso-8859-2"}, {"likely_encoding": None}, {"default_encoding": "iso-8859-2"}, {"default_encoding": None}, {"foo_encoding": "iso-8859-2"}, {"foo_encoding": None}, ]) def test_parser_args_raises(kwargs): with pytest.raises(TypeError) as exc_info: p = HTMLParser() p.parse("", useChardet=False, **kwargs) assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input") def param_encoding(): for filename in get_data_files("encoding"): tests = _TestData(filename, b"data", encoding=None) for test in tests: yield test[b'data'], test[b'encoding'] @pytest.mark.parametrize("data, encoding", param_encoding()) def test_parser_encoding(data, encoding): p = HTMLParser() assert p.documentEncoding is None p.parse(data, useChardet=False) encoding = encoding.lower().decode("ascii") assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding) @pytest.mark.parametrize("data, encoding", param_encoding()) def test_prescan_encoding(data, encoding): stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False) encoding = encoding.lower().decode("ascii") # Very crude way to ignore irrelevant tests if len(data) > stream.numBytesMeta: return assert encoding == stream.charEncoding[0].name, errorMessage(data, encoding, stream.charEncoding[0].name) # pylint:disable=wrong-import-position try: import chardet # noqa except ImportError: print("chardet not found, skipping chardet tests") else: def test_chardet(): with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp: encoding = _inputstream.HTMLInputStream(fp.read()).charEncoding assert encoding[0].name == "big5" # pylint:enable=wrong-import-position