|
|
|
from __future__ import absolute_import, division, unicode_literals

import io
import json
import os
import re
import sys

import html5lib

from . import support
from . import test_tokenizer
|
|
|
|
|
|
|
|
# Parser instance created once at import time and reused by every
# make_test() call, which parses with p.parse() and serializes the result
# with p.tree.testSerializer().
p = html5lib.HTMLParser()
|
|
|
|
|
|
|
|
# Multi-line pattern for an "<html NAME>" tag that follows a leading "|" and
# optional whitespace at the start of a line.  Group 1 is the "|" plus
# whitespace prefix, group 2 is everything between "<html " and ">".
_HTML_PREFIXED_TAG = re.compile(r"^(\|\s*)<html ([^>]+)>", re.MULTILINE)

# Bound ``sub`` of the pattern above; call it as
# ``unnamespaceExpected(replacement, text)``.
unnamespaceExpected = _HTML_PREFIXED_TAG.sub
|
|
|
|
|
|
|
|
|
|
|
|
def main(out_path):
    """Generate tree tests from every tokenizer test file.

    Each file yielded by ``support.get_data_files('tokenizer', '*.test')``
    is passed to ``run_file`` together with *out_path*.

    :arg out_path: directory the generated files are written into; it must
        already exist.

    Writes a message to stderr and exits the process with status 1 (via
    ``sys.exit``) when *out_path* does not exist.
    """
    if not os.path.exists(out_path):
        # Terminate the message with a newline, as the "Failed to load"
        # diagnostic in run_file does, so it does not run into whatever is
        # printed next on the terminal.
        sys.stderr.write("Path %s does not exist\n" % out_path)
        sys.exit(1)

    for filename in support.get_data_files('tokenizer', '*.test'):
        run_file(filename, out_path)
|
|
|
|
|
|
|
|
|
|
|
|
def run_file(filename, out_path):
    """Convert one tokenizer test file into ``tokenizer_<name>.dat``.

    :arg filename: path of a JSON tokenizer test file.
    :arg out_path: directory that receives ``tokenizer_<name>.dat``, where
        ``<name>`` is the base name of *filename* without its extension.

    If *filename* cannot be loaded as JSON, a message is written to stderr
    and no output file is created.  Otherwise the output file is always
    created; for each test under the ``'tests'`` key, one ``make_test``
    result is written per "Data state" entry in its ``initialStates`` (a
    test without that key is given ``["Data state"]``).  Other initial
    states are skipped.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform
        # default encoding, and close the handle deterministically.
        with io.open(filename, "r", encoding="utf-8") as test_file:
            tests_data = json.load(test_file)
    except ValueError:
        sys.stderr.write("Failed to load %s\n" % filename)
        return

    name = os.path.splitext(os.path.split(filename)[1])[0]
    output_name = os.path.join(out_path, "tokenizer_%s.dat" % name)

    # The context manager closes the output file even if make_test() or
    # write() raises part-way through.
    with io.open(output_name, "w", encoding="utf-8") as output_file:
        if 'tests' in tests_data:
            for test_data in tests_data['tests']:
                if 'initialStates' not in test_data:
                    test_data["initialStates"] = ["Data state"]

                for initial_state in test_data["initialStates"]:
                    if initial_state != "Data state":
                        # Other initial tokenizer states are not
                        # supported yet.
                        continue
                    output_file.write(make_test(test_data))
|
|
|
|
|
|
|
|
|
|
|
|
def make_test(test_data):
    """Build the text of one tree test from a single tokenizer test.

    :arg test_data: dict describing one tokenizer test; ``test_data["input"]``
        is the markup that gets parsed.  When the dict has a
        ``"doubleEscaped"`` key it is first passed through
        ``test_tokenizer.unescape_test``.

    :returns: text made of a ``#data`` line, the input, an ``#errors`` line
        and the re-indented serialized tree, joined by newlines and ending
        with a trailing newline.
    """
    if 'doubleEscaped' in test_data:
        test_data = test_tokenizer.unescape_test(test_data)

    source = test_data["input"]
    tree = p.parse(source)
    serialized = p.tree.testSerializer(tree)

    # Re-prefix serializer lines: drop the first three characters and put
    # "| " in front.
    # NOTE(review): the prefix test is two characters wide while three are
    # dropped - confirm the serializer always emits at least two spaces
    # after the bar.
    serialized = "\n".join(
        ("| " + line[3:]) if line.startswith("| ") else line
        for line in serialized.split("\n"))

    # Turn "<html NAME>" at the start of a tree line into "<NAME>".
    serialized = unnamespaceExpected(r"\1<\2>", serialized)

    # Keep every piece as text.  Joining UTF-8 encoded bytes with a text
    # separator raises TypeError on Python 3 and triggers an implicit ASCII
    # decode on Python 2; encoding belongs at the point the result is
    # written out.
    return "\n".join(["#data", source, "#errors", serialized, ""])
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the output
    # directory handed to main(); further arguments are ignored.
    if len(sys.argv) < 2:
        # Report a usage error instead of crashing with a bare IndexError.
        sys.stderr.write("Usage: %s OUT_PATH\n" % sys.argv[0])
        sys.exit(2)
    main(sys.argv[1])
|