|
|
"""
|
|
|
A script to make the char_classes.dat file.
|
|
|
|
|
|
This never needs to run in normal usage. It needs to be run if the character
|
|
|
classes we care about change, or if a new version of Python supports a new
|
|
|
Unicode standard and we want it to affect our string decoding.
|
|
|
|
|
|
The file that we generate is based on Unicode 9.0, as supported by Python 3.6.
|
|
|
You can certainly use it in earlier versions. This simply makes sure that we
|
|
|
get consistent results from running ftfy on different versions of Python.
|
|
|
|
|
|
The file will be written to the current directory.
|
|
|
"""
|
|
|
from __future__ import unicode_literals
|
|
|
import unicodedata
|
|
|
import sys
|
|
|
import zlib
|
|
|
# Python 3 dropped `unichr`; its built-in `chr` already accepts any Unicode
# codepoint, so alias it to keep the rest of this script version-agnostic.
if sys.version_info[0] >= 3:
    unichr = chr
|
|
|
|
|
|
# L = Latin capital letter
|
|
|
# l = Latin lowercase letter
|
|
|
# A = Non-latin capital or title-case letter
|
|
|
# a = Non-latin lowercase letter
|
|
|
# C = Non-cased letter (Lo)
|
|
|
# X = Control character (Cc)
|
|
|
# m = Letter modifier (Lm)
|
|
|
# M = Mark (Mc, Me, Mn)
|
|
|
# N = Miscellaneous numbers (No)
|
|
|
# P = Private use (Co)
|
|
|
# 1 = Math symbol (Sm) or currency symbol (Sc)
|
|
|
# 2 = Symbol modifier (Sk)
|
|
|
# 3 = Other symbol (So)
|
|
|
# S = UTF-16 surrogate
|
|
|
# _ = Unassigned character
|
|
|
# ' ' (a space) = Whitespace (Z* categories, plus tab, LF, FF and CR)
|
|
|
# i = IPA symbol or modifier (U+0250 to U+02FF, except the schwa)
# o = Other
|
|
|
|
|
|
|
|
|
def make_char_data_file(do_it_anyway=False):
    """
    Build the compressed data file 'char_classes.dat' and write it to the
    current directory.

    The file contains one ASCII class character per codepoint, for every
    codepoint from U+0000 through U+10FFFF in order, compressed with zlib.

    If you run this, run it in Python 3.6 or later. It will run in earlier
    versions, but you won't get the Unicode 9 standard, leading to inconsistent
    behavior.

    To protect against this, running this in the wrong version of Python will
    raise an error unless you pass `do_it_anyway=True`.

    Raises:
        RuntimeError: if run on a Python older than 3.6 without
            `do_it_anyway=True`.
        ValueError: if a non-Latin letter has a general category other than
            Lu, Lt, Ll, Lo, or Lm.
    """
    if sys.hexversion < 0x03060000 and not do_it_anyway:
        raise RuntimeError(
            "This function should be run in Python 3.6 or later."
        )

    # One class character per codepoint, indexed by codepoint.
    cclasses = [
        _classify_codepoint(codepoint)
        for codepoint in range(0x0, 0x110000)
    ]

    # Mark whitespace control characters (tab, LF, FF, CR) as whitespace
    cclasses[9] = cclasses[10] = cclasses[12] = cclasses[13] = ' '

    # Some other exceptions for characters that are more commonly used as
    # punctuation or decoration than for their ostensible purpose.
    # For example, tilde is not usually a "math symbol", and the accents
    # `´ are much more like quotation marks than modifiers.
    #
    # NOTE(review): '^' and '`' each appear twice in this string. That is
    # harmless, but the duplicates may be mangled forms of other characters
    # (e.g. fullwidth variants) -- confirm against the upstream source.
    for char in "^~`´˝^`":
        cclasses[ord(char)] = 'o'

    # Build the complete payload before touching the filesystem, so that a
    # failure here cannot leave an empty or truncated file behind.
    payload = zlib.compress(''.join(cclasses).encode('ascii'))

    # The context manager closes the file even if the write raises.
    with open('char_classes.dat', 'wb') as out:
        out.write(payload)


def _classify_codepoint(codepoint):
    """
    Return the one-character class code for a single codepoint.

    The checks are order-sensitive: earlier, more specific rules (the IPA
    block, variation selectors, skin-tone modifiers) deliberately win over
    the general-category rules that follow them.

    Raises ValueError if a non-Latin letter has a general category other
    than Lu, Lt, Ll, Lo, or Lm.
    """
    char = unichr(codepoint)
    category = unicodedata.category(char)

    if (0x250 <= codepoint < 0x300) and char != 'ə':
        # IPA symbols and modifiers.
        #
        # This category excludes the schwa (ə), which is used as a normal
        # Latin letter in some languages.
        return 'i'

    if category.startswith('L'):  # letters
        if unicodedata.name(char, '').startswith('LATIN'):
            # Any Latin letter that is not uppercase counts as lowercase.
            return 'L' if category == 'Lu' else 'l'
        if category == 'Lu' or category == 'Lt':
            return 'A'
        if category == 'Ll':
            return 'a'
        if category == 'Lo':
            return 'C'
        if category == 'Lm':
            return 'm'
        raise ValueError('got some weird kind of letter')

    if 0xfe00 <= codepoint <= 0xfe0f or 0x1f3fb <= codepoint <= 0x1f3ff:
        # Variation selectors and skin-tone modifiers have the category
        # of non-spacing marks, but they act like symbols
        return '3'

    if category.startswith('M'):  # marks
        return 'M'
    if category == 'No':
        return 'N'
    if category == 'Sm' or category == 'Sc':
        return '1'
    if category == 'Sk':
        return '2'
    if category == 'So':
        return '3'
    if category == 'Cc':
        return 'X'
    if category == 'Cs':
        return 'S'
    if category == 'Co':
        return 'P'
    if category.startswith('Z'):
        return ' '

    if 0x1f000 <= codepoint <= 0x1ffff:
        # This range is rapidly having emoji added to it. Assume that
        # an unassigned codepoint in this range is just a symbol we
        # don't know yet.
        return '3'

    if category == 'Cn':
        return '_'
    return 'o'
|
|
|
|
|
|
if __name__ == "__main__":
    # Running this file as a script regenerates 'char_classes.dat' in the
    # current directory, with the Python version check enabled.
    make_char_data_file()
|