You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
139 lines
4.1 KiB
139 lines
4.1 KiB
1 year ago
|
# -*- coding: utf-8 -*-
|
||
|
# vi:tabstop=4:expandtab:sw=4
|
||
|
"""Transliterate Unicode text into plain 7-bit ASCII.
|
||
|
|
||
|
Example usage:
|
||
|
|
||
|
>>> from unidecode import unidecode
|
||
|
>>> unidecode("\u5317\u4EB0")
|
||
|
'Bei Jing '
|
||
|
|
||
|
The transliteration uses a straightforward map, and doesn't have alternatives
|
||
|
for the same character based on language, position, or anything else.
|
||
|
|
||
|
A standard string object will be returned. If you need bytes, use:
|
||
|
|
||
|
>>> unidecode("Κνωσός").encode("ascii")
|
||
|
b'Knosos'
|
||
|
"""
|
||
|
import warnings
|
||
|
from typing import Dict, Optional, Sequence
|
||
|
|
||
|
# Transliteration tables that have already been looked up, keyed by section
# number (the code point shifted right by 8 bits). A value of None records
# that importing the data module for that section failed, so the import is
# not retried.
Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
|
||
|
|
||
|
class UnidecodeError(ValueError):
    def __init__(self, message: str, index: Optional[int] = None) -> None:
        """Signal a Unidecode-related failure.

        message is forwarded to ValueError as the exception text.

        index is stored on the instance as the ``index`` attribute and
        identifies the position of the character that caused the error;
        it is None when no position was supplied.
        """
        super().__init__(message)
        self.index = index
|
||
|
|
||
|
|
||
|
def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate a Unicode string into an ASCII string.

    >>> unidecode("\u5317\u4EB0")
    'Bei Jing '

    This function first tries to convert the string using the ASCII codec.
    If that fails (because of non-ASCII characters), it falls back to
    transliteration using the character tables.

    This is approx. five times faster if the string only contains ASCII
    characters, but slightly slower than unidecode_expect_nonascii if
    non-ASCII characters are present.

    errors specifies what to do with characters that have not been
    found in replacement tables. The default is 'ignore' which ignores
    the character. 'strict' raises an UnidecodeError. 'replace'
    substitutes the character with replace_str (default is '?').
    'preserve' keeps the original character.

    Note that if 'preserve' is used the returned string might not be
    ASCII!

    When the input is already pure ASCII it is returned unchanged (the
    very same object), and errors / replace_str are not consulted.
    """

    try:
        # Only success or failure of the encode matters; the resulting
        # bytes are intentionally discarded.
        string.encode('ASCII')
    except UnicodeEncodeError:
        # Fall through so the transliteration runs outside this handler
        # and any error it raises is not chained to the encode failure.
        pass
    else:
        return string

    return _unidecode(string, errors, replace_str)
|
||
|
|
||
|
def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate a Unicode string into an ASCII string.

    >>> unidecode_expect_nonascii("\u5317\u4EB0")
    'Bei Jing '

    The input is handed straight to the table-based transliteration,
    without first trying the ASCII codec. The errors and replace_str
    arguments are passed through unchanged; see unidecode_expect_ascii
    for their meaning.
    """
    transliterated = _unidecode(string, errors, replace_str)
    return transliterated
|
||
|
|
||
|
# `unidecode` is an alias of unidecode_expect_ascii, i.e. the variant that
# tries the ASCII codec first before falling back to the tables.
unidecode = unidecode_expect_ascii
|
||
|
|
||
|
def _get_repl_str(char: str) -> Optional[str]:
|
||
|
codepoint = ord(char)
|
||
|
|
||
|
if codepoint < 0x80:
|
||
|
# Already ASCII
|
||
|
return str(char)
|
||
|
|
||
|
if codepoint > 0xeffff:
|
||
|
# No data on characters in Private Use Area and above.
|
||
|
return None
|
||
|
|
||
|
if 0xd800 <= codepoint <= 0xdfff:
|
||
|
warnings.warn( "Surrogate character %r will be ignored. "
|
||
|
"You might be using a narrow Python build." % (char,),
|
||
|
RuntimeWarning, 2)
|
||
|
|
||
|
section = codepoint >> 8 # Chop off the last two hex digits
|
||
|
position = codepoint % 256 # Last two hex digits
|
||
|
|
||
|
try:
|
||
|
table = Cache[section]
|
||
|
except KeyError:
|
||
|
try:
|
||
|
mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
|
||
|
except ImportError:
|
||
|
# No data on this character
|
||
|
Cache[section] = None
|
||
|
return None
|
||
|
|
||
|
Cache[section] = table = mod.data
|
||
|
|
||
|
if table and len(table) > position:
|
||
|
return table[position]
|
||
|
else:
|
||
|
return None
|
||
|
|
||
|
def _unidecode(string: str, errors: str, replace_str:str) -> str:
|
||
|
retval = []
|
||
|
|
||
|
for index, char in enumerate(string):
|
||
|
repl = _get_repl_str(char)
|
||
|
|
||
|
if repl is None:
|
||
|
if errors == 'ignore':
|
||
|
repl = ''
|
||
|
elif errors == 'strict':
|
||
|
raise UnidecodeError('no replacement found for character %r '
|
||
|
'in position %d' % (char, index), index)
|
||
|
elif errors == 'replace':
|
||
|
repl = replace_str
|
||
|
elif errors == 'preserve':
|
||
|
repl = char
|
||
|
else:
|
||
|
raise UnidecodeError('invalid value for errors parameter %r' % (errors,))
|
||
|
|
||
|
retval.append(repl)
|
||
|
|
||
|
return ''.join(retval)
|