bazarr/libs/unidecode/__init__.py

# -*- coding: utf-8 -*-
# vi:tabstop=4:expandtab:sw=4
"""Transliterate Unicode text into plain 7-bit ASCII.

Example usage:

>>> from unidecode import unidecode
>>> unidecode("\u5317\u4EB0")
"Bei Jing "

The transliteration uses a straightforward map, and doesn't have alternatives
for the same character based on language, position, or anything else.

A standard string object will be returned. If you need bytes, use:

>>> unidecode("Κνωσός").encode("ascii")
b'Knosos'
"""
import warnings
from typing import Dict, Optional, Sequence

Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]

class UnidecodeError(ValueError):
    def __init__(self, message: str, index: Optional[int] = None) -> None:
        """Raised for Unidecode-related errors.

        The index attribute contains the index of the character that caused
        the error.
        """
        super(UnidecodeError, self).__init__(message)
        self.index = index


def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate an Unicode object into an ASCII string

    >>> unidecode("\u5317\u4EB0")
    "Bei Jing "

    This function first tries to convert the string using ASCII codec.
    If it fails (because of non-ASCII characters), it falls back to
    transliteration using the character tables.

    This is approx. five times faster if the string only contains ASCII
    characters, but slightly slower than unicode_expect_nonascii if
    non-ASCII characters are present.

    errors specifies what to do with characters that have not been
    found in replacement tables. The default is 'ignore' which ignores
    the character. 'strict' raises an UnidecodeError. 'replace'
    substitutes the character with replace_str (default is '?').
    'preserve' keeps the original character.

    Note that if 'preserve' is used the returned string might not be
    ASCII!
    """

    try:
        bytestring = string.encode('ASCII')
    except UnicodeEncodeError:
        pass
    else:
        return string

    return _unidecode(string, errors, replace_str)

def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate an Unicode object into an ASCII string

    >>> unidecode("\u5317\u4EB0")
    "Bei Jing "

    See unidecode_expect_ascii.
    """

    return _unidecode(string, errors, replace_str)

unidecode = unidecode_expect_ascii

def _get_repl_str(char: str) -> Optional[str]:
    codepoint = ord(char)

    if codepoint < 0x80:
        # Already ASCII
        return str(char)

    if codepoint > 0xeffff:
        # No data on characters in Private Use Area and above.
        return None

    if 0xd800 <= codepoint <= 0xdfff:
        warnings.warn(  "Surrogate character %r will be ignored. "
                        "You might be using a narrow Python build." % (char,),
                        RuntimeWarning, 2)

    section = codepoint >> 8   # Chop off the last two hex digits
    position = codepoint % 256 # Last two hex digits

    try:
        table = Cache[section]
    except KeyError:
        try:
            mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
        except ImportError:
            # No data on this character
            Cache[section] = None
            return None

        Cache[section] = table = mod.data

    if table and len(table) > position:
        return table[position]
    else:
        return None

def _unidecode(string: str, errors: str, replace_str:str) -> str:
    retval = []

    for index, char in enumerate(string):
        repl = _get_repl_str(char)

        if repl is None:
            if errors == 'ignore':
                repl = ''
            elif errors == 'strict':
                raise UnidecodeError('no replacement found for character %r '
                        'in position %d' % (char, index), index)
            elif errors == 'replace':
                repl = replace_str
            elif errors == 'preserve':
                repl = char
            else:
                raise UnidecodeError('invalid value for errors parameter %r' % (errors,))

        retval.append(repl)

    return ''.join(retval)