bazarr/libs/future/utils/surrogateescape.py

"""
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
handler of Python 3.

Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
"""

# This code is released under the Python license and the BSD 2-clause license

import codecs
import sys

from future import utils


# Name of the codec error handler used by this module: it is passed as the
# ``errors`` argument in encodefilename()/decodefilename() and is the name
# that register_surrogateescape() looks up and registers.
FS_ERRORS = 'surrogateescape'

#     # -- Python 2/3 compatibility -------------------------------------
#     FS_ERRORS = 'my_surrogateescape'

def u(text):
    """
    Return ``text`` as a text (unicode) string.

    When ``utils.PY3`` is true the argument is returned untouched; otherwise
    it is decoded with the 'unicode_escape' codec so that escapes such as
    ``\\udcff`` written in a native string literal become real characters.
    """
    return text if utils.PY3 else text.decode('unicode_escape')

def b(data):
    """
    Return ``data`` as a byte string.

    When ``utils.PY3`` is true the argument is encoded with 'latin1', which
    maps each code point up to U+00FF to the byte of the same value;
    otherwise the argument is returned unchanged.
    """
    return data.encode('latin1') if utils.PY3 else data

# Pick the "ordinal -> one character" (_unichr) and "ordinal -> one byte"
# (bytes_chr) constructors for the running interpreter.
if not utils.PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        # bytes(int) would build a zero-filled buffer of that length, so the
        # ordinal is wrapped in a one-element tuple.
        return bytes((code,))

def surrogateescape_handler(exc):
    """
    Codec error callback implementing PEP 383 "surrogateescape" in pure
    Python.

    While decoding, undecodable bytes are replaced by U+DCxx characters;
    while encoding, those characters are translated back towards the
    original byte values.

    ``exc`` is the UnicodeDecodeError / UnicodeEncodeError supplied by the
    codec machinery. The result is a ``(replacement, exc.end)`` tuple. The
    incoming exception is re-raised when it is of neither type, or when the
    offending span cannot be converted (NotASurrogateError).
    """
    offending = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending span is a byte string in this case.
        convert = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii',
        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
        # exception anyway after this function is called, even though I think
        # it's doing what it should. It seems that the strict encoder is
        # called to encode the unicode string that this function returns ...
        convert = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = convert(offending)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)


class NotASurrogateError(Exception):
    """Raised internally when a character or byte cannot be surrogate-escaped.

    surrogateescape_handler() catches it and re-raises the original codec
    error instead.
    """


def replace_surrogate_encode(mystring):
    """
    Map escape characters U+DC00..U+DCFF back to the characters
    U+0000..U+00FF (i.e. ``code - 0xDC00``).

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Raises NotASurrogateError if any character of ``mystring`` lies outside
    U+DC00..U+DCFF, so that the caller can fail with the original exception.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Anything outside U+DC00..U+DCFF cannot be mapped back to a byte
        # value. This explicitly includes the high surrogates
        # U+D800..U+DBFF: subtracting 0xDC00 from them would yield a negative
        # ordinal and make _unichr() raise ValueError instead of letting the
        # handler re-raise the original encoding error.
        if not 0xDC00 <= code <= 0xDCFF:
            # Not an escape surrogate. Fail with the original exception.
            raise NotASurrogateError

        # NOTE: U+DC00..U+DC7F (which map to ASCII) are accepted here for
        # backward compatibility, although PEP 383 itself only escapes bytes
        # 0x80..0xFF.
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)


def replace_surrogate_decode(mybytes):
    """
    Convert a run of bytes to a (unicode) string, escaping the high ones.

    Values 0x80..0xFF become the characters U+DC80..U+DCFF; values up to
    0x7F become the character with the same ordinal. Any value above 0xFF
    raises NotASurrogateError.
    """
    pieces = []
    for item in mybytes:
        # Iterating newbytes yields ints, while iterating a native Py2 str
        # yields one-character strings that still need ord().
        code = item if isinstance(item, int) else ord(item)
        if code > 0xFF:
            raise NotASurrogateError
        if code >= 0x80:
            pieces.append(_unichr(0xDC00 + code))
        else:
            pieces.append(_unichr(code))
    return "".join(pieces)


def encodefilename(fn):
    """
    Encode the text filename ``fn`` to bytes according to FS_ENCODING.

    For 'ascii' and 'utf-8' the string is encoded by hand, one character at
    a time, turning U+DC80..U+DCFF back into the bytes 0x80..0xFF. Every
    other encoding is delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    Raises UnicodeEncodeError for a non-ASCII, non-escape character in the
    'ascii' case, and for a surrogate outside U+DC80..U+DCFF in the 'utf-8'
    case.
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        chunks = []
        for pos, char in enumerate(fn):
            point = ord(char)
            if point < 128:
                chunks.append(bytes_chr(point))
            elif 0xDC80 <= point <= 0xDCFF:
                chunks.append(bytes_chr(point - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, pos, pos + 1,
                    'ordinal not in range(128)')
        return bytes().join(chunks)

    if FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        chunks = []
        for pos, char in enumerate(fn):
            point = ord(char)
            if 0xDC80 <= point <= 0xDCFF:
                # Escaped byte: emit the raw byte it stands for.
                chunks.append(bytes_chr(point - 0xDC00))
            elif 0xD800 <= point <= 0xDFFF:
                # Any other surrogate has no byte equivalent.
                raise UnicodeEncodeError(
                    FS_ENCODING, fn, pos, pos + 1,
                    'surrogates not allowed')
            else:
                chunks.append(char.encode('utf-8'))
        return bytes().join(chunks)

    return fn.encode(FS_ENCODING, FS_ERRORS)

def decodefilename(fn):
    """
    Decode the byte-string filename ``fn`` using FS_ENCODING, with FS_ERRORS
    as the error handler name.
    """
    text = fn.decode(FS_ENCODING, FS_ERRORS)
    return text

# Encoding used by encodefilename()/decodefilename(), together with a sample
# byte-string filename (fn) and the text it is expected to decode to
# (encoded); the samples are only referenced by the commented-out self-test
# at the bottom of this module.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative encoding/sample combinations:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the encoding name through the codec registry so that the string
# comparisons in encodefilename() see, for example, "utf-8" and not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name


def register_surrogateescape():
    """
    Install surrogateescape_handler under the name FS_ERRORS, on Python 2
    only.

    Does nothing when ``utils.PY3`` is true, or when an error handler with
    that name can already be looked up.
    """
    if not utils.PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler with this name yet: provide the pure-Python one.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)


if __name__ == '__main__':
    # Running the module directly currently does nothing. The round-trip
    # self-test below (decode the sample `fn`, compare with `encoded`, then
    # encode back) is kept for reference but is commented out. Note that it
    # would rebind the name `b`, shadowing the b() helper defined above.
    pass
    # # Tests:
    # register_surrogateescape()

    # b = decodefilename(fn)
    # assert b == encoded, "%r != %r" % (b, encoded)
    # c = encodefilename(b)
    # assert c == fn, '%r != %r' % (c, fn)
    # # print("ok")
Subsync first implementation (only after download/upload). 5 years ago			`"""`
			`This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error`
			`handler of Python 3.`

			`Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc`
			`"""`

			`# This code is released under the Python license and the BSD 2-clause license`

			`import codecs`
			`import sys`

			`from future import utils`


			`FS_ERRORS = 'surrogateescape'`

			`# # -- Python 2/3 compatibility -------------------------------------`
			`# FS_ERRORS = 'my_surrogateescape'`

			`def u(text):`
			`if utils.PY3:`
			`return text`
			`else:`
			`return text.decode('unicode_escape')`

			`def b(data):`
			`if utils.PY3:`
			`return data.encode('latin1')`
			`else:`
			`return data`

			`if utils.PY3:`
			`_unichr = chr`
			`bytes_chr = lambda code: bytes((code,))`
			`else:`
			`_unichr = unichr`
			`bytes_chr = chr`

			`def surrogateescape_handler(exc):`
			`"""`
			`Pure Python implementation of the PEP 383: the "surrogateescape" error`
			`handler of Python 3. Undecodable bytes will be replaced by a Unicode`
			`character U+DCxx on decoding, and these are translated into the`
			`original bytes on encoding.`
			`"""`
			`mystring = exc.object[exc.start:exc.end]`

			`try:`
			`if isinstance(exc, UnicodeDecodeError):`
			`# mystring is a byte-string in this case`
			`decoded = replace_surrogate_decode(mystring)`
			`elif isinstance(exc, UnicodeEncodeError):`
			`# In the case of u'\udcc3'.encode('ascii',`
			`# 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an`
			`# exception anyway after this function is called, even though I think`
			`# it's doing what it should. It seems that the strict encoder is called`
			`# to encode the unicode string that this function returns ...`
			`decoded = replace_surrogate_encode(mystring)`
			`else:`
			`raise exc`
			`except NotASurrogateError:`
			`raise exc`
			`return (decoded, exc.end)`


			`class NotASurrogateError(Exception):`
			`pass`


			`def replace_surrogate_encode(mystring):`
			`"""`
			`Returns a (unicode) string, not the more logical bytes, because the codecs`
			`register_error functionality expects this.`
			`"""`
			`decoded = []`
			`for ch in mystring:`
			`# if utils.PY3:`
			`# code = ch`
			`# else:`
			`code = ord(ch)`

			`# The following magic comes from Py3.3's Python/codecs.c file:`
			`if not 0xD800 <= code <= 0xDCFF:`
			`# Not a surrogate. Fail with the original exception.`
			`raise NotASurrogateError`
			`# mybytes = [0xe0 \| (code >> 12),`
			`# 0x80 \| ((code >> 6) & 0x3f),`
			`# 0x80 \| (code & 0x3f)]`
			`# Is this a good idea?`
			`if 0xDC00 <= code <= 0xDC7F:`
			`decoded.append(_unichr(code - 0xDC00))`
			`elif code <= 0xDCFF:`
			`decoded.append(_unichr(code - 0xDC00))`
			`else:`
			`raise NotASurrogateError`
			`return str().join(decoded)`


			`def replace_surrogate_decode(mybytes):`
			`"""`
			`Returns a (unicode) string`
			`"""`
			`decoded = []`
			`for ch in mybytes:`
			`# We may be parsing newbytes (in which case ch is an int) or a native`
			`# str on Py2`
			`if isinstance(ch, int):`
			`code = ch`
			`else:`
			`code = ord(ch)`
			`if 0x80 <= code <= 0xFF:`
			`decoded.append(_unichr(0xDC00 + code))`
			`elif code <= 0x7F:`
			`decoded.append(_unichr(code))`
			`else:`
			`# # It may be a bad byte`
			`# # Try swallowing it.`
			`# continue`
			`# print("RAISE!")`
			`raise NotASurrogateError`
			`return str().join(decoded)`


			`def encodefilename(fn):`
			`if FS_ENCODING == 'ascii':`
			`# ASCII encoder of Python 2 expects that the error handler returns a`
			`# Unicode string encodable to ASCII, whereas our surrogateescape error`
			`# handler has to return bytes in 0x80-0xFF range.`
			`encoded = []`
			`for index, ch in enumerate(fn):`
			`code = ord(ch)`
			`if code < 128:`
			`ch = bytes_chr(code)`
			`elif 0xDC80 <= code <= 0xDCFF:`
			`ch = bytes_chr(code - 0xDC00)`
			`else:`
			`raise UnicodeEncodeError(FS_ENCODING,`
			`fn, index, index+1,`
			`'ordinal not in range(128)')`
			`encoded.append(ch)`
			`return bytes().join(encoded)`
			`elif FS_ENCODING == 'utf-8':`
			`# UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF`
			`# doesn't go through our error handler`
			`encoded = []`
			`for index, ch in enumerate(fn):`
			`code = ord(ch)`
			`if 0xD800 <= code <= 0xDFFF:`
			`if 0xDC80 <= code <= 0xDCFF:`
			`ch = bytes_chr(code - 0xDC00)`
			`encoded.append(ch)`
			`else:`
			`raise UnicodeEncodeError(`
			`FS_ENCODING,`
			`fn, index, index+1, 'surrogates not allowed')`
			`else:`
			`ch_utf8 = ch.encode('utf-8')`
			`encoded.append(ch_utf8)`
			`return bytes().join(encoded)`
			`else:`
			`return fn.encode(FS_ENCODING, FS_ERRORS)`

			`def decodefilename(fn):`
			`return fn.decode(FS_ENCODING, FS_ERRORS)`

			`FS_ENCODING = 'ascii'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')`
			`# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')`
			`# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')`


			`# normalize the filesystem encoding name.`
			`# For example, we expect "utf-8", not "UTF8".`
			`FS_ENCODING = codecs.lookup(FS_ENCODING).name`


			`def register_surrogateescape():`
			`"""`
			`Registers the surrogateescape error handler on Python 2 (only)`
			`"""`
			`if utils.PY3:`
			`return`
			`try:`
			`codecs.lookup_error(FS_ERRORS)`
			`except LookupError:`
			`codecs.register_error(FS_ERRORS, surrogateescape_handler)`


			`if __name__ == '__main__':`
			`pass`
			`# # Tests:`
			`# register_surrogateescape()`

			`# b = decodefilename(fn)`
			`# assert b == encoded, "%r != %r" % (b, encoded)`
			`# c = encodefilename(b)`
			`# assert c == fn, '%r != %r' % (c, fn)`
			`# # print("ok")`