You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
314 lines
9.2 KiB
314 lines
9.2 KiB
6 years ago
|
# -*- coding: utf-8 -*-
|
||
|
# config.py
|
||
|
# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors
|
||
|
#
|
||
|
# This module is part of GitPython and is released under
|
||
|
# the BSD License: http://www.opensource.org/licenses/bsd-license.php
|
||
|
"""utilities to help provide compatibility with python 3"""
|
||
|
# flake8: noqa
|
||
|
|
||
|
import locale
|
||
|
import os
|
||
|
import sys
|
||
|
import codecs
|
||
|
|
||
|
|
||
|
from gitdb.utils.compat import (
|
||
|
xrange,
|
||
|
MAXSIZE, # @UnusedImport
|
||
|
izip, # @UnusedImport
|
||
|
)
|
||
|
from gitdb.utils.encoding import (
|
||
|
string_types, # @UnusedImport
|
||
|
text_type, # @UnusedImport
|
||
|
force_bytes, # @UnusedImport
|
||
|
force_text # @UnusedImport
|
||
|
)
|
||
|
|
||
|
|
||
|
# True when the running interpreter is Python 3 or newer.
PY3 = sys.version_info[0] >= 3
# Coarse operating-system family flags derived from ``os.name``.
is_win = (os.name == 'nt')
is_posix = (os.name == 'posix')
# ``os.name`` never takes the value 'darwin' (macOS reports 'posix'), so the
# platform identifier from ``sys.platform`` has to be consulted instead.
is_darwin = (sys.platform == 'darwin')
# Default string encoding reported by the interpreter.
defenc = sys.getdefaultencoding()
|
||
|
|
||
|
if not PY3:
    # Python 2: the builtins already operate on byte strings.
    FileType = file  # @UndefinedVariable on PY3
    # The default encoding is usually plain ascii, which might not be enough
    # for our encoding needs. Unless it was set specifically, override it
    # to be utf-8.
    if defenc == 'ascii':
        defenc = 'utf-8'
    byte_ord = ord
    bchr = chr
    unicode = unicode
    binary_type = str
    range = xrange  # @ReservedAssignment

    def mviter(d):
        """Return ``d.itervalues()``"""
        return d.itervalues()
else:
    # Python 3: provide the names Python 2 code expects.
    import io
    FileType = io.IOBase

    unicode = str
    binary_type = bytes
    range = xrange  # @ReservedAssignment

    def byte_ord(b):
        """Return ``b`` unchanged"""
        return b

    def bchr(n):
        """Return a bytes object holding the single byte value ``n``"""
        return bytes([n])

    def mviter(d):
        """Return ``d.values()``"""
        return d.values()
|
||
|
|
||
|
|
||
|
def safe_decode(s):
    """Return ``s`` as text, decoding byte strings with ``defenc``.

    Text is handed back untouched, bytes are decoded with the
    'surrogateescape' error handler and ``None`` yields ``None``.

    :raise TypeError: if ``s`` is neither text, bytes nor ``None``"""
    if s is None:
        return None
    if isinstance(s, unicode):
        return s
    if isinstance(s, bytes):
        return s.decode(defenc, 'surrogateescape')
    raise TypeError('Expected bytes or text, but got %r' % (s,))
|
||
|
|
||
|
|
||
|
def safe_encode(s):
    """Safely encodes a unicode string to a binary string.

    Text is encoded with ``defenc``, bytes are returned unchanged and
    ``None`` yields ``None``.

    :raise TypeError: if ``s`` is neither text, bytes nor ``None``"""
    if isinstance(s, unicode):
        return s.encode(defenc)
    elif isinstance(s, bytes):
        return s
    elif s is not None:
        raise TypeError('Expected bytes or text, but got %r' % (s,))
    # ``None`` is passed through so optional values need no special casing.
    return None
|
||
|
|
||
|
|
||
|
def win_encode(s):
    """Encode text for use as process arguments on Windows.

    Text is encoded with ``locale.getpreferredencoding(False)``, bytes are
    returned unchanged and ``None`` yields ``None``.

    :raise TypeError: if ``s`` is neither text, bytes nor ``None``"""
    if s is None:
        return None
    if isinstance(s, unicode):
        target_encoding = locale.getpreferredencoding(False)
        return s.encode(target_encoding)
    if isinstance(s, bytes):
        return s
    raise TypeError('Expected bytes or text, but got %r' % (s,))
|
||
|
|
||
|
|
||
|
def with_metaclass(meta, *bases):
    """Return a temporary helper base class that applies ``meta`` as metaclass.

    copied from https://github.com/Byron/bcore/blob/master/src/python/butility/future.py#L15

    Deriving from the returned helper creates the new class by calling
    ``meta(name, bases, d)``, so the helper itself is not among its bases.

    :param meta: the metaclass to apply
    :param bases: the base classes the final class should have
    :return: helper class named ``meta.__name__ + 'Helper'``"""
    class metaclass(meta):
        __call__ = type.__call__
        __init__ = type.__init__

        def __new__(cls, name, nbases, d):
            if nbases is None:
                # Only the helper itself is created with ``nbases=None``,
                # see the return statement of with_metaclass below.
                return type.__new__(cls, name, (), d)
            # There may be clients who rely on this attribute to be set to a reasonable value, which is why
            # we set the __metaclass__ attribute explicitly - but only if the class body did not provide one
            if not PY3 and '__metaclass__' not in d:
                d['__metaclass__'] = meta
            return meta(name, bases, d)

    return metaclass(meta.__name__ + 'Helper', None, {})
|
||
|
|
||
|
|
||
|
## From https://docs.python.org/3.3/howto/pyporting.html
|
||
|
class UnicodeMixin(object):

    """Mixin providing ``__str__`` on top of a ``__unicode__`` method that
    the inheriting class has to supply, for both Python 2 and Python 3."""

    def __str__(self):
        """Return ``__unicode__()`` as is on Python 3, and encoded with
        ``defenc`` on Python 2."""
        text = self.__unicode__()
        return text if PY3 else text.encode(defenc)
|
||
|
|
||
|
|
||
|
"""
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
handler of Python 3.
Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
"""

# This code is released under the Python license and the BSD 2-clause license


# Name of the codec error handler passed to ``encode``/``decode`` by the
# filename helpers and registered by ``register_surrogateescape``.
FS_ERRORS = 'surrogateescape'

# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'
|
||
|
|
||
|
def u(text):
    """Return ``text`` unchanged on Python 3; on Python 2 decode it with
    the 'unicode_escape' codec."""
    return text if PY3 else text.decode('unicode_escape')
|
||
|
|
||
|
def b(data):
    """Return ``data`` encoded as 'latin1' on Python 3; on Python 2 return
    it unchanged."""
    return data.encode('latin1') if PY3 else data
|
||
|
|
||
|
if not PY3:
    # Python 2 builtins: ``unichr`` builds text, ``chr`` builds a byte string.
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr

    def bytes_chr(code):
        """Return a bytes object holding the single byte value ``code``"""
        return bytes((code,))
|
||
|
|
||
|
def surrogateescape_handler(exc):
    """
    Codec error handler implementing PEP 383 ("surrogateescape") in pure
    Python. When decoding, bytes that cannot be decoded become U+DCxx
    characters; when encoding, those characters are turned back into the
    original bytes.

    :param exc: the UnicodeDecodeError or UnicodeEncodeError being handled
    :return: tuple of (replacement string, position at which to resume)
    :raise: ``exc`` itself if it is of another type or cannot be handled
    """
    offending = exc.object[exc.start:exc.end]

    try:
        if isinstance(exc, UnicodeDecodeError):
            # the offending slice is a byte-string here
            replacement = replace_surrogate_decode(offending)
        elif isinstance(exc, UnicodeEncodeError):
            # Note: for u'\udcc3'.encode('ascii', <this handler>) both
            # Python 2.x and 3.x still raise after this function ran. The
            # strict encoder appears to be applied to the unicode string
            # returned from here.
            replacement = replace_surrogate_encode(offending, exc)
        else:
            raise exc
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
|
||
|
|
||
|
|
||
|
class NotASurrogateError(Exception):
    """Raised by the surrogate replacement helpers for values they cannot
    translate."""
|
||
|
|
||
|
|
||
|
def replace_surrogate_encode(mystring, exc):
    """
    Translate the escape characters U+DC00..U+DCFF in ``mystring`` back to
    characters with the ordinals 0x00..0xFF.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    :param mystring: the slice of text the encoder could not handle
    :param exc: the original UnicodeEncodeError, re-raised on failure
    :raise: ``exc`` if a character lies outside U+DC00..U+DCFF
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only U+DC00..U+DCFF map back to a byte value. Anything else, the
        # high surrogates U+D800..U+DBFF included, would produce a negative
        # ordinal below, so fail with the original exception instead.
        if not 0xDC00 <= code <= 0xDCFF:
            raise exc
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
|
||
|
|
||
|
|
||
|
def replace_surrogate_decode(mybytes):
    """
    Map every byte of ``mybytes`` to one character and return the resulting
    (unicode) string: values up to 0x7F map to the character with the same
    ordinal, values 0x80..0xFF map to U+DC80..U+DCFF.

    :raise NotASurrogateError: for values above 0xFF
    """
    pieces = []
    for item in mybytes:
        # Iterating yields ints for Python 3 style bytes and 1-character
        # native strings on Python 2.
        value = item if isinstance(item, int) else ord(item)
        if value > 0xFF:
            raise NotASurrogateError
        if value >= 0x80:
            pieces.append(_unichr(0xDC00 + value))
        else:
            pieces.append(_unichr(value))
    return str().join(pieces)
|
||
|
|
||
|
|
||
|
def encodefilename(fn):
    """Encode the text filename ``fn`` to bytes according to ``FS_ENCODING``.

    For 'ascii' and 'utf-8' the work is done character by character so that
    U+DC80..U+DCFF turn back into the single bytes 0x80..0xFF. Every other
    encoding is delegated to ``fn.encode`` with the ``FS_ERRORS`` handler.

    :raise UnicodeEncodeError: for characters that cannot be represented"""
    if FS_ENCODING not in ('ascii', 'utf-8'):
        return fn.encode(FS_ENCODING, FS_ERRORS)

    chunks = []
    if FS_ENCODING == 'ascii':
        # The ASCII encoder of Python 2 expects the error handler to return
        # a Unicode string encodable to ASCII, whereas our surrogateescape
        # handler has to return bytes in the 0x80-0xFF range.
        for pos, char in enumerate(fn):
            point = ord(char)
            if 0xDC80 <= point <= 0xDCFF:
                point -= 0xDC00
            elif point >= 128:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
            chunks.append(bytes_chr(point))
        return bytes().join(chunks)

    # The UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
    # never reaches our error handler.
    for pos, char in enumerate(fn):
        point = ord(char)
        if not 0xD800 <= point <= 0xDFFF:
            chunks.append(char.encode('utf-8'))
        elif 0xDC80 <= point <= 0xDCFF:
            chunks.append(bytes_chr(point - 0xDC00))
        else:
            raise UnicodeEncodeError(
                FS_ENCODING,
                fn, pos, pos + 1, 'surrogates not allowed')
    return bytes().join(chunks)
|
||
|
|
||
|
def decodefilename(fn):
    """Decode ``fn`` using ``FS_ENCODING`` and the ``FS_ERRORS`` error
    handler, and return the result."""
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name
|
||
|
|
||
|
# Encoding used by encodefilename/decodefilename, together with one sample
# pair: a byte filename and its surrogate-escaped text form.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Bring the encoding name into the canonical spelling used by the codecs
# registry, e.g. "utf-8" rather than "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
|
||
|
|
||
|
|
||
|
def register_surrogateescape():
    """
    Install ``surrogateescape_handler`` under the name ``FS_ERRORS`` unless
    a handler of that name already exists. Does nothing on Python 3.
    """
    if not PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler of that name yet: install the pure-Python one.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
|
||
|
|
||
|
|
||
|
# Import-time probe: decode a byte string containing 0x9f with the
# 'surrogateescape' error handler. If that raises for any reason,
# fall back to registering the pure-Python handler defined in this module.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except Exception:
    register_surrogateescape()
|