You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
427 lines
15 KiB
427 lines
15 KiB
"""
|
|
This module redefines ``str`` on Python 2.x to be a subclass of the Py2
|
|
``unicode`` type that behaves like the Python 3.x ``str``.
|
|
|
|
The main differences between ``newstr`` and Python 2.x's ``unicode`` type are
|
|
the stricter type-checking and absence of a `u''` prefix in the representation.
|
|
|
|
It is designed to be used together with the ``unicode_literals`` import
|
|
as follows:
|
|
|
|
>>> from __future__ import unicode_literals
|
|
>>> from builtins import str, isinstance
|
|
|
|
On Python 3.x and normally on Python 2.x, these expressions hold
|
|
|
|
>>> str('blah') is 'blah'
|
|
True
|
|
>>> isinstance('blah', str)
|
|
True
|
|
|
|
However, on Python 2.x, with this import:
|
|
|
|
>>> from __future__ import unicode_literals
|
|
|
|
the same expressions are False:
|
|
|
|
>>> str('blah') is 'blah'
|
|
False
|
|
>>> isinstance('blah', str)
|
|
False
|
|
|
|
This module is designed to be imported together with ``unicode_literals`` on
|
|
Python 2 to bring the meaning of ``str`` back into alignment with unprefixed
|
|
string literals (i.e. ``unicode`` subclasses).
|
|
|
|
Note that ``str()`` (and ``print()``) would then normally call the
|
|
``__unicode__`` method on objects in Python 2. To define string
|
|
representations of your objects portably across Py3 and Py2, use the
|
|
:func:`python_2_unicode_compatible` decorator in :mod:`future.utils`.
|
|
|
|
"""
|
|
|
|
from numbers import Number
|
|
|
|
from future.utils import PY3, istext, with_metaclass, isnewbytes
|
|
from future.types import no, issubset
|
|
from future.types.newobject import newobject
|
|
|
|
|
|
# Pick the text type and the ``Iterable`` ABC for the running interpreter.
if not PY3:
    # Py2 branch: the ABCs are importable straight from ``collections``.
    from collections import Iterable
else:
    # We'll probably never use newstr on Py3 anyway...
    from collections.abc import Iterable
    unicode = str
|
|
|
|
|
|
class BaseNewStr(type):
    """
    Metaclass that customises ``isinstance()`` checks for ``newstr``.
    """

    def __instancecheck__(cls, instance):
        """
        Decide whether ``instance`` should be treated as an instance of
        ``cls``.

        When ``cls`` is ``newstr`` itself, any ``unicode`` object qualifies.
        For any other class using this metaclass, the class of ``instance``
        must be a subclass of ``cls``.
        """
        if not cls == newstr:
            return issubclass(instance.__class__, cls)
        return isinstance(instance, unicode)
|
|
|
|
|
|
class newstr(with_metaclass(BaseNewStr, unicode)):
    """
    A backport of the Python 3 str object to Py2
    """
    # Template for the TypeError raised when a bytes object is passed where
    # text is required (see startswith() / endswith()).
    no_convert_msg = "Can't convert '{0}' object to str implicitly"

    def __new__(cls, *args, **kwargs):
        """
        From the Py3 str docstring:

          str(object='') -> str
          str(bytes_or_buffer[, encoding[, errors]]) -> str

          Create a new string object from the given object. If encoding or
          errors is specified, then the object must expose a data buffer
          that will be decoded using the given encoding and error handler.
          Otherwise, returns the result of object.__str__() (if defined)
          or repr(object).
          encoding defaults to sys.getdefaultencoding().
          errors defaults to 'strict'.

        """
        if len(args) == 0:
            return super(newstr, cls).__new__(cls)
        # Special case: If someone requests str(str(u'abc')), return the same
        # object (same id) for consistency with Py3.3. This is not true for
        # other objects like list or dict.
        elif type(args[0]) == newstr and cls == newstr:
            return args[0]
        elif isinstance(args[0], unicode):
            value = args[0]
        elif isinstance(args[0], bytes):   # i.e. Py2 bytes or newbytes
            if 'encoding' in kwargs or len(args) > 1:
                # Explicit encoding (positional or keyword): decode the bytes.
                value = args[0].decode(*args[1:], **kwargs)
            else:
                value = args[0].__str__()
        else:
            value = args[0]
        return super(newstr, cls).__new__(cls, value)

    def __repr__(self):
        """
        Without the u prefix
        """
        value = super(newstr, self).__repr__()
        # Only drop the first character when it really is the ``u`` prefix;
        # otherwise (no prefix) slicing would eat the opening quote.
        if value[:1] == 'u':
            return value[1:]
        return value

    def __getitem__(self, y):
        """
        Warning: Python <= 2.7.6 has a bug that causes this method never to be called
        when y is a slice object. Therefore the type of newstr()[:2] is wrong
        (unicode instead of newstr).
        """
        return newstr(super(newstr, self).__getitem__(y))

    def __contains__(self, key):
        """
        Implement ``key in self``.

        Raises TypeError when ``key`` is neither text nor a native (non-new)
        bytes object.
        """
        errmsg = "'in <string>' requires string as left operand, not {0}"
        # Don't use isinstance() here because we only want to catch
        # newstr, not Python 2 unicode:
        if type(key) == newstr:
            newkey = key
        elif (isinstance(key, unicode) or
                (isinstance(key, bytes) and not isnewbytes(key))):
            newkey = newstr(key)
        else:
            raise TypeError(errmsg.format(type(key)))
        # NOTE(review): presumably issubset() tests for a contiguous
        # sub-sequence; its definition lives in future.types -- confirm.
        return issubset(list(newkey), list(self))

    @no('newbytes')
    def __add__(self, other):
        """Return ``self + other`` as a newstr."""
        return newstr(super(newstr, self).__add__(other))

    @no('newbytes')
    def __radd__(self, left):
        " left + self "
        try:
            return newstr(left) + self
        except Exception:
            # Let Python try other options / raise its own TypeError, but do
            # not swallow KeyboardInterrupt or SystemExit.
            return NotImplemented

    def __mul__(self, other):
        """Return ``self * other`` as a newstr."""
        return newstr(super(newstr, self).__mul__(other))

    def __rmul__(self, other):
        """Return ``other * self`` as a newstr."""
        return newstr(super(newstr, self).__rmul__(other))

    def join(self, iterable):
        """
        Join the items of ``iterable`` with ``self`` as the separator and
        return the result as a newstr.

        Raises TypeError if any item is a newbytes object.
        """
        errmsg = 'sequence item {0}: expected unicode string, found bytes'
        # Materialise the iterable once: it is scanned here and then handed
        # to unicode.join(), so a one-shot iterator (e.g. a generator) would
        # otherwise arrive exhausted and silently produce an empty result.
        items = list(iterable)
        for i, item in enumerate(items):
            # isnewbytes() is used rather than isinstance(item, newbytes)
            # because __instancecheck__ is being overridden. E.g.
            # isinstance(b'abc', newbytes) is True on Py2.
            if isnewbytes(item):
                raise TypeError(errmsg.format(i))
        # Support use as a staticmethod: str.join('-', ['a', 'b'])
        if type(self) == newstr:
            return newstr(super(newstr, self).join(items))
        else:
            return newstr(super(newstr, newstr(self)).join(items))

    @no('newbytes')
    def find(self, sub, *args):
        """Return the lowest index of ``sub`` in the string, or -1."""
        return super(newstr, self).find(sub, *args)

    @no('newbytes')
    def rfind(self, sub, *args):
        """Return the highest index of ``sub`` in the string, or -1."""
        return super(newstr, self).rfind(sub, *args)

    @no('newbytes', (1, 2))
    def replace(self, old, new, *args):
        """Return a newstr with occurrences of ``old`` replaced by ``new``."""
        return newstr(super(newstr, self).replace(old, new, *args))

    def decode(self, *args):
        """Always raise AttributeError: text cannot be decoded."""
        raise AttributeError("decode method has been disabled in newstr")

    def encode(self, encoding='utf-8', errors='strict'):
        """
        Returns bytes

        Encode S using the codec registered for encoding. Default encoding
        is 'utf-8'. errors may be given to set a different error
        handling scheme. Default is 'strict' meaning that encoding errors raise
        a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
        'xmlcharrefreplace' as well as any other name registered with
        codecs.register_error that can handle UnicodeEncodeErrors.
        """
        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import -- confirm.
        from future.types.newbytes import newbytes
        # Py2 unicode.encode() takes encoding and errors as optional parameter,
        # not keyword arguments as in Python 3 str.

        # For the surrogateescape error handling mechanism, the
        # codecs.register_error() function seems to be inadequate for an
        # implementation of it when encoding. (Decoding seems fine, however.)
        # For example, in the case of
        #     u'\udcc3'.encode('ascii', 'surrogateescape_handler')
        # after registering the ``surrogateescape_handler`` function in
        # future.utils.surrogateescape, both Python 2.x and 3.x raise an
        # exception anyway after the function is called because the unicode
        # string it has to return isn't encodable strictly as ASCII.

        if errors == 'surrogateescape':
            if encoding == 'utf-16':
                # Known to fail here. See test_encoding_works_normally()
                raise NotImplementedError('FIXME: surrogateescape handling is '
                                          'not yet implemented properly')
            # Encode char by char, building up list of byte-strings
            mybytes = []
            for c in self:
                code = ord(c)
                # NOTE(review): PEP 383 only escapes U+DC80..U+DCFF; for
                # code points below 0xDC00 the subtraction below goes
                # negative. Range left unchanged pending confirmation.
                if 0xD800 <= code <= 0xDCFF:
                    mybytes.append(newbytes([code - 0xDC00]))
                else:
                    mybytes.append(c.encode(encoding=encoding))
            return newbytes(b'').join(mybytes)
        return newbytes(super(newstr, self).encode(encoding, errors))

    @no('newbytes', 1)
    def startswith(self, prefix, *args):
        """
        Return True if the string starts with ``prefix`` (a string or an
        iterable of candidate prefixes).

        Raises TypeError if any candidate is a newbytes object.
        """
        if isinstance(prefix, Iterable):
            for thing in prefix:
                if isnewbytes(thing):
                    raise TypeError(self.no_convert_msg.format(type(thing)))
        return super(newstr, self).startswith(prefix, *args)

    @no('newbytes', 1)
    def endswith(self, prefix, *args):
        """
        Return True if the string ends with ``prefix`` (a string or an
        iterable of candidate suffixes).

        Raises TypeError if any candidate is a newbytes object.
        """
        # Note we need the decorator above as well as the isnewbytes()
        # check because prefix can be either a bytes object or e.g. a
        # tuple of possible prefixes. (If it's a bytes object, each item
        # in it is an int.)
        if isinstance(prefix, Iterable):
            for thing in prefix:
                if isnewbytes(thing):
                    raise TypeError(self.no_convert_msg.format(type(thing)))
        return super(newstr, self).endswith(prefix, *args)

    @no('newbytes', 1)
    def split(self, sep=None, maxsplit=-1):
        """Split on ``sep`` and return a list of newstr parts."""
        # Py2 unicode.split() takes maxsplit as an optional parameter,
        # not as a keyword argument as in Python 3 str.
        parts = super(newstr, self).split(sep, maxsplit)
        return [newstr(part) for part in parts]

    @no('newbytes', 1)
    def rsplit(self, sep=None, maxsplit=-1):
        """Split on ``sep`` from the right; return a list of newstr parts."""
        # Py2 unicode.rsplit() takes maxsplit as an optional parameter,
        # not as a keyword argument as in Python 3 str.
        parts = super(newstr, self).rsplit(sep, maxsplit)
        return [newstr(part) for part in parts]

    @no('newbytes', 1)
    def partition(self, sep):
        """Partition around the first ``sep``; return a tuple of newstr."""
        parts = super(newstr, self).partition(sep)
        return tuple(newstr(part) for part in parts)

    @no('newbytes', 1)
    def rpartition(self, sep):
        """Partition around the last ``sep``; return a tuple of newstr."""
        parts = super(newstr, self).rpartition(sep)
        return tuple(newstr(part) for part in parts)

    @no('newbytes', 1)
    def index(self, sub, *args):
        """
        Like newstr.find() but raise ValueError when the substring is not
        found.
        """
        pos = self.find(sub, *args)
        if pos == -1:
            raise ValueError('substring not found')
        return pos

    def splitlines(self, keepends=False):
        """
        S.splitlines(keepends=False) -> list of strings

        Return a list of the lines in S, breaking at line boundaries.
        Line breaks are not included in the resulting list unless keepends
        is given and true.
        """
        # Py2 unicode.splitlines() takes keepends as an optional parameter,
        # not as a keyword argument as in Python 3 str.
        parts = super(newstr, self).splitlines(keepends)
        return [newstr(part) for part in parts]

    def __eq__(self, other):
        """
        Compare equal only against text or native (non-new) bytes objects;
        return NotImplemented for anything else.
        """
        if (isinstance(other, unicode) or
                isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__eq__(other)
        else:
            return NotImplemented

    def __hash__(self):
        """Return the hash of the underlying text."""
        # ``self`` is always a ``unicode`` instance (newstr derives from it),
        # so the parent hash always applies.
        return super(newstr, self).__hash__()

    def __ne__(self, other):
        """
        Inverse of equality for text / native bytes; any other type is
        always unequal.
        """
        if (isinstance(other, unicode) or
                isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__ne__(other)
        else:
            return True

    # Template for the TypeError raised by the ordering comparisons below.
    unorderable_err = 'unorderable types: str() and {0}'

    def __lt__(self, other):
        """``self < other``; TypeError unless other is text/native bytes."""
        if (isinstance(other, unicode) or
                isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__lt__(other)
        raise TypeError(self.unorderable_err.format(type(other)))

    def __le__(self, other):
        """``self <= other``; TypeError unless other is text/native bytes."""
        if (isinstance(other, unicode) or
                isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__le__(other)
        raise TypeError(self.unorderable_err.format(type(other)))

    def __gt__(self, other):
        """``self > other``; TypeError unless other is text/native bytes."""
        if (isinstance(other, unicode) or
                isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__gt__(other)
        raise TypeError(self.unorderable_err.format(type(other)))

    def __ge__(self, other):
        """``self >= other``; TypeError unless other is text/native bytes."""
        if (isinstance(other, unicode) or
                isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__ge__(other)
        raise TypeError(self.unorderable_err.format(type(other)))

    def __getattribute__(self, name):
        """
        A trick to cause the ``hasattr`` builtin-fn to return False for
        the 'decode' method on Py2.
        """
        if name in ['decode', u'decode']:
            raise AttributeError("decode method has been disabled in newstr")
        return super(newstr, self).__getattribute__(name)

    def __native__(self):
        """
        A hook for the future.utils.native() function.
        """
        return unicode(self)

    @staticmethod
    def maketrans(x, y=None, z=None):
        """
        Return a translation table usable for str.translate().

        If there is only one argument, it must be a dictionary mapping Unicode
        ordinals (integers) or characters to Unicode ordinals, strings or None.
        Character keys will be then converted to ordinals.
        If there are two arguments, they must be strings of equal length, and
        in the resulting dictionary, each character in x will be mapped to the
        character at the same position in y. If there is a third argument, it
        must be a string, whose characters will be mapped to None in the result.

        Raises TypeError for arguments of the wrong type and ValueError for
        over-long string keys or for x and y of unequal length.
        """

        if y is None:
            # Explicit check rather than ``assert`` so that the validation
            # is not stripped when running under ``python -O``.
            if z is not None:
                raise TypeError('maketrans: a third argument requires a '
                                'second argument')
            if not isinstance(x, dict):
                raise TypeError('if you give only one argument to maketrans it must be a dict')
            result = {}
            for (key, value) in x.items():
                if isinstance(key, int):
                    # Already an ordinal: use it as-is.
                    result[key] = value
                    continue
                if len(key) > 1:
                    raise ValueError('keys in translate table must be strings or integers')
                result[ord(key)] = value
        else:
            # Both arguments must be text.
            if not (isinstance(x, unicode) and isinstance(y, unicode)):
                raise TypeError('x and y must be unicode strings')
            if not len(x) == len(y):
                raise ValueError('the first two maketrans arguments must have equal length')
            result = {}
            for (xi, yi) in zip(x, y):
                result[ord(xi)] = ord(yi)

        if z is not None:
            # Characters listed in z are deleted by translate().
            for char in z:
                result[ord(char)] = None
        return result

    def translate(self, table):
        """
        S.translate(table) -> str

        Return a copy of the string S, where all characters have been mapped
        through the given translation table, which must be a mapping of
        Unicode ordinals to Unicode ordinals, strings, or None.
        Unmapped characters are left untouched. Characters mapped to None
        are deleted.
        """
        # On Py2, chr() only accepts 0-255 and returns a byte string, so
        # integer ordinals have to go through unichr() there.
        if PY3:
            to_char = chr
        else:
            to_char = unichr  # noqa: F821 -- Py2-only builtin
        pieces = []
        for c in self:
            code = ord(c)
            if code in table:
                val = table[code]
                if val is None:
                    continue
                elif isinstance(val, unicode):
                    pieces.append(val)
                else:
                    pieces.append(to_char(val))
            else:
                pieces.append(c)
        # Wrap the result so callers get a newstr back, like the other
        # string-returning methods of this class.
        return newstr(u''.join(pieces))

    def isprintable(self):
        """Not implemented yet."""
        raise NotImplementedError('fixme')

    def isidentifier(self):
        """Not implemented yet."""
        raise NotImplementedError('fixme')

    def format_map(self, mapping=None):
        """
        Not implemented yet.

        ``mapping`` is accepted (and ignored) so that a Py3-style call
        ``s.format_map(m)`` raises NotImplementedError rather than a
        TypeError about the argument count.
        """
        raise NotImplementedError('fixme')
|
|
|
|
|
|
# Public names exported by a star-import of this module.
__all__ = ['newstr']
|