You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
123 lines
3.0 KiB
123 lines
3.0 KiB
4 years ago
|
"""
|
||
|
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
|
||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||
|
|
||
|
Licensed under Apache 2.0
|
||
|
"""
|
||
|
import encodings.aliases
|
||
|
import enum
|
||
|
|
||
|
|
||
|
@enum.unique
class Encoding(str, enum.Enum):
    """
    Python character encodings

    Each member's value is the encoding name string it was declared with,
    and because the class also derives from ``str`` a member can be passed
    wherever an encoding name string is expected.

    Lookup by value is forgiving: ``Encoding('UTF-8')``, ``Encoding('utf8')``
    and ``Encoding('utf_8')`` all return ``Encoding.UTF_8`` (see
    ``_missing_``).
    """

    ASCII = 'ascii'
    BIG_5 = 'big5'
    BIG_5_HKSCS = 'big5hkscs'
    CP_037 = 'cp037'
    CP_273 = 'cp273'
    CP_424 = 'cp424'
    CP_437 = 'cp437'
    CP_500 = 'cp500'
    CP_720 = 'cp720'
    CP_737 = 'cp737'
    CP_775 = 'cp775'
    CP_850 = 'cp850'
    CP_852 = 'cp852'
    CP_855 = 'cp855'
    CP_856 = 'cp856'
    CP_857 = 'cp857'
    CP_858 = 'cp858'
    CP_860 = 'cp860'
    CP_861 = 'cp861'
    CP_862 = 'cp862'
    CP_863 = 'cp863'
    CP_864 = 'cp864'
    CP_865 = 'cp865'
    CP_866 = 'cp866'
    CP_869 = 'cp869'
    CP_874 = 'cp874'
    CP_875 = 'cp875'
    CP_932 = 'cp932'
    CP_949 = 'cp949'
    CP_950 = 'cp950'
    CP_1006 = 'cp1006'
    CP_1026 = 'cp1026'
    CP_1125 = 'cp1125'
    CP_1140 = 'cp1140'
    CP_1250 = 'cp1250'
    CP_1251 = 'cp1251'
    CP_1252 = 'cp1252'
    CP_1253 = 'cp1253'
    CP_1254 = 'cp1254'
    CP_1255 = 'cp1255'
    CP_1256 = 'cp1256'
    CP_1257 = 'cp1257'
    CP_1258 = 'cp1258'
    EUC_JP = 'euc_jp'
    EUC_JIS_2004 = 'euc_jis_2004'
    EUC_JIS_X_0213 = 'euc_jisx0213'
    EUC_KR = 'euc_kr'
    GB_2312 = 'gb2312'
    GB_K = 'gbk'
    GB_18030 = 'gb18030'
    HZ = 'hz'
    ISO_2022_JP = 'iso2022_jp'
    ISO_2022_JP_1 = 'iso2022_jp_1'
    ISO_2022_JP_2 = 'iso2022_jp_2'
    ISO_2022_JP_2004 = 'iso2022_jp_2004'
    ISO_2022_JP_3 = 'iso2022_jp_3'
    ISO_2022_JP_EXT = 'iso2022_jp_ext'
    ISO_2022_KR = 'iso2022_kr'
    LATIN_1 = 'latin_1'
    ISO_8859_2 = 'iso8859_2'
    ISO_8859_3 = 'iso8859_3'
    ISO_8859_4 = 'iso8859_4'
    ISO_8859_5 = 'iso8859_5'
    ISO_8859_6 = 'iso8859_6'
    ISO_8859_7 = 'iso8859_7'
    ISO_8859_8 = 'iso8859_8'
    ISO_8859_9 = 'iso8859_9'
    ISO_8859_10 = 'iso8859_10'
    ISO_8859_11 = 'iso8859_11'
    ISO_8859_13 = 'iso8859_13'
    ISO_8859_14 = 'iso8859_14'
    ISO_8859_15 = 'iso8859_15'
    ISO_8859_16 = 'iso8859_16'
    JOHAB = 'johab'
    KOI_8_R = 'koi8_r'
    KOI_8_T = 'koi8_t'
    KOI_8_U = 'koi8_u'
    KZ_1048 = 'kz1048'
    MAC_CYRILLIC = 'mac_cyrillic'
    MAC_GREEK = 'mac_greek'
    MAC_ICELAND = 'mac_iceland'
    MAC_LATIN_2 = 'mac_latin2'
    MAC_ROMAN = 'mac_roman'
    MAC_TURKISH = 'mac_turkish'
    PTCP_154 = 'ptcp154'
    SHIFT_JIS = 'shift_jis'
    SHIFT_JIS_2004 = 'shift_jis_2004'
    SHIFT_JIS_X_0213 = 'shift_jisx0213'
    TIS_620 = 'tis_620'
    UTF_32 = 'utf_32'
    UTF_32_BE = 'utf_32_be'
    UTF_32_LE = 'utf_32_le'
    UTF_16 = 'utf_16'
    UTF_16_BE = 'utf_16_be'
    UTF_16_LE = 'utf_16_le'
    UTF_7 = 'utf_7'
    UTF_8 = 'utf_8'
    UTF_8_SIG = 'utf_8_sig'

    @classmethod
    def _missing_(cls, value):
        """
        Resolve a value that did not match any member exactly.

        The value is normalised with ``encodings.normalize_encoding``,
        lower-cased and translated through the ``encodings.aliases`` table;
        the lookup is then retried with the resulting name.

        :param value: the value passed to ``Encoding(...)``
        :return: the matching member, or the result of the default
            ``Enum._missing_`` hook (``None``) when nothing matches, in
            which case Enum raises ``ValueError`` for the original value
        """
        # normalize_encoding() only understands text; anything else would
        # blow up with TypeError instead of the ValueError that callers of
        # an Enum lookup expect, so defer to the default hook right away.
        if not isinstance(value, (str, bytes)):
            return super()._missing_(value)

        normalized = encodings.normalize_encoding(value).lower()
        normalized = encodings.aliases.aliases.get(normalized, normalized)

        # Retry only when normalisation actually changed something,
        # otherwise the lookup would recurse on the same value forever.
        if value != normalized:
            try:
                return cls(normalized)
            except ValueError:
                # Fall through so the error raised by Enum names the value
                # the caller supplied rather than the normalised form.
                pass

        return super()._missing_(value)
|