r"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Usage:
    >>> import charamel
    >>> detector = charamel.Detector()
    >>> content = b'El espa\xf1ol o castellano del lat\xedn hablado'
    >>> encoding = detector.detect(content)
    >>> content.decode(encoding)
    'El español o castellano del latín hablado'

Licensed under Apache 2.0
"""
import encodings.aliases
import enum
import gzip
import itertools
import math
import pathlib
import struct
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple

__version__ = '1.0.0'


# ---------------------------------------------------------------------------
# libs/charamel/encoding.py
# ---------------------------------------------------------------------------


@enum.unique
class Encoding(str, enum.Enum):
    """
    Python character encodings

    Every member is also a ``str`` equal to its codec name, so a member can
    be passed directly to ``bytes.decode``. Lookup by value accepts codec
    aliases as well (``Encoding('utf-8')``, ``Encoding('latin1')``), see
    ``_missing_``.
    """

    ASCII = 'ascii'
    BIG_5 = 'big5'
    BIG_5_HKSCS = 'big5hkscs'
    CP_037 = 'cp037'
    CP_273 = 'cp273'
    CP_424 = 'cp424'
    CP_437 = 'cp437'
    CP_500 = 'cp500'
    CP_720 = 'cp720'
    CP_737 = 'cp737'
    CP_775 = 'cp775'
    CP_850 = 'cp850'
    CP_852 = 'cp852'
    CP_855 = 'cp855'
    CP_856 = 'cp856'
    CP_857 = 'cp857'
    CP_858 = 'cp858'
    CP_860 = 'cp860'
    CP_861 = 'cp861'
    CP_862 = 'cp862'
    CP_863 = 'cp863'
    CP_864 = 'cp864'
    CP_865 = 'cp865'
    CP_866 = 'cp866'
    CP_869 = 'cp869'
    CP_874 = 'cp874'
    CP_875 = 'cp875'
    CP_932 = 'cp932'
    CP_949 = 'cp949'
    CP_950 = 'cp950'
    CP_1006 = 'cp1006'
    CP_1026 = 'cp1026'
    CP_1125 = 'cp1125'
    CP_1140 = 'cp1140'
    CP_1250 = 'cp1250'
    CP_1251 = 'cp1251'
    CP_1252 = 'cp1252'
    CP_1253 = 'cp1253'
    CP_1254 = 'cp1254'
    CP_1255 = 'cp1255'
    CP_1256 = 'cp1256'
    CP_1257 = 'cp1257'
    CP_1258 = 'cp1258'
    EUC_JP = 'euc_jp'
    EUC_JIS_2004 = 'euc_jis_2004'
    EUC_JIS_X_0213 = 'euc_jisx0213'
    EUC_KR = 'euc_kr'
    GB_2312 = 'gb2312'
    GB_K = 'gbk'
    GB_18030 = 'gb18030'
    HZ = 'hz'
    ISO_2022_JP = 'iso2022_jp'
    ISO_2022_JP_1 = 'iso2022_jp_1'
    ISO_2022_JP_2 = 'iso2022_jp_2'
    ISO_2022_JP_2004 = 'iso2022_jp_2004'
    ISO_2022_JP_3 = 'iso2022_jp_3'
    ISO_2022_JP_EXT = 'iso2022_jp_ext'
    ISO_2022_KR = 'iso2022_kr'
    LATIN_1 = 'latin_1'
    ISO_8859_2 = 'iso8859_2'
    ISO_8859_3 = 'iso8859_3'
    ISO_8859_4 = 'iso8859_4'
    ISO_8859_5 = 'iso8859_5'
    ISO_8859_6 = 'iso8859_6'
    ISO_8859_7 = 'iso8859_7'
    ISO_8859_8 = 'iso8859_8'
    ISO_8859_9 = 'iso8859_9'
    ISO_8859_10 = 'iso8859_10'
    ISO_8859_11 = 'iso8859_11'
    ISO_8859_13 = 'iso8859_13'
    ISO_8859_14 = 'iso8859_14'
    ISO_8859_15 = 'iso8859_15'
    ISO_8859_16 = 'iso8859_16'
    JOHAB = 'johab'
    KOI_8_R = 'koi8_r'
    KOI_8_T = 'koi8_t'
    KOI_8_U = 'koi8_u'
    KZ_1048 = 'kz1048'
    MAC_CYRILLIC = 'mac_cyrillic'
    MAC_GREEK = 'mac_greek'
    MAC_ICELAND = 'mac_iceland'
    MAC_LATIN_2 = 'mac_latin2'
    MAC_ROMAN = 'mac_roman'
    MAC_TURKISH = 'mac_turkish'
    PTCP_154 = 'ptcp154'
    SHIFT_JIS = 'shift_jis'
    SHIFT_JIS_2004 = 'shift_jis_2004'
    SHIFT_JIS_X_0213 = 'shift_jisx0213'
    TIS_620 = 'tis_620'
    UTF_32 = 'utf_32'
    UTF_32_BE = 'utf_32_be'
    UTF_32_LE = 'utf_32_le'
    UTF_16 = 'utf_16'
    UTF_16_BE = 'utf_16_be'
    UTF_16_LE = 'utf_16_le'
    UTF_7 = 'utf_7'
    UTF_8 = 'utf_8'
    UTF_8_SIG = 'utf_8_sig'

    @classmethod
    def _missing_(cls, value):
        """
        Resolve codec aliases when `value` is not an exact member value

        The name is normalized the same way the `encodings` package does it
        and then mapped through the standard alias table, so spellings like
        'UTF-8' or 'latin1' resolve to the matching member.

        Args:
            value: Object passed to `Encoding(...)` that matched no member

        Returns:
            Matching member, or `None` so that Enum raises `ValueError`
        """
        # normalize_encoding() only understands text; anything else used to
        # leak a TypeError out of the lookup instead of the usual ValueError
        if not isinstance(value, (str, bytes)):
            return None
        normalized = encodings.normalize_encoding(value).lower()
        normalized = encodings.aliases.aliases.get(normalized, normalized)
        # Retry only if normalization changed something, otherwise the
        # lookup would recurse forever on an unknown name
        if value != normalized:
            return cls(normalized)
        return super()._missing_(value)


# ---------------------------------------------------------------------------
# libs/charamel/resources/__init__.py
#
# The model data lives next to this module as gzip files:
# `features.gzip`, `biases.gzip` and `weights/<encoding value>.gzip`.
# ---------------------------------------------------------------------------

RESOURCE_DIRECTORY = pathlib.Path(__file__).parent.absolute()
WEIGHT_DIRECTORY = RESOURCE_DIRECTORY / 'weights'


def _unpack(file: pathlib.Path, pattern: str) -> List[Any]:
    """
    Unpack struct values from file

    Args:
        file: Gzip-compressed file that stores struct-packed values
        pattern: Struct pattern describing one record

    Returns:
        List with the first field of every unpacked record
    """
    with gzip.open(file, 'rb') as data:
        return [values[0] for values in struct.iter_unpack(pattern, data.read())]


def load_features() -> Dict[int, int]:
    """
    Load byte-level feature names and indices

    Returns:
        Mapping from features to their indices in weight matrix
    """
    features = _unpack(RESOURCE_DIRECTORY / 'features.gzip', pattern='>H')
    return {feature: index for index, feature in enumerate(features)}


def load_biases(encodings: Sequence[Encoding]) -> Dict[Encoding, float]:
    """
    Load linear model bias values for given encodings

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their biases

    Raises:
        KeyError: If the bias table has no entry for a requested encoding
    """
    biases = {}
    with gzip.open(RESOURCE_DIRECTORY / 'biases.gzip', 'rb') as data:
        for line in data:
            fields = line.decode().split()
            if not fields:
                # Tolerate blank lines instead of failing on tuple unpacking
                continue
            name, bias = fields
            biases[name] = float(bias)

    # Encoding members hash and compare like their string values, so they
    # can be looked up in a table keyed by plain strings
    return {encoding: biases[encoding] for encoding in encodings}


def load_weights(encodings: Sequence[Encoding]) -> Dict[Encoding, List[float]]:
    """
    Load linear model weight vectors for given encodings

    Each encoding has its own file `<encoding value>.gzip` inside
    `WEIGHT_DIRECTORY` that stores big-endian half-precision floats.

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their weight vectors
    """
    weights = {}
    for encoding in encodings:
        # Build the file name from the member value explicitly. Formatting
        # the member itself (f'{encoding}') yields 'Encoding.ASCII' instead
        # of 'ascii' since Python 3.11, which pointed at a missing file.
        name = Encoding(encoding).value
        weights[encoding] = _unpack(WEIGHT_DIRECTORY / f'{name}.gzip', pattern='>e')
    return weights


# ---------------------------------------------------------------------------
# libs/charamel/detector.py
# ---------------------------------------------------------------------------


def _get_features(content: bytes) -> Set[int]:
    """
    Extract unique byte uni-grams and bi-grams

    A uni-gram is represented by the byte value itself, a bi-gram by
    `first * 256 + second`.

    Args:
        content: Encoded text

    Returns:
        Set of integers that represent byte n-grams
    """
    pairs = zip(content, itertools.islice(content, 1, None))
    return set(content).union(x * 256 + y for x, y in pairs)


def _apply_sigmoid(value: float) -> float:
    """
    Apply sigmoid function to given value

    Args:
        value: Real-valued score

    Returns:
        Value squashed into the range [0, 1]
    """
    try:
        return 1 / (1 + math.exp(-value))
    except OverflowError:
        # exp() overflows for strongly negative scores (below about -709);
        # the mathematical limit of the sigmoid there is 0
        return 0.0


class Detector:
    """
    Universal encoding detector
    """

    def __init__(
        self,
        encodings: Sequence[Encoding] = tuple(Encoding),
        min_confidence: float = 0.0,
    ):
        """
        Create universal encoding detector for given encodings

        Args:
            encodings: Encodings that will be supported by this Detector instance,
                fewer encodings lead to faster runtime
            min_confidence: Minimum confidence threshold for encodings

        Raises:
            ValueError: If `encodings` is empty or `min_confidence` is
                outside of [0, 1]

        Example:
            >>> detector = Detector(
            ...     encodings=[Encoding.UTF_8, Encoding.BIG_5],
            ...     min_confidence=0.7,
            ... )
        """
        if not encodings:
            raise ValueError('No encodings specified')

        if not 0.0 <= min_confidence <= 1.0:
            raise ValueError('min_confidence must be in range [0, 1]')

        self._features = load_features()
        self._weights = load_weights(encodings)
        self._biases = load_biases(encodings)
        self._min_confidence = min_confidence

    def _score(self, content: bytes) -> Dict[Encoding, float]:
        """
        Compute how likely each encoding is able to decode the content

        The score of an encoding is its bias plus the sum of its weights
        for every known feature that occurs in the content.

        Args:
            content: Encoded text

        Returns:
            Real-valued score for each encoding
        """
        scores = self._biases.copy()
        # Features that the model has no weight for are ignored
        features = _get_features(content).intersection(self._features)
        indices = [self._features[feature] for feature in features]
        for encoding, weights in self._weights.items():
            scores[encoding] += sum(weights[index] for index in indices)
        return scores

    def detect(self, content: bytes) -> Optional[Encoding]:
        r"""
        Detect the most probable encoding for given byte content

        Args:
            content: Encoded text

        Returns:
            Encoding or `None` if not confident enough

        Example:
            >>> detector = Detector()
            >>> encoding = detector.detect(b'\xc4\xe3\xba\xc3')
        """
        scores = self._score(content)
        if scores:
            encoding, score = max(scores.items(), key=lambda x: x[1])
            if _apply_sigmoid(score) >= self._min_confidence:
                return encoding
        return None

    def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]:
        r"""
        Detect `top` probable encodings with confidences

        Args:
            content: Encoded text
            top: How many of the most likely encodings to return

        Returns:
            Pairs of encoding and confidence, most confident first; pairs
            below the minimum confidence threshold are left out

        Example:
            >>> detector = Detector()
            >>> candidates = detector.probe(b'\xc4\xe3\xba\xc3')
        """
        scores = sorted(self._score(content).items(), key=lambda x: x[1], reverse=True)
        confidences = [
            (encoding, _apply_sigmoid(score)) for encoding, score in scores[:top]
        ]
        return [
            (encoding, confidence)
            for encoding, confidence in confidences
            if confidence >= self._min_confidence
        ]
b/libs/charamel/resources/weights/ascii.gzip differ diff --git a/libs/charamel/resources/weights/big5.gzip b/libs/charamel/resources/weights/big5.gzip new file mode 100644 index 000000000..156368c44 Binary files /dev/null and b/libs/charamel/resources/weights/big5.gzip differ diff --git a/libs/charamel/resources/weights/big5hkscs.gzip b/libs/charamel/resources/weights/big5hkscs.gzip new file mode 100644 index 000000000..5fe8970a1 Binary files /dev/null and b/libs/charamel/resources/weights/big5hkscs.gzip differ diff --git a/libs/charamel/resources/weights/cp037.gzip b/libs/charamel/resources/weights/cp037.gzip new file mode 100644 index 000000000..1fa58d895 Binary files /dev/null and b/libs/charamel/resources/weights/cp037.gzip differ diff --git a/libs/charamel/resources/weights/cp1006.gzip b/libs/charamel/resources/weights/cp1006.gzip new file mode 100644 index 000000000..09cce0caf Binary files /dev/null and b/libs/charamel/resources/weights/cp1006.gzip differ diff --git a/libs/charamel/resources/weights/cp1026.gzip b/libs/charamel/resources/weights/cp1026.gzip new file mode 100644 index 000000000..8fe9bb84c Binary files /dev/null and b/libs/charamel/resources/weights/cp1026.gzip differ diff --git a/libs/charamel/resources/weights/cp1125.gzip b/libs/charamel/resources/weights/cp1125.gzip new file mode 100644 index 000000000..9100b6e4b Binary files /dev/null and b/libs/charamel/resources/weights/cp1125.gzip differ diff --git a/libs/charamel/resources/weights/cp1140.gzip b/libs/charamel/resources/weights/cp1140.gzip new file mode 100644 index 000000000..d8506c1c4 Binary files /dev/null and b/libs/charamel/resources/weights/cp1140.gzip differ diff --git a/libs/charamel/resources/weights/cp1250.gzip b/libs/charamel/resources/weights/cp1250.gzip new file mode 100644 index 000000000..1b9b1ba5b Binary files /dev/null and b/libs/charamel/resources/weights/cp1250.gzip differ diff --git a/libs/charamel/resources/weights/cp1251.gzip 
b/libs/charamel/resources/weights/cp1251.gzip new file mode 100644 index 000000000..a41146444 Binary files /dev/null and b/libs/charamel/resources/weights/cp1251.gzip differ diff --git a/libs/charamel/resources/weights/cp1252.gzip b/libs/charamel/resources/weights/cp1252.gzip new file mode 100644 index 000000000..3f87769a7 Binary files /dev/null and b/libs/charamel/resources/weights/cp1252.gzip differ diff --git a/libs/charamel/resources/weights/cp1253.gzip b/libs/charamel/resources/weights/cp1253.gzip new file mode 100644 index 000000000..e57a16719 Binary files /dev/null and b/libs/charamel/resources/weights/cp1253.gzip differ diff --git a/libs/charamel/resources/weights/cp1254.gzip b/libs/charamel/resources/weights/cp1254.gzip new file mode 100644 index 000000000..089f06899 Binary files /dev/null and b/libs/charamel/resources/weights/cp1254.gzip differ diff --git a/libs/charamel/resources/weights/cp1255.gzip b/libs/charamel/resources/weights/cp1255.gzip new file mode 100644 index 000000000..5c08a1c18 Binary files /dev/null and b/libs/charamel/resources/weights/cp1255.gzip differ diff --git a/libs/charamel/resources/weights/cp1256.gzip b/libs/charamel/resources/weights/cp1256.gzip new file mode 100644 index 000000000..a17d45b13 Binary files /dev/null and b/libs/charamel/resources/weights/cp1256.gzip differ diff --git a/libs/charamel/resources/weights/cp1257.gzip b/libs/charamel/resources/weights/cp1257.gzip new file mode 100644 index 000000000..efd13ef3d Binary files /dev/null and b/libs/charamel/resources/weights/cp1257.gzip differ diff --git a/libs/charamel/resources/weights/cp1258.gzip b/libs/charamel/resources/weights/cp1258.gzip new file mode 100644 index 000000000..8f546a44b Binary files /dev/null and b/libs/charamel/resources/weights/cp1258.gzip differ diff --git a/libs/charamel/resources/weights/cp273.gzip b/libs/charamel/resources/weights/cp273.gzip new file mode 100644 index 000000000..cce71cceb Binary files /dev/null and 
b/libs/charamel/resources/weights/cp273.gzip differ diff --git a/libs/charamel/resources/weights/cp424.gzip b/libs/charamel/resources/weights/cp424.gzip new file mode 100644 index 000000000..5a13c138e Binary files /dev/null and b/libs/charamel/resources/weights/cp424.gzip differ diff --git a/libs/charamel/resources/weights/cp437.gzip b/libs/charamel/resources/weights/cp437.gzip new file mode 100644 index 000000000..c60c68667 Binary files /dev/null and b/libs/charamel/resources/weights/cp437.gzip differ diff --git a/libs/charamel/resources/weights/cp500.gzip b/libs/charamel/resources/weights/cp500.gzip new file mode 100644 index 000000000..9461df494 Binary files /dev/null and b/libs/charamel/resources/weights/cp500.gzip differ diff --git a/libs/charamel/resources/weights/cp720.gzip b/libs/charamel/resources/weights/cp720.gzip new file mode 100644 index 000000000..b9d99e803 Binary files /dev/null and b/libs/charamel/resources/weights/cp720.gzip differ diff --git a/libs/charamel/resources/weights/cp737.gzip b/libs/charamel/resources/weights/cp737.gzip new file mode 100644 index 000000000..50a6feadd Binary files /dev/null and b/libs/charamel/resources/weights/cp737.gzip differ diff --git a/libs/charamel/resources/weights/cp775.gzip b/libs/charamel/resources/weights/cp775.gzip new file mode 100644 index 000000000..955569ee0 Binary files /dev/null and b/libs/charamel/resources/weights/cp775.gzip differ diff --git a/libs/charamel/resources/weights/cp850.gzip b/libs/charamel/resources/weights/cp850.gzip new file mode 100644 index 000000000..cf942dbd4 Binary files /dev/null and b/libs/charamel/resources/weights/cp850.gzip differ diff --git a/libs/charamel/resources/weights/cp852.gzip b/libs/charamel/resources/weights/cp852.gzip new file mode 100644 index 000000000..c8d5cec53 Binary files /dev/null and b/libs/charamel/resources/weights/cp852.gzip differ diff --git a/libs/charamel/resources/weights/cp855.gzip b/libs/charamel/resources/weights/cp855.gzip new file mode 100644 
index 000000000..228100c4e Binary files /dev/null and b/libs/charamel/resources/weights/cp855.gzip differ diff --git a/libs/charamel/resources/weights/cp856.gzip b/libs/charamel/resources/weights/cp856.gzip new file mode 100644 index 000000000..28e1020f5 Binary files /dev/null and b/libs/charamel/resources/weights/cp856.gzip differ diff --git a/libs/charamel/resources/weights/cp857.gzip b/libs/charamel/resources/weights/cp857.gzip new file mode 100644 index 000000000..55bba8210 Binary files /dev/null and b/libs/charamel/resources/weights/cp857.gzip differ diff --git a/libs/charamel/resources/weights/cp858.gzip b/libs/charamel/resources/weights/cp858.gzip new file mode 100644 index 000000000..8f279d169 Binary files /dev/null and b/libs/charamel/resources/weights/cp858.gzip differ diff --git a/libs/charamel/resources/weights/cp860.gzip b/libs/charamel/resources/weights/cp860.gzip new file mode 100644 index 000000000..0b0914d17 Binary files /dev/null and b/libs/charamel/resources/weights/cp860.gzip differ diff --git a/libs/charamel/resources/weights/cp861.gzip b/libs/charamel/resources/weights/cp861.gzip new file mode 100644 index 000000000..2875d51d1 Binary files /dev/null and b/libs/charamel/resources/weights/cp861.gzip differ diff --git a/libs/charamel/resources/weights/cp862.gzip b/libs/charamel/resources/weights/cp862.gzip new file mode 100644 index 000000000..963f016b5 Binary files /dev/null and b/libs/charamel/resources/weights/cp862.gzip differ diff --git a/libs/charamel/resources/weights/cp863.gzip b/libs/charamel/resources/weights/cp863.gzip new file mode 100644 index 000000000..2ada2067d Binary files /dev/null and b/libs/charamel/resources/weights/cp863.gzip differ diff --git a/libs/charamel/resources/weights/cp864.gzip b/libs/charamel/resources/weights/cp864.gzip new file mode 100644 index 000000000..b6c0f573f Binary files /dev/null and b/libs/charamel/resources/weights/cp864.gzip differ diff --git a/libs/charamel/resources/weights/cp865.gzip 
b/libs/charamel/resources/weights/cp865.gzip new file mode 100644 index 000000000..f8c3f1e57 Binary files /dev/null and b/libs/charamel/resources/weights/cp865.gzip differ diff --git a/libs/charamel/resources/weights/cp866.gzip b/libs/charamel/resources/weights/cp866.gzip new file mode 100644 index 000000000..82fe8399e Binary files /dev/null and b/libs/charamel/resources/weights/cp866.gzip differ diff --git a/libs/charamel/resources/weights/cp869.gzip b/libs/charamel/resources/weights/cp869.gzip new file mode 100644 index 000000000..52d2ec86c Binary files /dev/null and b/libs/charamel/resources/weights/cp869.gzip differ diff --git a/libs/charamel/resources/weights/cp874.gzip b/libs/charamel/resources/weights/cp874.gzip new file mode 100644 index 000000000..e609ca582 Binary files /dev/null and b/libs/charamel/resources/weights/cp874.gzip differ diff --git a/libs/charamel/resources/weights/cp875.gzip b/libs/charamel/resources/weights/cp875.gzip new file mode 100644 index 000000000..75846231b Binary files /dev/null and b/libs/charamel/resources/weights/cp875.gzip differ diff --git a/libs/charamel/resources/weights/cp932.gzip b/libs/charamel/resources/weights/cp932.gzip new file mode 100644 index 000000000..ac18fe2af Binary files /dev/null and b/libs/charamel/resources/weights/cp932.gzip differ diff --git a/libs/charamel/resources/weights/cp949.gzip b/libs/charamel/resources/weights/cp949.gzip new file mode 100644 index 000000000..1882c6a49 Binary files /dev/null and b/libs/charamel/resources/weights/cp949.gzip differ diff --git a/libs/charamel/resources/weights/cp950.gzip b/libs/charamel/resources/weights/cp950.gzip new file mode 100644 index 000000000..04188859e Binary files /dev/null and b/libs/charamel/resources/weights/cp950.gzip differ diff --git a/libs/charamel/resources/weights/euc_jis_2004.gzip b/libs/charamel/resources/weights/euc_jis_2004.gzip new file mode 100644 index 000000000..21bda22a3 Binary files /dev/null and 
b/libs/charamel/resources/weights/euc_jis_2004.gzip differ diff --git a/libs/charamel/resources/weights/euc_jisx0213.gzip b/libs/charamel/resources/weights/euc_jisx0213.gzip new file mode 100644 index 000000000..26eb21868 Binary files /dev/null and b/libs/charamel/resources/weights/euc_jisx0213.gzip differ diff --git a/libs/charamel/resources/weights/euc_jp.gzip b/libs/charamel/resources/weights/euc_jp.gzip new file mode 100644 index 000000000..7a8ab341f Binary files /dev/null and b/libs/charamel/resources/weights/euc_jp.gzip differ diff --git a/libs/charamel/resources/weights/euc_kr.gzip b/libs/charamel/resources/weights/euc_kr.gzip new file mode 100644 index 000000000..2de76ddf2 Binary files /dev/null and b/libs/charamel/resources/weights/euc_kr.gzip differ diff --git a/libs/charamel/resources/weights/gb18030.gzip b/libs/charamel/resources/weights/gb18030.gzip new file mode 100644 index 000000000..5b714c3d7 Binary files /dev/null and b/libs/charamel/resources/weights/gb18030.gzip differ diff --git a/libs/charamel/resources/weights/gb2312.gzip b/libs/charamel/resources/weights/gb2312.gzip new file mode 100644 index 000000000..a671844be Binary files /dev/null and b/libs/charamel/resources/weights/gb2312.gzip differ diff --git a/libs/charamel/resources/weights/gbk.gzip b/libs/charamel/resources/weights/gbk.gzip new file mode 100644 index 000000000..86d2d0b77 Binary files /dev/null and b/libs/charamel/resources/weights/gbk.gzip differ diff --git a/libs/charamel/resources/weights/hz.gzip b/libs/charamel/resources/weights/hz.gzip new file mode 100644 index 000000000..8d924f6c0 Binary files /dev/null and b/libs/charamel/resources/weights/hz.gzip differ diff --git a/libs/charamel/resources/weights/iso2022_jp.gzip b/libs/charamel/resources/weights/iso2022_jp.gzip new file mode 100644 index 000000000..edeef384b Binary files /dev/null and b/libs/charamel/resources/weights/iso2022_jp.gzip differ diff --git a/libs/charamel/resources/weights/iso2022_jp_1.gzip 
b/libs/charamel/resources/weights/iso2022_jp_1.gzip new file mode 100644 index 000000000..f37a21397 Binary files /dev/null and b/libs/charamel/resources/weights/iso2022_jp_1.gzip differ diff --git a/libs/charamel/resources/weights/iso2022_jp_2.gzip b/libs/charamel/resources/weights/iso2022_jp_2.gzip new file mode 100644 index 000000000..b19b26a86 Binary files /dev/null and b/libs/charamel/resources/weights/iso2022_jp_2.gzip differ diff --git a/libs/charamel/resources/weights/iso2022_jp_2004.gzip b/libs/charamel/resources/weights/iso2022_jp_2004.gzip new file mode 100644 index 000000000..62c9c3e48 Binary files /dev/null and b/libs/charamel/resources/weights/iso2022_jp_2004.gzip differ diff --git a/libs/charamel/resources/weights/iso2022_jp_3.gzip b/libs/charamel/resources/weights/iso2022_jp_3.gzip new file mode 100644 index 000000000..ac3de65e8 Binary files /dev/null and b/libs/charamel/resources/weights/iso2022_jp_3.gzip differ diff --git a/libs/charamel/resources/weights/iso2022_jp_ext.gzip b/libs/charamel/resources/weights/iso2022_jp_ext.gzip new file mode 100644 index 000000000..a10b13048 Binary files /dev/null and b/libs/charamel/resources/weights/iso2022_jp_ext.gzip differ diff --git a/libs/charamel/resources/weights/iso2022_kr.gzip b/libs/charamel/resources/weights/iso2022_kr.gzip new file mode 100644 index 000000000..8a2d5e5c3 Binary files /dev/null and b/libs/charamel/resources/weights/iso2022_kr.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_10.gzip b/libs/charamel/resources/weights/iso8859_10.gzip new file mode 100644 index 000000000..1caf6ccb2 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_10.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_11.gzip b/libs/charamel/resources/weights/iso8859_11.gzip new file mode 100644 index 000000000..9d068f3e1 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_11.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_13.gzip 
b/libs/charamel/resources/weights/iso8859_13.gzip new file mode 100644 index 000000000..69fb36144 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_13.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_14.gzip b/libs/charamel/resources/weights/iso8859_14.gzip new file mode 100644 index 000000000..decd39764 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_14.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_15.gzip b/libs/charamel/resources/weights/iso8859_15.gzip new file mode 100644 index 000000000..3dd65041b Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_15.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_16.gzip b/libs/charamel/resources/weights/iso8859_16.gzip new file mode 100644 index 000000000..36f6d4874 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_16.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_2.gzip b/libs/charamel/resources/weights/iso8859_2.gzip new file mode 100644 index 000000000..c122280f1 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_2.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_3.gzip b/libs/charamel/resources/weights/iso8859_3.gzip new file mode 100644 index 000000000..1aac5dd11 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_3.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_4.gzip b/libs/charamel/resources/weights/iso8859_4.gzip new file mode 100644 index 000000000..3a26bac13 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_4.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_5.gzip b/libs/charamel/resources/weights/iso8859_5.gzip new file mode 100644 index 000000000..251b50988 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_5.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_6.gzip b/libs/charamel/resources/weights/iso8859_6.gzip new file mode 
100644 index 000000000..0013b6425 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_6.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_7.gzip b/libs/charamel/resources/weights/iso8859_7.gzip new file mode 100644 index 000000000..7bf14906f Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_7.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_8.gzip b/libs/charamel/resources/weights/iso8859_8.gzip new file mode 100644 index 000000000..9bb4c3120 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_8.gzip differ diff --git a/libs/charamel/resources/weights/iso8859_9.gzip b/libs/charamel/resources/weights/iso8859_9.gzip new file mode 100644 index 000000000..d176af958 Binary files /dev/null and b/libs/charamel/resources/weights/iso8859_9.gzip differ diff --git a/libs/charamel/resources/weights/johab.gzip b/libs/charamel/resources/weights/johab.gzip new file mode 100644 index 000000000..c669f0f78 Binary files /dev/null and b/libs/charamel/resources/weights/johab.gzip differ diff --git a/libs/charamel/resources/weights/koi8_r.gzip b/libs/charamel/resources/weights/koi8_r.gzip new file mode 100644 index 000000000..31a59cbac Binary files /dev/null and b/libs/charamel/resources/weights/koi8_r.gzip differ diff --git a/libs/charamel/resources/weights/koi8_t.gzip b/libs/charamel/resources/weights/koi8_t.gzip new file mode 100644 index 000000000..2977f2602 Binary files /dev/null and b/libs/charamel/resources/weights/koi8_t.gzip differ diff --git a/libs/charamel/resources/weights/koi8_u.gzip b/libs/charamel/resources/weights/koi8_u.gzip new file mode 100644 index 000000000..c12c7f634 Binary files /dev/null and b/libs/charamel/resources/weights/koi8_u.gzip differ diff --git a/libs/charamel/resources/weights/kz1048.gzip b/libs/charamel/resources/weights/kz1048.gzip new file mode 100644 index 000000000..4ad027fc6 Binary files /dev/null and b/libs/charamel/resources/weights/kz1048.gzip differ diff --git 
a/libs/charamel/resources/weights/latin_1.gzip b/libs/charamel/resources/weights/latin_1.gzip new file mode 100644 index 000000000..3cf042fbe Binary files /dev/null and b/libs/charamel/resources/weights/latin_1.gzip differ diff --git a/libs/charamel/resources/weights/mac_cyrillic.gzip b/libs/charamel/resources/weights/mac_cyrillic.gzip new file mode 100644 index 000000000..a71344044 Binary files /dev/null and b/libs/charamel/resources/weights/mac_cyrillic.gzip differ diff --git a/libs/charamel/resources/weights/mac_greek.gzip b/libs/charamel/resources/weights/mac_greek.gzip new file mode 100644 index 000000000..34a1a8275 Binary files /dev/null and b/libs/charamel/resources/weights/mac_greek.gzip differ diff --git a/libs/charamel/resources/weights/mac_iceland.gzip b/libs/charamel/resources/weights/mac_iceland.gzip new file mode 100644 index 000000000..9bdd1accd Binary files /dev/null and b/libs/charamel/resources/weights/mac_iceland.gzip differ diff --git a/libs/charamel/resources/weights/mac_latin2.gzip b/libs/charamel/resources/weights/mac_latin2.gzip new file mode 100644 index 000000000..9771a6956 Binary files /dev/null and b/libs/charamel/resources/weights/mac_latin2.gzip differ diff --git a/libs/charamel/resources/weights/mac_roman.gzip b/libs/charamel/resources/weights/mac_roman.gzip new file mode 100644 index 000000000..cbe6140d0 Binary files /dev/null and b/libs/charamel/resources/weights/mac_roman.gzip differ diff --git a/libs/charamel/resources/weights/mac_turkish.gzip b/libs/charamel/resources/weights/mac_turkish.gzip new file mode 100644 index 000000000..d0ed3d730 Binary files /dev/null and b/libs/charamel/resources/weights/mac_turkish.gzip differ diff --git a/libs/charamel/resources/weights/ptcp154.gzip b/libs/charamel/resources/weights/ptcp154.gzip new file mode 100644 index 000000000..23605f00b Binary files /dev/null and b/libs/charamel/resources/weights/ptcp154.gzip differ diff --git a/libs/charamel/resources/weights/shift_jis.gzip 
b/libs/charamel/resources/weights/shift_jis.gzip new file mode 100644 index 000000000..713075b49 Binary files /dev/null and b/libs/charamel/resources/weights/shift_jis.gzip differ diff --git a/libs/charamel/resources/weights/shift_jis_2004.gzip b/libs/charamel/resources/weights/shift_jis_2004.gzip new file mode 100644 index 000000000..afc68af72 Binary files /dev/null and b/libs/charamel/resources/weights/shift_jis_2004.gzip differ diff --git a/libs/charamel/resources/weights/shift_jisx0213.gzip b/libs/charamel/resources/weights/shift_jisx0213.gzip new file mode 100644 index 000000000..aa3aa32f7 Binary files /dev/null and b/libs/charamel/resources/weights/shift_jisx0213.gzip differ diff --git a/libs/charamel/resources/weights/tis_620.gzip b/libs/charamel/resources/weights/tis_620.gzip new file mode 100644 index 000000000..bebae49da Binary files /dev/null and b/libs/charamel/resources/weights/tis_620.gzip differ diff --git a/libs/charamel/resources/weights/utf_16.gzip b/libs/charamel/resources/weights/utf_16.gzip new file mode 100644 index 000000000..3ac44b14f Binary files /dev/null and b/libs/charamel/resources/weights/utf_16.gzip differ diff --git a/libs/charamel/resources/weights/utf_16_be.gzip b/libs/charamel/resources/weights/utf_16_be.gzip new file mode 100644 index 000000000..68e8024c1 Binary files /dev/null and b/libs/charamel/resources/weights/utf_16_be.gzip differ diff --git a/libs/charamel/resources/weights/utf_16_le.gzip b/libs/charamel/resources/weights/utf_16_le.gzip new file mode 100644 index 000000000..4790b3a6e Binary files /dev/null and b/libs/charamel/resources/weights/utf_16_le.gzip differ diff --git a/libs/charamel/resources/weights/utf_32.gzip b/libs/charamel/resources/weights/utf_32.gzip new file mode 100644 index 000000000..4599cac24 Binary files /dev/null and b/libs/charamel/resources/weights/utf_32.gzip differ diff --git a/libs/charamel/resources/weights/utf_32_be.gzip b/libs/charamel/resources/weights/utf_32_be.gzip new file mode 100644 
index 000000000..b70600e61 Binary files /dev/null and b/libs/charamel/resources/weights/utf_32_be.gzip differ diff --git a/libs/charamel/resources/weights/utf_32_le.gzip b/libs/charamel/resources/weights/utf_32_le.gzip new file mode 100644 index 000000000..4ab2e68a5 Binary files /dev/null and b/libs/charamel/resources/weights/utf_32_le.gzip differ diff --git a/libs/charamel/resources/weights/utf_7.gzip b/libs/charamel/resources/weights/utf_7.gzip new file mode 100644 index 000000000..13a4337ee Binary files /dev/null and b/libs/charamel/resources/weights/utf_7.gzip differ diff --git a/libs/charamel/resources/weights/utf_8.gzip b/libs/charamel/resources/weights/utf_8.gzip new file mode 100644 index 000000000..66966287e Binary files /dev/null and b/libs/charamel/resources/weights/utf_8.gzip differ diff --git a/libs/charamel/resources/weights/utf_8_sig.gzip b/libs/charamel/resources/weights/utf_8_sig.gzip new file mode 100644 index 000000000..78567848d Binary files /dev/null and b/libs/charamel/resources/weights/utf_8_sig.gzip differ