parent efafe4a126
commit 29671a4aff
charamel/__init__.py
@@ -0,0 +1,20 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Usage:

>>> import charamel
>>> detector = charamel.Detector()
>>> content = b'El espa\xf1ol o castellano del lat\xedn hablado'
>>> encoding = detector.detect(content)
>>> encoding
<Encoding.ISO_8859_14: 'iso8859_14'>
>>> content.decode(encoding)
'El español o castellano del latín hablado'

Licensed under Apache 2.0
"""
from .detector import Detector  # noqa: F401
from .encoding import Encoding  # noqa: F401

__version__ = '1.0.0'
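A minimal round-trip sketch expanding on the doctest above (the 0.5 threshold and the UTF-8 fallback are illustrative choices, not library defaults):

    import charamel

    # Ask for at least 50% confidence; detect() returns None below the
    # threshold, so fall back to UTF-8 in that case.
    detector = charamel.Detector(min_confidence=0.5)
    raw = b'El espa\xf1ol o castellano del lat\xedn hablado'
    encoding = detector.detect(raw)
    text = raw.decode(encoding if encoding is not None else charamel.Encoding.UTF_8)
    print(text)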
charamel/detector.py
@@ -0,0 +1,133 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Licensed under Apache 2.0
"""
import itertools
import math
from typing import Dict, List, Optional, Sequence, Set, Tuple

from charamel.encoding import Encoding
from charamel.resources import load_biases, load_features, load_weights


def _get_features(content: bytes) -> Set[int]:
    """
    Extract unique byte uni-grams and bi-grams

    Args:
        content: Encoded text

    Returns:
        Set of integers that represent byte n-grams
    """
    pairs = zip(content, itertools.islice(content, 1, None))
    return set(content).union(x * 256 + y for x, y in pairs)
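# Illustrative note (not in the original source): for content = b'ab',
# set(content) gives the uni-grams {97, 98}, and the single pair (97, 98)
# maps to 97 * 256 + 98 == 24930, so _get_features(b'ab') == {97, 98, 24930}.
# Packing a bi-gram as x * 256 + y keeps every n-gram a single integer key.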

def _apply_sigmoid(value: float) -> float:
    """
    Apply sigmoid function to given value
    """
    return 1 / (1 + math.exp(-value))


class Detector:
    """
    Universal encoding detector
    """

    def __init__(
        self,
        encodings: Sequence[Encoding] = tuple(Encoding),
        min_confidence: float = 0.0,
    ):
        """
        Create universal encoding detector for given encodings

        Args:
            encodings: Encodings that will be supported by this Detector instance;
                fewer encodings lead to faster runtime
            min_confidence: Minimum confidence threshold for encodings

        Example:
            >>> detector = Detector(
            ...     encodings=[Encoding.UTF_8, Encoding.BIG_5],
            ...     min_confidence=0.7,
            ... )
        """
        if not encodings:
            raise ValueError('No encodings specified')

        if not 0.0 <= min_confidence <= 1.0:
            raise ValueError('min_confidence must be in range [0, 1]')

        self._features = load_features()
        self._weights = load_weights(encodings)
        self._biases = load_biases(encodings)
        self._min_confidence = min_confidence
    def _score(self, content: bytes) -> Dict[Encoding, float]:
        """
        Compute how likely each encoding is to correctly decode the content

        Args:
            content: Encoded text

        Returns:
            Real-valued score for each encoding
        """
        scores = self._biases.copy()
        features = _get_features(content).intersection(self._features)
        indices = [self._features[feature] for feature in features]
        for encoding, weights in self._weights.items():
            scores[encoding] += sum(weights[index] for index in indices)
        return scores
    def detect(self, content: bytes) -> Optional[Encoding]:
        """
        Detect the most probable encoding for given byte content

        Args:
            content: Encoded text

        Returns:
            Encoding or `None` if not confident enough

        Example:
            >>> detector = Detector()
            >>> detector.detect(b'\xc4\xe3\xba\xc3')
            <Encoding.GB_K: 'gbk'>
        """
        scores = self._score(content)
        if scores:
            encoding, score = max(scores.items(), key=lambda x: x[1])
            if _apply_sigmoid(score) >= self._min_confidence:
                return encoding
        return None

    def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]:
        """
        Detect the `top` most probable encodings and their confidences

        Args:
            content: Encoded text
            top: How many of the most likely encodings to return

        Returns:
            Up to `top` encodings with confidence at least `min_confidence`,
            sorted by descending confidence

        Example:
            >>> detector = Detector()
            >>> detector.probe(b'\xc4\xe3\xba\xc3')
            [(<Encoding.GB_K: 'gbk'>, 0.6940633812304486),
             (<Encoding.GB_18030: 'gb18030'>, 0.6886364021582343),
             (<Encoding.GB_2312: 'gb2312'>, 0.6707061223726806)]
        """
        scores = sorted(self._score(content).items(), key=lambda x: x[1], reverse=True)
        confidences = [
            (encoding, _apply_sigmoid(score)) for encoding, score in scores[:top]
        ]
        return [
            (encoding, confidence)
            for encoding, confidence in confidences
            if confidence >= self._min_confidence
        ]
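Taken together, Detector is in effect one logistic-regression classifier per encoding over binary byte n-gram features: each candidate encoding starts from its bias, adds the weights of the n-grams present in the input, and the sigmoid maps that raw score to a confidence. A self-contained toy version of the same arithmetic (the weights and bias below are invented for illustration, not taken from the shipped model):

    import itertools
    import math

    def get_features(content: bytes) -> set:
        # Same n-gram encoding as _get_features above
        pairs = zip(content, itertools.islice(content, 1, None))
        return set(content).union(x * 256 + y for x, y in pairs)

    # Hypothetical single-encoding model: n-gram key -> weight, plus a bias
    weights = {104: 0.2, 105: 0.3, 104 * 256 + 105: 0.9}  # 'h', 'i', 'hi'
    bias = -1.0

    score = bias + sum(weights.get(f, 0.0) for f in get_features(b'hi'))
    confidence = 1 / (1 + math.exp(-score))  # sigmoid, as in _apply_sigmoid
    print(round(score, 2), round(confidence, 3))  # 0.4 0.599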
charamel/encoding.py
@@ -0,0 +1,122 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Licensed under Apache 2.0
"""
import encodings.aliases
import enum


@enum.unique
class Encoding(str, enum.Enum):
    """
    Python character encodings
    """

    ASCII = 'ascii'
    BIG_5 = 'big5'
    BIG_5_HKSCS = 'big5hkscs'
    CP_037 = 'cp037'
    CP_273 = 'cp273'
    CP_424 = 'cp424'
    CP_437 = 'cp437'
    CP_500 = 'cp500'
    CP_720 = 'cp720'
    CP_737 = 'cp737'
    CP_775 = 'cp775'
    CP_850 = 'cp850'
    CP_852 = 'cp852'
    CP_855 = 'cp855'
    CP_856 = 'cp856'
    CP_857 = 'cp857'
    CP_858 = 'cp858'
    CP_860 = 'cp860'
    CP_861 = 'cp861'
    CP_862 = 'cp862'
    CP_863 = 'cp863'
    CP_864 = 'cp864'
    CP_865 = 'cp865'
    CP_866 = 'cp866'
    CP_869 = 'cp869'
    CP_874 = 'cp874'
    CP_875 = 'cp875'
    CP_932 = 'cp932'
    CP_949 = 'cp949'
    CP_950 = 'cp950'
    CP_1006 = 'cp1006'
    CP_1026 = 'cp1026'
    CP_1125 = 'cp1125'
    CP_1140 = 'cp1140'
    CP_1250 = 'cp1250'
    CP_1251 = 'cp1251'
    CP_1252 = 'cp1252'
    CP_1253 = 'cp1253'
    CP_1254 = 'cp1254'
    CP_1255 = 'cp1255'
    CP_1256 = 'cp1256'
    CP_1257 = 'cp1257'
    CP_1258 = 'cp1258'
    EUC_JP = 'euc_jp'
    EUC_JIS_2004 = 'euc_jis_2004'
    EUC_JIS_X_0213 = 'euc_jisx0213'
    EUC_KR = 'euc_kr'
    GB_2312 = 'gb2312'
    GB_K = 'gbk'
    GB_18030 = 'gb18030'
    HZ = 'hz'
    ISO_2022_JP = 'iso2022_jp'
    ISO_2022_JP_1 = 'iso2022_jp_1'
    ISO_2022_JP_2 = 'iso2022_jp_2'
    ISO_2022_JP_2004 = 'iso2022_jp_2004'
    ISO_2022_JP_3 = 'iso2022_jp_3'
    ISO_2022_JP_EXT = 'iso2022_jp_ext'
    ISO_2022_KR = 'iso2022_kr'
    LATIN_1 = 'latin_1'
    ISO_8859_2 = 'iso8859_2'
    ISO_8859_3 = 'iso8859_3'
    ISO_8859_4 = 'iso8859_4'
    ISO_8859_5 = 'iso8859_5'
    ISO_8859_6 = 'iso8859_6'
    ISO_8859_7 = 'iso8859_7'
    ISO_8859_8 = 'iso8859_8'
    ISO_8859_9 = 'iso8859_9'
    ISO_8859_10 = 'iso8859_10'
    ISO_8859_11 = 'iso8859_11'
    ISO_8859_13 = 'iso8859_13'
    ISO_8859_14 = 'iso8859_14'
    ISO_8859_15 = 'iso8859_15'
    ISO_8859_16 = 'iso8859_16'
    JOHAB = 'johab'
    KOI_8_R = 'koi8_r'
    KOI_8_T = 'koi8_t'
    KOI_8_U = 'koi8_u'
    KZ_1048 = 'kz1048'
    MAC_CYRILLIC = 'mac_cyrillic'
    MAC_GREEK = 'mac_greek'
    MAC_ICELAND = 'mac_iceland'
    MAC_LATIN_2 = 'mac_latin2'
    MAC_ROMAN = 'mac_roman'
    MAC_TURKISH = 'mac_turkish'
    PTCP_154 = 'ptcp154'
    SHIFT_JIS = 'shift_jis'
    SHIFT_JIS_2004 = 'shift_jis_2004'
    SHIFT_JIS_X_0213 = 'shift_jisx0213'
    TIS_620 = 'tis_620'
    UTF_32 = 'utf_32'
    UTF_32_BE = 'utf_32_be'
    UTF_32_LE = 'utf_32_le'
    UTF_16 = 'utf_16'
    UTF_16_BE = 'utf_16_be'
    UTF_16_LE = 'utf_16_le'
    UTF_7 = 'utf_7'
    UTF_8 = 'utf_8'
    UTF_8_SIG = 'utf_8_sig'

    @classmethod
    def _missing_(cls, value):
        normalized = encodings.normalize_encoding(value).lower()
        normalized = encodings.aliases.aliases.get(normalized, normalized)
        if value != normalized:
            return cls(normalized)
        return super()._missing_(value)
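Because of the `_missing_` hook, lookups are forgiving about spelling: any alias that Python's own codec machinery recognizes is normalized before the enum lookup fails. A quick illustration, relying only on the standard encodings.aliases table:

    from charamel.encoding import Encoding

    # 'UTF-8' normalizes to 'utf_8' via encodings.normalize_encoding();
    # 'latin1' resolves to 'latin_1' through encodings.aliases.aliases.
    assert Encoding('UTF-8') is Encoding.UTF_8
    assert Encoding('latin1') is Encoding.LATIN_1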
charamel/resources/__init__.py
@@ -0,0 +1,72 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Licensed under Apache 2.0
"""
import gzip
import pathlib
import struct
from typing import Any, Dict, List, Sequence

from charamel.encoding import Encoding

RESOURCE_DIRECTORY = pathlib.Path(__file__).parent.absolute()
WEIGHT_DIRECTORY = RESOURCE_DIRECTORY / 'weights'


def _unpack(file: pathlib.Path, pattern: str) -> List[Any]:
    """
    Unpack struct values from file

    Args:
        file: File that stores struct-packed values
        pattern: Struct pattern

    Returns:
        List of unpacked values
    """
    with gzip.open(file, 'rb') as data:
        return [values[0] for values in struct.iter_unpack(pattern, data.read())]
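# Illustrative note (not part of the original module): a file written with
#   gzip.open(path, 'wb').write(struct.pack('>3e', 0.25, -1.5, 0.75))
# round-trips through _unpack(path, pattern='>e') as [0.25, -1.5, 0.75].
# '>e' is a big-endian half-precision float; '>H' (used for the feature
# table below) is a big-endian unsigned 16-bit integer.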

def load_features() -> Dict[int, int]:
    """
    Load byte-level feature names and indices

    Returns:
        Mapping from features to their indices in weight matrix
    """
    features = _unpack(RESOURCE_DIRECTORY / 'features.gzip', pattern='>H')
    return {feature: index for index, feature in enumerate(features)}


def load_biases(encodings: Sequence[Encoding]) -> Dict[Encoding, float]:
    """
    Load linear model bias values for given encodings

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their biases
    """
    biases = {}
    with gzip.open(RESOURCE_DIRECTORY / 'biases.gzip', 'rb') as data:
        for line in data:
            encoding, bias = line.decode().split()
            biases[encoding] = float(bias)

    return {encoding: biases[encoding] for encoding in encodings}

def load_weights(encodings: Sequence[Encoding]) -> Dict[Encoding, List[float]]:
    """
    Load linear model feature weights for given encodings

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their feature weight vectors
    """
    weights = {}
    for encoding in encodings:
        weights[encoding] = _unpack(WEIGHT_DIRECTORY / f'{encoding}.gzip', pattern='>e')
    return weights
Binary resource files (charamel/resources/features.gzip, biases.gzip, and the per-encoding weights/*.gzip) not shown.