You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
134 lines
4.0 KiB
134 lines
4.0 KiB
"""
|
|
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
|
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
|
|
Licensed under Apache 2.0
|
|
"""
|
|
import itertools
|
|
import math
|
|
from typing import Dict, List, Optional, Sequence, Set, Tuple
|
|
|
|
from charamel.encoding import Encoding
|
|
from charamel.resources import load_biases, load_features, load_weights
|
|
|
|
|
|
def _get_features(content: bytes) -> Set[int]:
|
|
"""
|
|
Extract unique byte uni-grams and bi-grams
|
|
|
|
Args:
|
|
content: Encoded text
|
|
|
|
Returns:
|
|
Set of integers that represent byte n-grams
|
|
"""
|
|
pairs = zip(content, itertools.islice(content, 1, None))
|
|
return set(content).union(x * 256 + y for x, y in pairs)
|
|
|
|
|
|
def _apply_sigmoid(value: float) -> float:
|
|
"""
|
|
Apply sigmoid function to given value
|
|
"""
|
|
return 1 / (1 + math.exp(-value))
|
|
|
|
|
|
class Detector:
|
|
"""
|
|
Universal encoding detector
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
encodings: Sequence[Encoding] = tuple(Encoding),
|
|
min_confidence: float = 0.0,
|
|
):
|
|
"""
|
|
Create universal encoding detector for given encodings
|
|
|
|
Args:
|
|
encodings: Encodings that will be supported by this Detector instance,
|
|
less encodings lead to faster runtime
|
|
min_confidence: Minimum confidence threshold for encodings
|
|
|
|
Example:
|
|
>>> detector = Detector(
|
|
... encodings=[Encoding.UTF_8, Encoding.BIG_5],
|
|
... min_confidence=0.7,
|
|
... )
|
|
"""
|
|
if not encodings:
|
|
raise ValueError('No encodings specified')
|
|
|
|
if not 0.0 <= min_confidence <= 1.0:
|
|
raise ValueError('min_confidence must be in range [0, 1]')
|
|
|
|
self._features = load_features()
|
|
self._weights = load_weights(encodings)
|
|
self._biases = load_biases(encodings)
|
|
self._min_confidence = min_confidence
|
|
|
|
def _score(self, content: bytes) -> Dict[Encoding, float]:
|
|
"""
|
|
Compute how likely each encoding is able to decode the content
|
|
|
|
Args:
|
|
content: Encoded text
|
|
|
|
Returns:
|
|
Real-valued score for each encoding
|
|
"""
|
|
scores = self._biases.copy()
|
|
features = _get_features(content).intersection(self._features)
|
|
indices = [self._features[feature] for feature in features]
|
|
for encoding, weights in self._weights.items():
|
|
scores[encoding] += sum(weights[index] for index in indices)
|
|
return scores
|
|
|
|
def detect(self, content: bytes) -> Optional[Encoding]:
|
|
"""
|
|
Detect the most probable encoding for given byte content
|
|
|
|
Args:
|
|
content: Encoded text
|
|
|
|
Returns:
|
|
Encoding or `None` if not confident enough
|
|
|
|
Example:
|
|
>>> detector = Detector()
|
|
>>> detector.detect(b'\xc4\xe3\xba\xc3')
|
|
<Encoding.GB_K: 'gbk'>
|
|
"""
|
|
scores = self._score(content)
|
|
if scores:
|
|
encoding, score = max(scores.items(), key=lambda x: x[1])
|
|
if _apply_sigmoid(score) >= self._min_confidence:
|
|
return encoding
|
|
return None
|
|
|
|
def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]:
|
|
"""
|
|
Detect `top` probable encodings with confidences
|
|
|
|
Args:
|
|
content: Encoded text
|
|
top: How many of the most likely encodings to return
|
|
|
|
Example:
|
|
>>> detector = Detector()
|
|
>>> detector.probe(b'\xc4\xe3\xba\xc3')
|
|
[(<Encoding.GB_K: 'gbk'>, 0.6940633812304486),
|
|
(<Encoding.GB_18030: 'gb18030'>, 0.6886364021582343),
|
|
(<Encoding.GB_2312: 'gb2312'>, 0.6707061223726806)]
|
|
"""
|
|
scores = sorted(self._score(content).items(), key=lambda x: x[1], reverse=True)
|
|
confidences = [
|
|
(encoding, _apply_sigmoid(score)) for encoding, score in scores[:top]
|
|
]
|
|
return [
|
|
(encoding, confidence)
|
|
for encoding, confidence in confidences
|
|
if confidence >= self._min_confidence
|
|
]
|