# bazarr/libs/charamel/detector.py

"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Licensed under Apache 2.0
"""
import itertools
import math
from typing import Dict, List, Optional, Sequence, Set, Tuple

from charamel.encoding import Encoding
from charamel.resources import load_biases, load_features, load_weights


def _get_features(content: bytes) -> Set[int]:
    """
    Extract unique byte uni-grams and bi-grams

    Args:
        content: Encoded text

    Returns:
        Set of integers that represent byte n-grams
    """
    pairs = zip(content, itertools.islice(content, 1, None))
    return set(content).union(x * 256 + y for x, y in pairs)


def _apply_sigmoid(value: float) -> float:
    """
    Apply sigmoid function to given value
    """
    return 1 / (1 + math.exp(-value))


class Detector:
    """
    Universal encoding detector

    Every supported encoding gets a score made of its bias plus the weights
    of the known byte n-gram features present in the content; scores are
    turned into confidences with a sigmoid.
    """

    def __init__(
        self,
        encodings: Sequence[Encoding] = tuple(Encoding),
        min_confidence: float = 0.0,
    ):
        """
        Build a detector restricted to the given encodings

        Args:
            encodings: Encodings that will be supported by this Detector instance,
                fewer encodings lead to faster runtime
            min_confidence: Minimum confidence threshold for encodings

        Raises:
            ValueError: If `encodings` is empty or `min_confidence` is outside
                of range [0, 1]

        Example:
            >>> detector = Detector(
            ...     encodings=[Encoding.UTF_8, Encoding.BIG_5],
            ...     min_confidence=0.7,
            ... )
        """
        if not encodings:
            raise ValueError('No encodings specified')

        confidence_in_range = 0.0 <= min_confidence <= 1.0
        if not confidence_in_range:
            raise ValueError('min_confidence must be in range [0, 1]')

        self._min_confidence = min_confidence
        self._features = load_features()
        self._weights = load_weights(encodings)
        self._biases = load_biases(encodings)

    def _score(self, content: bytes) -> Dict[Encoding, float]:
        """
        Compute a raw score per encoding for the content

        Args:
            content: Encoded text

        Returns:
            Real-valued score for each encoding
        """
        # Only n-grams present in the feature table contribute to the score.
        known = _get_features(content).intersection(self._features)
        positions = [self._features[ngram] for ngram in known]

        # Work on a copy so the stored biases are never modified.
        totals = self._biases.copy()
        for encoding, row in self._weights.items():
            totals[encoding] += sum(row[position] for position in positions)
        return totals

    def detect(self, content: bytes) -> Optional[Encoding]:
        """
        Pick the single most probable encoding for given byte content

        Args:
            content: Encoded text

        Returns:
            Encoding or `None` if not confident enough

        Example:
            >>> detector = Detector()
            >>> detector.detect(b'\xc4\xe3\xba\xc3')
            <Encoding.GB_K: 'gbk'>
        """
        scores = self._score(content)
        if not scores:
            return None

        winner, raw_score = max(scores.items(), key=lambda pair: pair[1])
        if _apply_sigmoid(raw_score) < self._min_confidence:
            return None
        return winner

    def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]:
        """
        Rank the `top` most probable encodings together with their confidences

        Args:
            content: Encoded text
            top: How many of the most likely encodings to return

        Returns:
            Up to `top` (encoding, confidence) pairs ordered from the highest
            score to the lowest, without the pairs whose confidence is below
            the minimum confidence threshold

        Example:
            >>> detector = Detector()
            >>> detector.probe(b'\xc4\xe3\xba\xc3')
            [(<Encoding.GB_K: 'gbk'>, 0.6940633812304486),
             (<Encoding.GB_18030: 'gb18030'>, 0.6886364021582343),
             (<Encoding.GB_2312: 'gb2312'>, 0.6707061223726806)]
        """
        ranked = sorted(
            self._score(content).items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        results = []
        for encoding, raw_score in ranked[:top]:
            confidence = _apply_sigmoid(raw_score)
            if confidence >= self._min_confidence:
                results.append((encoding, confidence))
        return results
Replaced chardet by charamel to improve character encoding detection. 4 years ago
			`"""`
			`🌏 Charamel: Truly Universal Encoding Detection in Python 🌎`
			`~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`

			`Licensed under Apache 2.0`
			`"""`
			`import itertools`
			`import math`
			`from typing import Dict, List, Optional, Sequence, Set, Tuple`

			`from charamel.encoding import Encoding`
			`from charamel.resources import load_biases, load_features, load_weights`


			`def _get_features(content: bytes) -> Set[int]:`
			`"""`
			`Extract unique byte uni-grams and bi-grams`

			`Args:`
			`content: Encoded text`

			`Returns:`
			`Set of integers that represent byte n-grams`
			`"""`
			`pairs = zip(content, itertools.islice(content, 1, None))`
			`return set(content).union(x * 256 + y for x, y in pairs)`


			`def _apply_sigmoid(value: float) -> float:`
			`"""`
			`Apply sigmoid function to given value`
			`"""`
			`return 1 / (1 + math.exp(-value))`


			`class Detector:`
			`"""`
			`Universal encoding detector`
			`"""`

			`def __init__(`
			`self,`
			`encodings: Sequence[Encoding] = tuple(Encoding),`
			`min_confidence: float = 0.0,`
			`):`
			`"""`
			`Create universal encoding detector for given encodings`

			`Args:`
			`encodings: Encodings that will be supported by this Detector instance,`
			`less encodings lead to faster runtime`
			`min_confidence: Minimum confidence threshold for encodings`

			`Example:`
			`>>> detector = Detector(`
			`... encodings=[Encoding.UTF_8, Encoding.BIG_5],`
			`... min_confidence=0.7,`
			`... )`
			`"""`
			`if not encodings:`
			`raise ValueError('No encodings specified')`

			`if not 0.0 <= min_confidence <= 1.0:`
			`raise ValueError('min_confidence must be in range [0, 1]')`

			`self._features = load_features()`
			`self._weights = load_weights(encodings)`
			`self._biases = load_biases(encodings)`
			`self._min_confidence = min_confidence`

			`def _score(self, content: bytes) -> Dict[Encoding, float]:`
			`"""`
			`Compute how likely each encoding is able to decode the content`

			`Args:`
			`content: Encoded text`

			`Returns:`
			`Real-valued score for each encoding`
			`"""`
			`scores = self._biases.copy()`
			`features = _get_features(content).intersection(self._features)`
			`indices = [self._features[feature] for feature in features]`
			`for encoding, weights in self._weights.items():`
			`scores[encoding] += sum(weights[index] for index in indices)`
			`return scores`

			`def detect(self, content: bytes) -> Optional[Encoding]:`
			`"""`
			`Detect the most probable encoding for given byte content`

			`Args:`
			`content: Encoded text`

			`Returns:`
			Encoding or `None` if not confident enough

			`Example:`
			`>>> detector = Detector()`
			`>>> detector.detect(b'\xc4\xe3\xba\xc3')`
			`<Encoding.GB_K: 'gbk'>`
			`"""`
			`scores = self._score(content)`
			`if scores:`
			`encoding, score = max(scores.items(), key=lambda x: x[1])`
			`if _apply_sigmoid(score) >= self._min_confidence:`
			`return encoding`
			`return None`

			`def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]:`
			`"""`
			Detect `top` probable encodings with confidences

			`Args:`
			`content: Encoded text`
			`top: How many of the most likely encodings to return`

			`Example:`
			`>>> detector = Detector()`
			`>>> detector.probe(b'\xc4\xe3\xba\xc3')`
			`[(<Encoding.GB_K: 'gbk'>, 0.6940633812304486),`
			`(<Encoding.GB_18030: 'gb18030'>, 0.6886364021582343),`
			`(<Encoding.GB_2312: 'gb2312'>, 0.6707061223726806)]`
			`"""`
			`scores = sorted(self._score(content).items(), key=lambda x: x[1], reverse=True)`
			`confidences = [`
			`(encoding, _apply_sigmoid(score)) for encoding, score in scores[:top]`
			`]`
			`return [`
			`(encoding, confidence)`
			`for encoding, confidence in confidences`
			`if confidence >= self._min_confidence`
			`]`