You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
bazarr/libs/charamel/resources/__init__.py

73 lines
1.9 KiB

"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import gzip
import pathlib
import struct
from typing import Any, Dict, List, Sequence
from charamel.encoding import Encoding
# Absolute path of the directory containing this module; the gzip-compressed
# feature and bias files are read from here.
RESOURCE_DIRECTORY = pathlib.Path(__file__).parent.absolute()
# Sub-directory from which the per-encoding weight files are read.
WEIGHT_DIRECTORY = RESOURCE_DIRECTORY.joinpath('weights')
def _unpack(file: pathlib.Path, pattern: str) -> List[Any]:
    """
    Read struct-packed records from a gzip-compressed file

    Args:
        file: Gzip-compressed file holding back-to-back struct-packed records
        pattern: Struct format string describing a single record

    Returns:
        List with the first field of every record, in file order
    """
    # Pull the whole decompressed payload into memory before decoding it
    with gzip.open(file, 'rb') as stream:
        payload = stream.read()
    records = struct.iter_unpack(pattern, payload)
    # Each record is a tuple; only its first field is kept
    return [record[0] for record in records]
def load_features() -> Dict[int, int]:
    """
    Build the lookup table from byte-level features to weight indices

    Returns:
        Mapping from each feature value stored in `features.gzip` to its
        position in that file, which is its index in the weight matrix
    """
    # Features are stored as big-endian unsigned 16-bit integers
    feature_values = _unpack(RESOURCE_DIRECTORY / 'features.gzip', pattern='>H')
    positions = range(len(feature_values))
    return dict(zip(feature_values, positions))
def load_biases(encodings: Sequence[Encoding]) -> Dict[Encoding, float]:
    """
    Read the linear model bias of every requested encoding

    Args:
        encodings: Encodings whose biases are needed

    Returns:
        Mapping from each requested encoding to its bias value
    """
    with gzip.open(RESOURCE_DIRECTORY / 'biases.gzip', 'rb') as stream:
        # Every line holds two whitespace-separated tokens: encoding name and bias
        rows = (raw_line.decode().split() for raw_line in stream)
        known_biases = {name: float(value) for name, value in rows}
    # Keep only the requested encodings, keyed by the objects the caller passed
    return {encoding: known_biases[encoding] for encoding in encodings}
def load_weights(encodings: Sequence[Encoding]) -> Dict[Encoding, List[float]]:
    """
    Load linear model weight values for given encodings

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their weights, each list read from the
        `<encoding>.gzip` file inside the weight directory
    """
    # Weights are stored as big-endian half-precision floats ('>e')
    return {
        encoding: _unpack(WEIGHT_DIRECTORY / f'{encoding}.gzip', pattern='>e')
        for encoding in encodings
    }