You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
73 lines
1.9 KiB
73 lines
1.9 KiB
4 years ago
|
"""
|
||
|
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
|
||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||
|
|
||
|
Licensed under Apache 2.0
|
||
|
"""
|
||
|
import gzip
|
||
|
import pathlib
|
||
|
import struct
|
||
|
from typing import Any, Dict, List, Sequence
|
||
|
|
||
|
from charamel.encoding import Encoding
|
||
|
|
||
|
# Absolute path of the directory that contains this module
RESOURCE_DIRECTORY = pathlib.Path(__file__).parent.absolute()
# Path of the 'weights' entry inside the resource directory
WEIGHT_DIRECTORY = RESOURCE_DIRECTORY.joinpath('weights')
|
||
|
|
||
|
|
||
|
def _unpack(file: pathlib.Path, pattern: str) -> List[Any]:
|
||
|
"""
|
||
|
Unpack struct values from file
|
||
|
|
||
|
Args:
|
||
|
file: File that stores struct-packed values
|
||
|
pattern: Struct pattern
|
||
|
|
||
|
Returns:
|
||
|
List of unpacked values
|
||
|
"""
|
||
|
with gzip.open(file, 'rb') as data:
|
||
|
return [values[0] for values in struct.iter_unpack(pattern, data.read())]
|
||
|
|
||
|
|
||
|
def load_features() -> Dict[int, int]:
    """
    Build the lookup table of byte-level features

    Features are read from `features.gzip` in the resource directory as
    big-endian unsigned 16-bit integers.

    Returns:
        Mapping from each feature to its position in the file (described by
        the original documentation as its index in the weight matrix)
    """
    feature_path = RESOURCE_DIRECTORY / 'features.gzip'
    index_by_feature: Dict[int, int] = {}
    for position, feature in enumerate(_unpack(feature_path, pattern='>H')):
        index_by_feature[feature] = position
    return index_by_feature
|
||
|
|
||
|
|
||
|
def load_biases(encodings: Sequence[Encoding]) -> Dict[Encoding, float]:
    """
    Read linear model bias values for the requested encodings

    Every line of `biases.gzip` is expected to hold exactly two
    whitespace-separated tokens: an encoding name and its bias.

    Args:
        encodings: Encodings to look up

    Returns:
        Mapping from each requested encoding to its bias

    Raises:
        ValueError: If a line does not split into exactly two tokens or the
            bias is not a valid float
        KeyError: If a requested encoding has no entry in the file
    """
    bias_path = RESOURCE_DIRECTORY / 'biases.gzip'
    with gzip.open(bias_path, 'rb') as stream:
        # The generator is consumed inside the `with`, while the file is still open
        records = (raw.decode().split() for raw in stream)
        known = {name: float(value) for name, value in records}

    return {encoding: known[encoding] for encoding in encodings}
|
||
|
|
||
|
|
||
|
def load_weights(encodings: Sequence[Encoding]) -> Dict[Encoding, List[float]]:
    """
    Load linear model weight values for given encodings

    Weights for each encoding are read from the file `<encoding>.gzip` in the
    weight directory and decoded as big-endian half-precision floats.

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their lists of weights
    """
    # NOTE(review): the file name depends on how `Encoding` renders inside an
    # f-string; confirm it formats as the bare encoding name on every supported
    # Python version.
    return {
        encoding: _unpack(WEIGHT_DIRECTORY / f'{encoding}.gzip', pattern='>e')
        for encoding in encodings
    }
|