Added experimental Python 3.11 support

pull/2182/head
morpheus65535 12 months ago
parent cd016840f9
commit c92d574bf2

@ -20,8 +20,8 @@ def check_python_version():
print("Python " + minimum_py3_str + " or greater required. " print("Python " + minimum_py3_str + " or greater required. "
"Current version is " + platform.python_version() + ". Please upgrade Python.") "Current version is " + platform.python_version() + ". Please upgrade Python.")
sys.exit(1) sys.exit(1)
elif int(python_version[0]) == 3 and int(python_version[1]) > 10: elif int(python_version[0]) == 3 and int(python_version[1]) > 11:
print("Python version greater than 3.10.x is unsupported. Current version is " + platform.python_version() + print("Python version greater than 3.11.x is unsupported. Current version is " + platform.python_version() +
". Keep in mind that even if it works, you're on your own.") ". Keep in mind that even if it works, you're on your own.")
elif (int(python_version[0]) == minimum_py3_tuple[0] and int(python_version[1]) < minimum_py3_tuple[1]) or \ elif (int(python_version[0]) == minimum_py3_tuple[0] and int(python_version[1]) < minimum_py3_tuple[1]) or \
(int(python_version[0]) != minimum_py3_tuple[0]): (int(python_version[0]) != minimum_py3_tuple[0]):

@ -7,7 +7,7 @@ import re
from guess_language import guess_language from guess_language import guess_language
from subliminal_patch import core from subliminal_patch import core
from subzero.language import Language from subzero.language import Language
from charamel import Detector from chardet import detect
from app.config import settings from app.config import settings
from constants import hi_regex from constants import hi_regex
@ -76,7 +76,12 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde
with open(subtitle_path, 'rb') as f: with open(subtitle_path, 'rb') as f:
text = f.read() text = f.read()
try: encoding = detect(text)['encoding']
if not encoding:
logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
"It's probably a binary file: " + subtitle_path)
continue
if 'UTF' in encoding:
text = text.decode('utf-8') text = text.decode('utf-8')
detected_language = guess_language(text) detected_language = guess_language(text)
# add simplified and traditional chinese detection # add simplified and traditional chinese detection
@ -86,27 +91,10 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde
".hant", ".big5", ".traditional"] ".hant", ".big5", ".traditional"]
if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(traditional_chinese)) or (str(subtitle_path).lower())[:-5] in traditional_chinese_fuzzy: if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(traditional_chinese)) or (str(subtitle_path).lower())[:-5] in traditional_chinese_fuzzy:
detected_language == 'zt' detected_language == 'zt'
except UnicodeDecodeError:
detector = Detector()
try:
guess = detector.detect(text)
except Exception:
logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
"It's probably a binary file: " + subtitle_path)
continue
else: else:
logging.debug('BAZARR detected encoding %r', guess) text = text.decode(encoding)
try:
text = text.decode(guess)
except Exception:
logging.debug(
"BAZARR skipping this subtitles because we can't decode the file using the "
"guessed encoding. It's probably a binary file: " + subtitle_path)
continue
detected_language = guess_language(text) detected_language = guess_language(text)
except Exception:
logging.debug('BAZARR was unable to detect encoding for this subtitles file: %r', subtitle_path)
finally:
if detected_language: if detected_language:
logging.debug("BAZARR external subtitles detected and guessed this language: " + str( logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
detected_language)) detected_language))
@ -139,24 +127,15 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde
with open(subtitle_path, 'rb') as f: with open(subtitle_path, 'rb') as f:
text = f.read() text = f.read()
try: encoding = detect(text)['encoding']
text = text.decode('utf-8') if not encoding:
except UnicodeDecodeError:
detector = Detector()
try:
guess = detector.detect(text)
except Exception:
logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. " logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
"It's probably a binary file: " + subtitle_path) "It's probably a binary file: " + subtitle_path)
continue continue
if 'UTF' in encoding:
text = text.decode('utf-8')
else: else:
logging.debug('BAZARR detected encoding %r', guess) text = text.decode(encoding)
try:
text = text.decode(guess)
except Exception:
logging.debug("BAZARR skipping this subtitles because we can't decode the file using the "
"guessed encoding. It's probably a binary file: " + subtitle_path)
continue
if bool(re.search(hi_regex, text)): if bool(re.search(hi_regex, text)):
subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True) subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True)

@ -4,7 +4,7 @@ import os
import logging import logging
import hashlib import hashlib
from charamel import Detector from chardet import detect
from bs4 import UnicodeDammit from bs4 import UnicodeDammit
from app.config import settings from app.config import settings
@ -64,8 +64,7 @@ def force_unicode(s):
try: try:
s = s.decode("utf-8") s = s.decode("utf-8")
except UnicodeDecodeError: except UnicodeDecodeError:
detector = Detector() t = detect(s)['encoding']
t = detector.detect(s)
try: try:
s = s.decode(t) s = s.decode(t)
except UnicodeDecodeError: except UnicodeDecodeError:

@ -1,20 +0,0 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Usage:
>>> import charamel
>>> detector = charamel.Detector()
>>> content = b'El espa\xf1ol o castellano del lat\xedn hablado'
>>> encoding = detector.detect(content)
>>> encoding
<Encoding.ISO_8859_14: 'iso8859_14'>
>>> content.decode(encoding)
'El español o castellano del latín hablado'
Licensed under Apache 2.0
"""
from .detector import Detector # noqa: F401
from .encoding import Encoding # noqa: F401
__version__ = '1.0.0'

@ -1,133 +0,0 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import itertools
import math
from typing import Dict, List, Optional, Sequence, Set, Tuple
from charamel.encoding import Encoding
from charamel.resources import load_biases, load_features, load_weights
def _get_features(content: bytes) -> Set[int]:
"""
Extract unique byte uni-grams and bi-grams
Args:
content: Encoded text
Returns:
Set of integers that represent byte n-grams
"""
pairs = zip(content, itertools.islice(content, 1, None))
return set(content).union(x * 256 + y for x, y in pairs)
def _apply_sigmoid(value: float) -> float:
"""
Apply sigmoid function to given value
"""
return 1 / (1 + math.exp(-value))
class Detector:
    """
    Universal encoding detector.

    Scores byte content against a pre-trained linear model per supported
    encoding and reports the most likely candidates.
    """

    def __init__(
        self,
        encodings: Sequence[Encoding] = tuple(Encoding),
        min_confidence: float = 0.0,
    ):
        """
        Create universal encoding detector for given encodings.

        Args:
            encodings: Encodings this instance will consider; fewer
                encodings make detection faster
            min_confidence: Minimum confidence threshold in [0, 1];
                candidates scoring below it are discarded

        Raises:
            ValueError: If no encodings are given or the threshold is
                outside [0, 1]

        Example:
            >>> detector = Detector(
            ...     encodings=[Encoding.UTF_8, Encoding.BIG_5],
            ...     min_confidence=0.7,
            ... )
        """
        if not encodings:
            raise ValueError('No encodings specified')
        if not 0.0 <= min_confidence <= 1.0:
            raise ValueError('min_confidence must be in range [0, 1]')
        self._features = load_features()
        self._weights = load_weights(encodings)
        self._biases = load_biases(encodings)
        self._min_confidence = min_confidence

    def _score(self, content: bytes) -> Dict[Encoding, float]:
        """
        Compute how likely each encoding is able to decode the content.

        Args:
            content: Encoded text

        Returns:
            Real-valued (pre-sigmoid) score for each encoding
        """
        # Keep only the n-grams the model actually knows about, mapped
        # to their column indices in the weight vectors.
        known = [
            self._features[feature]
            for feature in _get_features(content)
            if feature in self._features
        ]
        scores = self._biases.copy()
        for encoding, weights in self._weights.items():
            scores[encoding] += sum(weights[index] for index in known)
        return scores

    def detect(self, content: bytes) -> Optional[Encoding]:
        """
        Detect the most probable encoding for given byte content.

        Args:
            content: Encoded text

        Returns:
            Best-scoring encoding, or `None` when nothing reaches the
            configured confidence threshold

        Example:
            >>> detector = Detector()
            >>> detector.detect(b'\xc4\xe3\xba\xc3')
            <Encoding.GB_K: 'gbk'>
        """
        scores = self._score(content)
        if not scores:
            return None
        best, raw_score = max(scores.items(), key=lambda item: item[1])
        if _apply_sigmoid(raw_score) >= self._min_confidence:
            return best
        return None

    def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]:
        """
        Detect the `top` most probable encodings with confidences.

        Args:
            content: Encoded text
            top: How many of the most likely encodings to return

        Returns:
            Up to `top` (encoding, confidence) pairs, best first, with
            entries below the confidence threshold filtered out
        """
        ranked = sorted(
            self._score(content).items(),
            key=lambda item: item[1],
            reverse=True,
        )
        result: List[Tuple[Encoding, float]] = []
        for encoding, raw_score in ranked[:top]:
            confidence = _apply_sigmoid(raw_score)
            if confidence >= self._min_confidence:
                result.append((encoding, confidence))
        return result

@ -1,122 +0,0 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import encodings.aliases
import enum
@enum.unique
class Encoding(str, enum.Enum):
    """
    Python character encodings.

    Each member's value is a Python codec name, so an ``Encoding`` can be
    passed directly to ``bytes.decode``. The ``str`` mix-in makes members
    compare equal to (and format as) their codec-name string.
    """

    ASCII = 'ascii'
    # Big5 family
    BIG_5 = 'big5'
    BIG_5_HKSCS = 'big5hkscs'
    # IBM/MS code pages (cp037 .. cp1258)
    CP_037 = 'cp037'
    CP_273 = 'cp273'
    CP_424 = 'cp424'
    CP_437 = 'cp437'
    CP_500 = 'cp500'
    CP_720 = 'cp720'
    CP_737 = 'cp737'
    CP_775 = 'cp775'
    CP_850 = 'cp850'
    CP_852 = 'cp852'
    CP_855 = 'cp855'
    CP_856 = 'cp856'
    CP_857 = 'cp857'
    CP_858 = 'cp858'
    CP_860 = 'cp860'
    CP_861 = 'cp861'
    CP_862 = 'cp862'
    CP_863 = 'cp863'
    CP_864 = 'cp864'
    CP_865 = 'cp865'
    CP_866 = 'cp866'
    CP_869 = 'cp869'
    CP_874 = 'cp874'
    CP_875 = 'cp875'
    CP_932 = 'cp932'
    CP_949 = 'cp949'
    CP_950 = 'cp950'
    CP_1006 = 'cp1006'
    CP_1026 = 'cp1026'
    CP_1125 = 'cp1125'
    CP_1140 = 'cp1140'
    CP_1250 = 'cp1250'
    CP_1251 = 'cp1251'
    CP_1252 = 'cp1252'
    CP_1253 = 'cp1253'
    CP_1254 = 'cp1254'
    CP_1255 = 'cp1255'
    CP_1256 = 'cp1256'
    CP_1257 = 'cp1257'
    CP_1258 = 'cp1258'
    # EUC family
    EUC_JP = 'euc_jp'
    EUC_JIS_2004 = 'euc_jis_2004'
    EUC_JIS_X_0213 = 'euc_jisx0213'
    EUC_KR = 'euc_kr'
    # GB family and HZ
    GB_2312 = 'gb2312'
    GB_K = 'gbk'
    GB_18030 = 'gb18030'
    HZ = 'hz'
    # ISO-2022 escape-sequence encodings
    ISO_2022_JP = 'iso2022_jp'
    ISO_2022_JP_1 = 'iso2022_jp_1'
    ISO_2022_JP_2 = 'iso2022_jp_2'
    ISO_2022_JP_2004 = 'iso2022_jp_2004'
    ISO_2022_JP_3 = 'iso2022_jp_3'
    ISO_2022_JP_EXT = 'iso2022_jp_ext'
    ISO_2022_KR = 'iso2022_kr'
    # ISO-8859 family (latin_1 is the codec name for ISO-8859-1)
    LATIN_1 = 'latin_1'
    ISO_8859_2 = 'iso8859_2'
    ISO_8859_3 = 'iso8859_3'
    ISO_8859_4 = 'iso8859_4'
    ISO_8859_5 = 'iso8859_5'
    ISO_8859_6 = 'iso8859_6'
    ISO_8859_7 = 'iso8859_7'
    ISO_8859_8 = 'iso8859_8'
    ISO_8859_9 = 'iso8859_9'
    ISO_8859_10 = 'iso8859_10'
    ISO_8859_11 = 'iso8859_11'
    ISO_8859_13 = 'iso8859_13'
    ISO_8859_14 = 'iso8859_14'
    ISO_8859_15 = 'iso8859_15'
    ISO_8859_16 = 'iso8859_16'
    JOHAB = 'johab'
    # KOI8 variants and kz1048
    KOI_8_R = 'koi8_r'
    KOI_8_T = 'koi8_t'
    KOI_8_U = 'koi8_u'
    KZ_1048 = 'kz1048'
    # Classic Mac OS encodings
    MAC_CYRILLIC = 'mac_cyrillic'
    MAC_GREEK = 'mac_greek'
    MAC_ICELAND = 'mac_iceland'
    MAC_LATIN_2 = 'mac_latin2'
    MAC_ROMAN = 'mac_roman'
    MAC_TURKISH = 'mac_turkish'
    PTCP_154 = 'ptcp154'
    # Shift-JIS family
    SHIFT_JIS = 'shift_jis'
    SHIFT_JIS_2004 = 'shift_jis_2004'
    SHIFT_JIS_X_0213 = 'shift_jisx0213'
    TIS_620 = 'tis_620'
    # Unicode transformation formats
    UTF_32 = 'utf_32'
    UTF_32_BE = 'utf_32_be'
    UTF_32_LE = 'utf_32_le'
    UTF_16 = 'utf_16'
    UTF_16_BE = 'utf_16_be'
    UTF_16_LE = 'utf_16_le'
    UTF_7 = 'utf_7'
    UTF_8 = 'utf_8'
    UTF_8_SIG = 'utf_8_sig'

    @classmethod
    def _missing_(cls, value):
        # Called by Enum when `Encoding(value)` finds no member with this
        # exact value. Normalize the spelling (e.g. 'UTF-8' -> 'utf_8')
        # and resolve known codec aliases, then retry the lookup once.
        normalized = encodings.normalize_encoding(value).lower()
        normalized = encodings.aliases.aliases.get(normalized, normalized)
        if value != normalized:
            # Retry only when normalization changed something, so a truly
            # unknown value cannot recurse forever.
            return cls(normalized)
        return super()._missing_(value)

@ -1,72 +0,0 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import gzip
import pathlib
import struct
from typing import Any, Dict, List, Sequence
from charamel.encoding import Encoding
# Directory holding the bundled model resources, resolved relative to
# this module so it works regardless of the current working directory.
RESOURCE_DIRECTORY = pathlib.Path(__file__).parent.absolute()
# Per-encoding weight files live in a 'weights' sub-directory.
WEIGHT_DIRECTORY = RESOURCE_DIRECTORY / 'weights'
def _unpack(file: pathlib.Path, pattern: str) -> List[Any]:
"""
Unpack struct values from file
Args:
file: File that stores struct-packed values
pattern: Struct pattern
Returns:
List of unpacked values
"""
with gzip.open(file, 'rb') as data:
return [values[0] for values in struct.iter_unpack(pattern, data.read())]
def load_features() -> Dict[int, int]:
    """
    Load byte-level feature names and indices.

    Returns:
        Mapping from each feature to its index in the weight matrix,
        where the index is the feature's position in the resource file
    """
    mapping: Dict[int, int] = {}
    feature_values = _unpack(RESOURCE_DIRECTORY / 'features.gzip', pattern='>H')
    for index, feature in enumerate(feature_values):
        mapping[feature] = index
    return mapping
def load_biases(encodings: Sequence[Encoding]) -> Dict[Encoding, float]:
    """
    Load linear model bias values for given encodings.

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their biases
    """
    # The resource file stores one "<encoding> <bias>" pair per line.
    all_biases = {}
    with gzip.open(RESOURCE_DIRECTORY / 'biases.gzip', 'rb') as data:
        for raw_line in data:
            name, value = raw_line.decode().split()
            all_biases[name] = float(value)
    # Restrict the table to just the encodings the caller asked for.
    return {encoding: all_biases[encoding] for encoding in encodings}
def load_weights(encodings: Sequence[Encoding]) -> Dict[Encoding, List[float]]:
    """
    Load the per-encoding weight vectors.

    Args:
        encodings: Encodings whose weight files should be loaded

    Returns:
        Mapping from each encoding to its list of feature weights
    """
    # Each encoding has its own gzip file of half-precision ('>e') floats.
    return {
        encoding: _unpack(WEIGHT_DIRECTORY / f'{encoding}.gzip', pattern='>e')
        for encoding in encodings
    }

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save