Merge pull request #864 from josdion/development

Convert subtitle text to unicode before sending to guess_language
pull/871/head
morpheus65535 5 years ago committed by GitHub
commit 3bd75b19ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -13,9 +13,7 @@ import operator
from subliminal import core from subliminal import core
from subliminal_patch import search_external_subtitles from subliminal_patch import search_external_subtitles
from subzero.language import Language from subzero.language import Language
from bs4 import UnicodeDammit
import six import six
from binaryornot.check import is_binary
from get_args import args from get_args import args
from database import database from database import database
@ -27,6 +25,7 @@ from helper import path_replace, path_replace_movie, path_replace_reverse, \
from queueconfig import notifications from queueconfig import notifications
from embedded_subs_reader import embedded_subs_reader from embedded_subs_reader import embedded_subs_reader
import six import six
import chardet
gc.enable() gc.enable()
@ -367,25 +366,29 @@ def guess_external_subtitles(dest_folder, subtitles):
subtitle_path = os.path.join(dest_folder, subtitle) subtitle_path = os.path.join(dest_folder, subtitle)
if os.path.exists(subtitle_path) and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS: if os.path.exists(subtitle_path) and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
logging.debug("BAZARR falling back to file content analysis to detect language.") logging.debug("BAZARR falling back to file content analysis to detect language.")
if is_binary(subtitle_path): detected_language = None
logging.debug("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
# to improve performance, skip detection of files larger than 5M
if os.path.getsize(subtitle_path) > 5*1024*1024:
logging.debug("BAZARR subtitles file is too large to be text based. Skipping this file: " +
subtitle_path) subtitle_path)
continue continue
detected_language = None
if six.PY3: with open(subtitle_path, 'rb') as f:
with open(subtitle_path, 'r', errors='ignore') as f:
text = f.read()
else:
with open(subtitle_path, 'r') as f:
text = f.read() text = f.read()
try: try:
encoding = UnicodeDammit(text) # to improve performance, use only the first 8K to detect encoding
if six.PY2: if len(text) > 8192: guess = chardet.detect(text[:8192])
text = text.decode(encoding.original_encoding) else: guess = chardet.detect(text)
if guess["confidence"] < 0.8:
raise UnicodeError
text = text.decode(guess["encoding"])
detected_language = guess_language(text) detected_language = guess_language(text)
except Exception as e: except UnicodeError:
logging.exception("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
subtitle_path)
except:
logging.exception('BAZARR Error trying to detect language for this subtitles file: ' + logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
subtitle_path + ' You should try to delete this subtitles file manually and ask ' subtitle_path + ' You should try to delete this subtitles file manually and ask '
'Bazarr to download it again.') 'Bazarr to download it again.')

Loading…
Cancel
Save