Use the whole subtitle file to detect encoding. Detecting the encoding from only a part of the file does not always work. This modification will lead to worse performance, but should be more reliable.
pull/1038/head
josdion 4 years ago
parent ca0123c036
commit d06cace2d6

@ -364,8 +364,8 @@ def guess_external_subtitles(dest_folder, subtitles):
logging.debug("BAZARR falling back to file content analysis to detect language.")
detected_language = None
# to improve performance, skip detection of files larger that 5M
if os.path.getsize(subtitle_path) > 5*1024*1024:
# to improve performance, skip detection of files larger that 1M
if os.path.getsize(subtitle_path) > 1*1024*1024:
logging.debug("BAZARR subtitles file is too large to be text based. Skipping this file: " +
subtitle_path)
continue
@ -374,16 +374,11 @@ def guess_external_subtitles(dest_folder, subtitles):
text = f.read()
try:
# to improve performance, use only the first 32K to detect encoding
guess = chardet.detect(text[:32768])
guess = chardet.detect(text)
logging.debug('BAZARR detected encoding %r', guess)
if guess["confidence"] < 0.6:
raise UnicodeError
if guess["encoding"] == "ascii":
guess["encoding"] = "utf-8"
text = text.decode(guess["encoding"])
detected_language = guess_language(text)
except UnicodeError:
except (UnicodeDecodeError, TypeError):
logging.exception("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
subtitle_path)
except:

@ -606,7 +606,7 @@ def _search_external_subtitles(path, languages=None, only_one=False, scandir_gen
continue
if p_root.lower() == fn_no_ext_lower:
# skip check for language code is the subtitle file name is the same as the video name
# skip check for language code if the subtitle file name is the same as the video name
subtitles[p] = None
continue

Loading…
Cancel
Save