diff --git a/bazarr/list_subtitles.py b/bazarr/list_subtitles.py
index 59ab79225..607baec19 100644
--- a/bazarr/list_subtitles.py
+++ b/bazarr/list_subtitles.py
@@ -15,6 +15,7 @@ from subliminal_patch import search_external_subtitles
 from subzero.language import Language
 from bs4 import UnicodeDammit
 import six
+from binaryornot.check import is_binary
 
 from get_args import args
 from database import database
@@ -86,27 +87,6 @@ def store_subtitles(original_path, reversed_path):
                     logging.debug("BAZARR external subtitles detected: " + str(language))
                     actual_subtitles.append(
                         [str(language), path_replace_reverse(subtitle_path)])
-                else:
-                    if os.path.splitext(subtitle)[1] != ".sub":
-                        logging.debug("BAZARR falling back to file content analysis to detect language.")
-                        with open(os.path.join(os.path.dirname(reversed_path), subtitle), 'r') as f:
-                            text = f.read()
-                        try:
-                            encoding = UnicodeDammit(text)
-                            if six.PY2:
-                                text = text.decode(encoding.original_encoding)
-                            detected_language = langdetect.detect(text)
-                        except Exception as e:
-                            logging.exception(
-                                'BAZARR Error trying to detect language for this subtitles file: ' +
-                                os.path.join(os.path.dirname(reversed_path), subtitle) +
-                                ' You should try to delete this subtitles file manually and ask Bazarr to download it again.')
-                        else:
-                            if len(detected_language) > 0:
-                                logging.debug(
-                                    "BAZARR external subtitles detected and analysis guessed this language: " + str(
-                                        detected_language))
-                                actual_subtitles.append([str(detected_language), path_replace_reverse(subtitle_path)])
 
         database.execute("UPDATE table_episodes SET subtitles=? WHERE path=?",
                          (str(actual_subtitles), original_path))
@@ -178,27 +158,6 @@ def store_subtitles_movie(original_path, reversed_path):
                 elif str(language) != 'und':
                     logging.debug("BAZARR external subtitles detected: " + str(language))
                     actual_subtitles.append([str(language), path_replace_reverse_movie(subtitle_path)])
-                else:
-                    if os.path.splitext(subtitle)[1] != ".sub":
-                        logging.debug("BAZARR falling back to file content analysis to detect language.")
-                        with open(os.path.join(os.path.dirname(reversed_path), dest_folder, subtitle), 'r') as f:
-                            text = f.read()
-                        try:
-                            encoding = UnicodeDammit(text)
-                            if six.PY2:
-                                text = text.decode(encoding.original_encoding)
-                            detected_language = langdetect.detect(text)
-                        except Exception as e:
-                            logging.exception(
-                                'BAZARR Error trying to detect language for this subtitles file: ' +
-                                os.path.join(os.path.dirname(reversed_path), subtitle) +
-                                ' You should try to delete this subtitles file manually and ask Bazarr to download it again.')
-                        else:
-                            if len(detected_language) > 0:
-                                logging.debug(
-                                    "BAZARR external subtitles detected and analysis guessed this language: " +
-                                    str(detected_language))
-                                actual_subtitles.append([str(detected_language), path_replace_reverse_movie(subtitle_path)])
 
         database.execute("UPDATE table_movies SET subtitles=? WHERE path=?",
                          (str(actual_subtitles), original_path))
@@ -400,6 +359,10 @@ def guess_external_subtitles(dest_folder, subtitles):
             subtitle_path = os.path.join(dest_folder, subtitle)
             if os.path.exists(subtitle_path) and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
                 logging.debug("BAZARR falling back to file content analysis to detect language.")
+                if is_binary(subtitle_path):
+                    logging.debug("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
+                                  subtitle_path)
+                    continue
                 detected_language = None
                 with open(subtitle_path, 'r') as f:
                     text = f.read()
diff --git a/libs/binaryornot/__init__.py b/libs/binaryornot/__init__.py
new file mode 100644
index 000000000..518255b16
--- /dev/null
+++ b/libs/binaryornot/__init__.py
@@ -0,0 +1,3 @@
+__author__ = 'Audrey Roy'
+__email__ = 'audreyr@gmail.com'
+__version__ = '0.4.4'
diff --git a/libs/binaryornot/check.py b/libs/binaryornot/check.py
new file mode 100644
index 000000000..924a65638
--- /dev/null
+++ b/libs/binaryornot/check.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+"""
+binaryornot.check
+-----------------
+
+Main code for checking if a file is binary or text.
+"""
+
+import logging
+import argparse
+
+from binaryornot.helpers import get_starting_chunk, is_binary_string
+
+
+logger = logging.getLogger(__name__)
+
+
+def is_binary(filename):
+    """
+    :param filename: File to check.
+    :returns: True if it's a binary file, otherwise False.
+    """
+    logger.debug('is_binary: %(filename)r', locals())
+
+    # Check if the file extension is in a list of known binary types
+#    binary_extensions = ['.pyc', ]
+#    for ext in binary_extensions:
+#        if filename.endswith(ext):
+#            return True
+
+    # Check if the starting chunk is a binary string
+    chunk = get_starting_chunk(filename)
+    return is_binary_string(chunk)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Check if a "
+                                                 "file passed as argument is "
+                                                 "binary or not")
+
+    parser.add_argument("filename", help="File name to check for. If "
+                                         "the file is not in the same folder, "
+                                         "include full path")
+
+    args = parser.parse_args()
+
+    print(is_binary(**vars(args)))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/libs/binaryornot/helpers.py b/libs/binaryornot/helpers.py
new file mode 100644
index 000000000..978b4c374
--- /dev/null
+++ b/libs/binaryornot/helpers.py
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+
+
+"""
+binaryornot.helpers
+-------------------
+
+Helper utilities used by BinaryOrNot.
+"""
+
+import chardet
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+def print_as_hex(s):
+    """
+    Print a string as hex bytes.
+    """
+    print(":".join("{0:x}".format(ord(c)) for c in s))
+
+
+def get_starting_chunk(filename, length=1024):
+    """
+    :param filename: File to open and get the first little chunk of.
+    :param length: Number of bytes to read, default 1024.
+    :returns: Starting chunk of bytes.
+    """
+    # Ensure we open the file in binary mode
+    try:
+        with open(filename, 'rb') as f:
+            chunk = f.read(length)
+            return chunk
+    except IOError as e:
+        print(e)
+
+
+_control_chars = b'\n\r\t\f\b'
+if bytes is str:
+    # Python 2 means we need to invoke chr() explicitly
+    _printable_ascii = _control_chars + b''.join(map(chr, range(32, 127)))
+    _printable_high_ascii = b''.join(map(chr, range(127, 256)))
+else:
+    # Python 3 means bytes accepts integer input directly
+    _printable_ascii = _control_chars + bytes(range(32, 127))
+    _printable_high_ascii = bytes(range(127, 256))
+
+
+def is_binary_string(bytes_to_check):
+    """
+    Uses a simplified version of the Perl detection algorithm,
+    based roughly on Eli Bendersky's translation to Python:
+    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
+
+    This is biased slightly more in favour of deeming files as text
+    files than the Perl algorithm, since all ASCII compatible character
+    sets are accepted as text, not just utf-8.
+
+    :param bytes: A chunk of bytes to check.
+    :returns: True if appears to be a binary, otherwise False.
+    """
+
+    # Empty files are considered text files
+    if not bytes_to_check:
+        return False
+
+    # Now check for a high percentage of ASCII control characters
+    # Binary if control chars are > 30% of the string
+    low_chars = bytes_to_check.translate(None, _printable_ascii)
+    nontext_ratio1 = float(len(low_chars)) / float(len(bytes_to_check))
+    logger.debug('nontext_ratio1: %(nontext_ratio1)r', locals())
+
+    # and check for a low percentage of high ASCII characters:
+    # Binary if high ASCII chars are < 5% of the string
+    # From: https://en.wikipedia.org/wiki/UTF-8
+    # If the bytes are random, the chances of a byte with the high bit set
+    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
+    # of these without finding an invalid sequence is actually lower than the
+    # chance of the first three bytes randomly being the UTF-8 BOM.
+
+    high_chars = bytes_to_check.translate(None, _printable_high_ascii)
+    nontext_ratio2 = float(len(high_chars)) / float(len(bytes_to_check))
+    logger.debug('nontext_ratio2: %(nontext_ratio2)r', locals())
+
+    if nontext_ratio1 > 0.90 and nontext_ratio2 > 0.90:
+        return True
+
+    is_likely_binary = (
+        (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or
+        (nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)
+    )
+    logger.debug('is_likely_binary: %(is_likely_binary)r', locals())
+
+    # then check for binary for possible encoding detection with chardet
+    detected_encoding = chardet.detect(bytes_to_check)
+    logger.debug('detected_encoding: %(detected_encoding)r', locals())
+
+    # finally use all the check to decide binary or text
+    decodable_as_unicode = False
+    if (detected_encoding['confidence'] > 0.9 and
+            detected_encoding['encoding'] != 'ascii'):
+        try:
+            try:
+                bytes_to_check.decode(encoding=detected_encoding['encoding'])
+            except TypeError:
+                # happens only on Python 2.6
+                unicode(bytes_to_check, encoding=detected_encoding['encoding'])  # noqa
+            decodable_as_unicode = True
+            logger.debug('success: decodable_as_unicode: '
+                         '%(decodable_as_unicode)r', locals())
+        except LookupError:
+            logger.debug('failure: could not look up encoding %(encoding)s',
+                         detected_encoding)
+        except UnicodeDecodeError:
+            logger.debug('failure: decodable_as_unicode: '
+                         '%(decodable_as_unicode)r', locals())
+
+    logger.debug('failure: decodable_as_unicode: '
+                 '%(decodable_as_unicode)r', locals())
+    if is_likely_binary:
+        if decodable_as_unicode:
+            return False
+        else:
+            return True
+    else:
+        if decodable_as_unicode:
+            return False
+        else:
+            if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
+                # Check for NULL bytes last
+                logger.debug('has nulls:' + repr(b'\x00' in bytes_to_check))
+                return True
+            return False
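
Illustration (not part of the change): a minimal sketch of how the bundled heuristic behaves on subtitle-like input, assuming the vendored package is importable as binaryornot, which is what the "from binaryornot.check import is_binary" line above expects. The sample byte strings and the temporary file are hypothetical.

    # demo_binaryornot.py -- illustrative sketch only (Python 3)
    import os
    import tempfile

    from binaryornot.check import is_binary
    from binaryornot.helpers import is_binary_string

    # A plain SRT cue is almost entirely printable ASCII, so the control-character
    # ratio stays near zero and neither "likely binary" pattern matches -> text.
    srt_chunk = b"1\n00:00:01,000 --> 00:00:03,000\nHello, world!\n"
    print(is_binary_string(srt_chunk))     # False

    # A NUL-padded chunk such as the start of a VobSub/MPEG-PS .sub file pushes both
    # non-text ratios above 0.90, so is_binary_string() returns True before chardet runs.
    vobsub_chunk = b"\x00\x00\x01\xba" + b"\x00" * 252
    print(is_binary_string(vobsub_chunk))  # True

    # is_binary() reads the first 1024 bytes of a file and applies the same check;
    # this is what guess_external_subtitles() now calls before handing text to langdetect.
    fd, path = tempfile.mkstemp(suffix=".srt")
    with os.fdopen(fd, "wb") as f:
        f.write(srt_chunk)
    print(is_binary(path))                 # False
    os.remove(path)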