bazarr/libs/binaryornot/helpers.py

# -*- coding: utf-8 -*-


"""
binaryornot.helpers
-------------------

Helper utilities used by BinaryOrNot.
"""

import chardet
import logging


logger = logging.getLogger(__name__)


def print_as_hex(s):
    """
    Print a string as hex bytes.

    Every element of ``s`` is written as lowercase hexadecimal without zero
    padding, and the elements are joined with colons (e.g. ``41:42``). An
    empty input prints an empty line.

    :param s: Text string or bytes-like object to dump. Iterating ``bytes``
        on Python 3 yields integers instead of one-character strings, so
        both kinds of element are accepted.
    """
    # ord() rejects integers, so it is only applied to character elements.
    print(":".join(
        "{0:x}".format(c if isinstance(c, int) else ord(c)) for c in s
    ))


def get_starting_chunk(filename, length=1024):
    """
    Read the leading bytes of a file.

    :param filename: File to open and get the first little chunk of.
    :param length: Number of bytes to read, default 1024.
    :returns: Starting chunk of bytes, or ``None`` when the file could not
        be opened or read (the error is printed in that case).
    """
    try:
        # Binary mode so the raw bytes come back without any text decoding.
        with open(filename, 'rb') as handle:
            return handle.read(length)
    except IOError as err:
        print(err)
    return None


# Whitespace/control bytes that are still treated as ordinary text.
_control_chars = b'\n\r\t\f\b'
# bytearray() accepts an iterable of integers on both Python 2 and Python 3,
# so converting through it builds the same byte strings on either version
# without branching on ``bytes is str``.
# Accepted control characters plus byte values 32..126.
_printable_ascii = _control_chars + bytes(bytearray(range(32, 127)))
# Byte values 127..255.
_printable_high_ascii = bytes(bytearray(range(127, 256)))


def is_binary_string(bytes_to_check):
    """
    Uses a simplified version of the Perl detection algorithm,
    based roughly on Eli Bendersky's translation to Python:
    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/

    This is biased slightly more in favour of deeming files as text
    files than the Perl algorithm, since all ASCII compatible character
    sets are accepted as text, not just utf-8.

    The decision combines three signals: the share of bytes outside the
    printable-ASCII table, the share of bytes outside the high-byte table
    (127..255), and whether chardet confidently detects a non-ASCII encoding
    that the chunk actually decodes with.

    :param bytes_to_check: A chunk of bytes to check.
    :returns: True if appears to be a binary, otherwise False.
    """

    # Empty files are considered text files
    if not bytes_to_check:
        return False

    total = float(len(bytes_to_check))

    # Share of bytes that are neither printable ASCII nor one of the
    # accepted control characters (translate() deletes the listed bytes, so
    # what remains is everything outside that table).
    low_chars = bytes_to_check.translate(None, _printable_ascii)
    nontext_ratio1 = len(low_chars) / total
    logger.debug('nontext_ratio1: %r', nontext_ratio1)

    # Share of bytes that are NOT in the 127..255 range. A small value
    # therefore means the chunk consists almost entirely of high bytes.
    # From: https://en.wikipedia.org/wiki/UTF-8
    # If the bytes are random, the chances of a byte with the high bit set
    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
    # of these without finding an invalid sequence is actually lower than the
    # chance of the first three bytes randomly being the UTF-8 BOM.
    high_chars = bytes_to_check.translate(None, _printable_high_ascii)
    nontext_ratio2 = len(high_chars) / total
    logger.debug('nontext_ratio2: %r', nontext_ratio2)

    # More than 90% of the bytes fall outside both tables: binary, no
    # encoding detection needed.
    if nontext_ratio1 > 0.90 and nontext_ratio2 > 0.90:
        return True

    is_likely_binary = (
        (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or
        (nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)
    )
    logger.debug('is_likely_binary: %r', is_likely_binary)

    # then check for binary for possible encoding detection with chardet
    detected_encoding = chardet.detect(bytes_to_check)
    logger.debug('detected_encoding: %r', detected_encoding)

    # Only trust a confident, non-ASCII detection, and only if the chunk
    # really decodes with it. A missing encoding is treated as "not
    # decodable" instead of being passed to decode().
    encoding = detected_encoding['encoding']
    decodable_as_unicode = False
    if (encoding and detected_encoding['confidence'] > 0.9 and
            encoding != 'ascii'):
        try:
            # Positional argument: accepted by every Python version, so no
            # version-specific fallback is required.
            bytes_to_check.decode(encoding)
        except LookupError:
            logger.debug('failure: could not look up encoding %s', encoding)
        except UnicodeDecodeError:
            logger.debug('failure: decodable_as_unicode: %r',
                         decodable_as_unicode)
        else:
            decodable_as_unicode = True
            logger.debug('success: decodable_as_unicode: %r',
                         decodable_as_unicode)

    # Neutral summary; reached on every path, including success.
    logger.debug('decodable_as_unicode: %r', decodable_as_unicode)

    # finally use all the checks to decide binary or text
    if decodable_as_unicode:
        return False
    if is_likely_binary:
        return True
    if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
        # Check for NULL (or 0xFF) bytes last
        logger.debug('has nulls:%r', b'\x00' in bytes_to_check)
        return True
    return False
Fix for subtitles file without a language code that aren't text based. 5 years ago			`# -- coding: utf-8 --`


			`"""`
			`binaryornot.helpers`
			`-------------------`

			`Helper utilities used by BinaryOrNot.`
			`"""`

			`import chardet`
			`import logging`


			`logger = logging.getLogger(__name__)`


			`def print_as_hex(s):`
			`"""`
			`Print a string as hex bytes.`
			`"""`
			`print(":".join("{0:x}".format(ord(c)) for c in s))`


			`def get_starting_chunk(filename, length=1024):`
			`"""`
			`:param filename: File to open and get the first little chunk of.`
			`:param length: Number of bytes to read, default 1024.`
			`:returns: Starting chunk of bytes.`
			`"""`
			`# Ensure we open the file in binary mode`
			`try:`
			`with open(filename, 'rb') as f:`
			`chunk = f.read(length)`
			`return chunk`
			`except IOError as e:`
			`print(e)`


			`_control_chars = b'\n\r\t\f\b'`
			`if bytes is str:`
			`# Python 2 means we need to invoke chr() explicitly`
			`_printable_ascii = _control_chars + b''.join(map(chr, range(32, 127)))`
			`_printable_high_ascii = b''.join(map(chr, range(127, 256)))`
			`else:`
			`# Python 3 means bytes accepts integer input directly`
			`_printable_ascii = _control_chars + bytes(range(32, 127))`
			`_printable_high_ascii = bytes(range(127, 256))`


			`def is_binary_string(bytes_to_check):`
			`"""`
			`Uses a simplified version of the Perl detection algorithm,`
			`based roughly on Eli Bendersky's translation to Python:`
			`http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/`

			`This is biased slightly more in favour of deeming files as text`
			`files than the Perl algorithm, since all ASCII compatible character`
			`sets are accepted as text, not just utf-8.`

			`:param bytes: A chunk of bytes to check.`
			`:returns: True if appears to be a binary, otherwise False.`
			`"""`

			`# Empty files are considered text files`
			`if not bytes_to_check:`
			`return False`

			`# Now check for a high percentage of ASCII control characters`
			`# Binary if control chars are > 30% of the string`
			`low_chars = bytes_to_check.translate(None, _printable_ascii)`
			`nontext_ratio1 = float(len(low_chars)) / float(len(bytes_to_check))`
			`logger.debug('nontext_ratio1: %(nontext_ratio1)r', locals())`

			`# and check for a low percentage of high ASCII characters:`
			`# Binary if high ASCII chars are < 5% of the string`
			`# From: https://en.wikipedia.org/wiki/UTF-8`
			`# If the bytes are random, the chances of a byte with the high bit set`
			`# starting a valid UTF-8 character is only 6.64%. The chances of finding 7`
			`# of these without finding an invalid sequence is actually lower than the`
			`# chance of the first three bytes randomly being the UTF-8 BOM.`

			`high_chars = bytes_to_check.translate(None, _printable_high_ascii)`
			`nontext_ratio2 = float(len(high_chars)) / float(len(bytes_to_check))`
			`logger.debug('nontext_ratio2: %(nontext_ratio2)r', locals())`

			`if nontext_ratio1 > 0.90 and nontext_ratio2 > 0.90:`
			`return True`

			`is_likely_binary = (`
			`(nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or`
			`(nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)`
			`)`
			`logger.debug('is_likely_binary: %(is_likely_binary)r', locals())`

			`# then check for binary for possible encoding detection with chardet`
			`detected_encoding = chardet.detect(bytes_to_check)`
			`logger.debug('detected_encoding: %(detected_encoding)r', locals())`

			`# finally use all the check to decide binary or text`
			`decodable_as_unicode = False`
			`if (detected_encoding['confidence'] > 0.9 and`
			`detected_encoding['encoding'] != 'ascii'):`
			`try:`
			`try:`
			`bytes_to_check.decode(encoding=detected_encoding['encoding'])`
			`except TypeError:`
			`# happens only on Python 2.6`
			`unicode(bytes_to_check, encoding=detected_encoding['encoding']) # noqa`
			`decodable_as_unicode = True`
			`logger.debug('success: decodable_as_unicode: '`
			`'%(decodable_as_unicode)r', locals())`
			`except LookupError:`
			`logger.debug('failure: could not look up encoding %(encoding)s',`
			`detected_encoding)`
			`except UnicodeDecodeError:`
			`logger.debug('failure: decodable_as_unicode: '`
			`'%(decodable_as_unicode)r', locals())`

			`logger.debug('failure: decodable_as_unicode: '`
			`'%(decodable_as_unicode)r', locals())`
			`if is_likely_binary:`
			`if decodable_as_unicode:`
			`return False`
			`else:`
			`return True`
			`else:`
			`if decodable_as_unicode:`
			`return False`
			`else:`
			`if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:`
			`# Check for NULL bytes last`
			`logger.debug('has nulls:' + repr(b'\x00' in bytes_to_check))`
			`return True`
			`return False`