bazarr/libs/srt_tools/utils.py

#!/usr/bin/env python

import argparse
import codecs
import srt
import logging
import sys
import itertools
import collections
import os

# Program name handed to argparse: the basename of the invoked script with its
# first "-" replaced by a space.
PROG_NAME = os.path.basename(sys.argv[0]).replace("-", " ", 1)

# Standard streams in their byte-oriented form: the ``buffer`` attribute when
# the stream object has one, otherwise the stream object itself.
STDIN_BYTESTREAM = getattr(sys.stdin, "buffer", sys.stdin)
STDOUT_BYTESTREAM = getattr(sys.stdout, "buffer", sys.stdout)

# Maps an argument role ("input"/"output") to the standard stream that a "-"
# command line value selects; see dash_to_stream.
DASH_STREAM_MAP = {"input": STDIN_BYTESTREAM, "output": STDOUT_BYTESTREAM}

# Module-level logger used by the helpers below.
log = logging.getLogger(__name__)


def noop(stream):
    """
    Identity function: hand back ``stream`` exactly as it was received.

    Meant as a stand-in for the factories returned by
    codecs.get{reader,writer} when no stream was explicitly specified, so
    that the stream is left unwrapped.
    """
    return stream


def dash_to_stream(arg, arg_type):
    """
    Translate the "-" command line value into a standard stream.

    When ``arg`` is exactly "-", the entry of DASH_STREAM_MAP stored under
    ``arg_type`` is returned. Any other value is returned unchanged, and
    ``arg_type`` is not looked up at all in that case.
    """
    return DASH_STREAM_MAP[arg_type] if arg == "-" else arg


def basic_parser(
    description=None,
    multi_input=False,
    no_output=False,
    examples=None,
    hide_no_strict=False,
):
    """
    Build the argparse parser with the options shared by the command line
    tools in this package.

    Args:
        description: Text passed to argparse as the parser description.
        multi_input: When true, ``--input`` is required and may be repeated,
            each value being appended to a list. Otherwise it takes a single
            value and defaults to the stdin byte stream.
        no_output: When true, neither ``--output`` nor ``--inplace`` is
            registered.
        examples: Optional mapping of description -> command line, rendered
            verbatim as an "examples:" epilog.
        hide_no_strict: When true, ``--no-strict`` is still accepted but is
            suppressed from the help output.

    Returns:
        The configured ``argparse.ArgumentParser``. ``--inplace`` is only
        registered when output is enabled and ``multi_input`` is false.
    """
    epilog_parts = []
    if examples is not None:
        epilog_parts.append("examples:")
        for desc, code in examples.items():
            epilog_parts.extend(("  {}".format(desc), "    $ {}\n".format(code)))

    parser = argparse.ArgumentParser(
        prog=PROG_NAME,
        description=description,
        epilog="\n".join(epilog_parts),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # argparse.FileType is not an option here: the encoding comes from the
    # args as well, so nothing can be opened until parsing has finished.
    input_options = {
        "metavar": "FILE",
        "type": lambda arg: dash_to_stream(arg, "input"),
    }
    if multi_input:
        input_options.update(
            action="append",
            help="the files to process",
            required=True,
        )
    else:
        input_options.update(
            default=STDIN_BYTESTREAM,
            help="the file to process (default: stdin)",
        )
    parser.add_argument("--input", "-i", **input_options)

    if not no_output:
        parser.add_argument(
            "--output",
            "-o",
            metavar="FILE",
            default=STDOUT_BYTESTREAM,
            type=lambda arg: dash_to_stream(arg, "output"),
            help="the file to write to (default: stdout)",
        )
        if not multi_input:
            parser.add_argument(
                "--inplace",
                "-p",
                action="store_true",
                help="modify file in place",
            )

    if hide_no_strict:
        strict_help = argparse.SUPPRESS
    else:
        strict_help = "allow blank lines in output, your media player may explode"
    parser.add_argument(
        "--no-strict", action="store_false", dest="strict", help=strict_help
    )

    parser.add_argument(
        "--debug",
        action="store_const",
        dest="log_level",
        const=logging.DEBUG,
        default=logging.INFO,
        help="enable debug logging",
    )
    parser.add_argument(
        "--ignore-parsing-errors",
        "-c",
        action="store_true",
        help="try to keep going, even if there are parsing errors",
    )
    parser.add_argument(
        "--encoding", "-e", help="the encoding to read/write files in (default: utf8)"
    )

    return parser


def set_basic_args(args):
    """
    Resolve the stream arguments on ``args`` in place.

    After this call ``args.input`` holds the result of ``srt.parse`` (or, if
    it was a list, each element is replaced by such a result), and
    ``args.output``, when present, is a ``codecs`` writer wrapped around a
    binary stream.

    Args:
        args: Namespace of parsed arguments. ``input``, ``encoding`` and
            ``ignore_parsing_errors`` are read; ``output`` and ``inplace``
            are optional (a missing ``output`` is skipped).

    Raises:
        ValueError: If ``inplace`` is set while the input is stdin, or while
            an output other than stdout was given.
    """
    # collections.MutableSequence was removed in Python 3.10; the ABC lives
    # in collections.abc. Imported locally (with a Python 2 fallback) so the
    # function does not rely on the file-level imports.
    try:
        from collections.abc import MutableSequence
    except ImportError:  # Python 2
        from collections import MutableSequence

    if getattr(args, "inplace", None):
        if args.input == DASH_STREAM_MAP["input"]:
            raise ValueError("Cannot use --inplace on stdin")

        if args.output != DASH_STREAM_MAP["output"]:
            raise ValueError("Cannot use -o and -p together")

        args.output = args.input

    # We don't use system default encoding, because usually one runs this
    # on files they got from elsewhere. As such, be opinionated that these
    # files are probably UTF-8. Looking for the BOM on reading allows us to
    # be more liberal with what we accept, without adding BOMs on write.
    r_enc = codecs.getreader(args.encoding or "utf-8-sig")
    w_enc = codecs.getwriter(args.encoding or "utf-8")

    def load_subtitles(source):
        """Parse one input, given as a standard byte stream or a file path."""
        if source in DASH_STREAM_MAP.values():
            return srt.parse(
                r_enc(source).read(), ignore_errors=args.ignore_parsing_errors
            )
        # We don't use encoding= option to open because we want to have the
        # same universal newlines behaviour as STD{IN,OUT}_BYTESTREAM. The
        # whole file is read before it is closed; only the parsing of the
        # resulting text is left to srt.parse.
        with open(source, "rb") as raw:
            return srt.parse(
                r_enc(raw).read(), ignore_errors=args.ignore_parsing_errors
            )

    for stream_name in ("input", "output"):
        log.debug('Processing stream "%s"', stream_name)

        try:
            stream = getattr(args, stream_name)
        except AttributeError:
            # For example, in the case of no_output
            continue

        log.debug("Got %r as stream", stream)
        is_std_stream = stream in DASH_STREAM_MAP.values()
        log.debug(
            "%s %s DASH_STREAM_MAP", stream_name, "in" if is_std_stream else "not in"
        )

        if stream_name == "output":
            # The output is a binary stream either way (we could not know the
            # encoding earlier), so there is no universal newline support on
            # it and line endings have to be handled by whoever writes to it.
            raw_output = stream if is_std_stream else open(stream, "wb")
            args.output = w_enc(raw_output)
        elif isinstance(stream, MutableSequence):
            # Multiple inputs: replace each entry of the list in place.
            for i, source in enumerate(stream):
                stream[i] = load_subtitles(source)
        else:
            args.input = load_subtitles(stream)


def compose_suggest_on_fail(subs, strict=True):
    """
    Compose ``subs`` with ``srt.compose`` and return the result.

    The call uses ``os.linesep`` as the line ending and ``in_place=True``;
    ``strict`` is forwarded unchanged.

    Raises:
        srt.SRTParseError: Re-raised unchanged after a critical log message
            suggesting a different ``--encoding``.
    """
    try:
        return srt.compose(subs, strict=strict, eol=os.linesep, in_place=True)
    except srt.SRTParseError:
        # A *parse* error can surface while composing because `subs` is
        # actually a generator: the input is only parsed as it is consumed.
        log.critical(
            "Parsing failed, maybe you need to pass a different encoding "
            "with --encoding?"
        )
        raise


def sliding_window(seq, width=2):
    """
    Yield successive overlapping tuples of ``width`` items from ``seq``.

    Each window drops the oldest item of the previous one and appends the
    next item of ``seq``. Nothing is yielded when ``seq`` has fewer than
    ``width`` items.
    """
    remaining = iter(seq)
    window = tuple(itertools.islice(remaining, width))

    # A short first window means the input ran out before filling it.
    if len(window) != width:
        return

    yield window
    for item in remaining:
        window = window[1:] + (item,)
        yield window