bazarr/libs/srt_tools/srt-deduplicate

#!/usr/bin/env python

"""Deduplicate repeated subtitles."""

import datetime
import srt_tools.utils
import logging
import operator

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Python 2 compatibility: where the xrange builtin exists, rebind range to it;
# on Python 3 the lookup raises NameError and the built-in range is kept.
try:  # Python 2
    range = xrange  # pytype: disable=name-error
except NameError:
    pass


def parse_args():
    """Define the command-line interface and return the parsed arguments.

    The parser itself comes from ``srt_tools.utils.basic_parser``; this
    function adds the ``-t``/``--ms`` option on top of it.  The option's value
    is converted to a ``datetime.timedelta`` and defaults to 5000 milliseconds.
    """
    usage_examples = {
        "Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt",
        "Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt",
        "Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt",
    }
    arg_parser = srt_tools.utils.basic_parser(
        description=__doc__, examples=usage_examples
    )

    # Keyword settings for -t/--ms, gathered up front so the add_argument
    # call below stays short.
    ms_option = dict(
        metavar="MILLISECONDS",
        default=datetime.timedelta(milliseconds=5000),
        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
        help=(
            "how many milliseconds distance a subtitle start time must be "
            "within of another to be considered a duplicate "
            "(default: 5000ms)"
        ),
    )
    arg_parser.add_argument("-t", "--ms", **ms_option)

    return arg_parser.parse_args()


def deduplicate_subs(orig_subs, acceptable_diff):
    """Remove subtitles with duplicated content, modifying ``orig_subs`` in place.

    The subtitles are ordered by ``(content, start)`` and each neighbouring
    pair in that order is compared.  The later entry of a pair is removed when
    both have the same content and its start is no more than
    ``acceptable_diff`` after the earlier entry's start.  A falsy
    ``acceptable_diff`` (for example a zero timedelta) switches the time check
    off, so every repeat of a given content is removed.

    Each entry is compared with its immediate predecessor in sorted order,
    whether or not that predecessor has itself been marked for removal.

    :param orig_subs: Mutable list of subtitle objects exposing ``content``,
                      ``start`` and ``index`` attributes
    :param acceptable_diff: Largest allowed gap between start times; it must
                            be addable to ``start`` (the command line passes
                            a ``datetime.timedelta``)
    :returns: None; ``orig_subs`` is edited in place
    """
    indices_to_remove = set()

    # If we only store the subtitle itself and compare that, it's possible that
    # we'll not only remove the duplicate, but also the _original_ subtitle if
    # they have the same sub index/times/etc.
    #
    # As such, we need to also store the index in the original subs list that
    # this entry belongs to for each subtitle prior to sorting.
    sorted_subs = sorted(
        enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)
    )

    for subs in srt_tools.utils.sliding_window(sorted_subs, width=2, inclusive=False):
        cur_idx, cur_sub = subs[0]
        next_idx, next_sub = subs[1]

        if cur_sub.content == next_sub.content and (
            not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start
        ):
            log.debug(
                "Marking l%d/s%d for removal, duplicate of l%d/s%d",
                next_idx,
                next_sub.index,
                cur_idx,
                cur_sub.index,
            )
            indices_to_remove.add(next_idx)

    # A set has no guaranteed iteration order, so walking it while subtracting
    # a running offset could delete the wrong entries.  Deleting from the
    # highest position downwards leaves every lower position unchanged, so no
    # offset bookkeeping is needed.
    for idx in sorted(indices_to_remove, reverse=True):
        del orig_subs[idx]


def main():
    """Entry point: read the subtitles, drop duplicates, write the result."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    # Materialise the input as a list because deduplicate_subs edits it in place.
    subtitles = list(args.input)
    deduplicate_subs(subtitles, args.ms)

    composed = srt_tools.utils.compose_suggest_on_fail(
        subtitles, strict=args.strict
    )

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        encoded = composed.encode(args.encoding)
        args.output.write(encoded)

# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":  # pragma: no cover
    main()
Upgraded vendored Python dependencies to the latest versions and removed the unused dependencies. 3 years ago			`#!/usr/bin/env python`

			`"""Deduplicate repeated subtitles."""`

			`import datetime`
			`import srt_tools.utils`
			`import logging`
			`import operator`

			`log = logging.getLogger(__name__)`

			`try: # Python 2`
			`range = xrange # pytype: disable=name-error`
			`except NameError:`
			`pass`


			`def parse_args():`
			`examples = {`
			`"Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt",`
			`"Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt",`
			`"Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt",`
			`}`
			`parser = srt_tools.utils.basic_parser(`
			`description=__doc__,`
			`examples=examples,`
			`)`
			`parser.add_argument(`
			`"-t",`
			`"--ms",`
			`metavar="MILLISECONDS",`
			`default=datetime.timedelta(milliseconds=5000),`
			`type=lambda ms: datetime.timedelta(milliseconds=int(ms)),`
			`help="how many milliseconds distance a subtitle start time must be "`
			`"within of another to be considered a duplicate "`
			`"(default: 5000ms)",`
			`)`

			`return parser.parse_args()`


			`def deduplicate_subs(orig_subs, acceptable_diff):`
			`"""Remove subtitles with duplicated content."""`
			`indices_to_remove = set()`

			`# If we only store the subtitle itself and compare that, it's possible that`
			`# we'll not only remove the duplicate, but also the _original_ subtitle if`
			`# they have the same sub index/times/etc.`
			`#`
			`# As such, we need to also store the index in the original subs list that`
			`# this entry belongs to for each subtitle prior to sorting.`
			`sorted_subs = sorted(`
			`enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)`
			`)`

			`for subs in srt_tools.utils.sliding_window(sorted_subs, width=2, inclusive=False):`
			`cur_idx, cur_sub = subs[0]`
			`next_idx, next_sub = subs[1]`

			`if cur_sub.content == next_sub.content and (`
			`not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start`
			`):`
			`log.debug(`
			`"Marking l%d/s%d for removal, duplicate of l%d/s%d",`
			`next_idx,`
			`next_sub.index,`
			`cur_idx,`
			`cur_sub.index,`
			`)`
			`indices_to_remove.add(next_idx)`

			`offset = 0`
			`for idx in indices_to_remove:`
			`del orig_subs[idx - offset]`
			`offset += 1`


			`def main():`
			`args = parse_args()`
			`logging.basicConfig(level=args.log_level)`

			`srt_tools.utils.set_basic_args(args)`

			`subs = list(args.input)`
			`deduplicate_subs(subs, args.ms)`

			`output = srt_tools.utils.compose_suggest_on_fail(subs, strict=args.strict)`

			`try:`
			`args.output.write(output)`
			`except (UnicodeEncodeError, TypeError): # Python 2 fallback`
			`args.output.write(output.encode(args.encoding))`


			`if __name__ == "__main__": # pragma: no cover`
			`main()`