You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
97 lines
2.9 KiB
97 lines
2.9 KiB
3 years ago
|
#!/usr/bin/env python
|
||
|
|
||
|
"""Deduplicate repeated subtitles."""
|
||
|
|
||
|
import datetime
|
||
|
import srt_tools.utils
|
||
|
import logging
|
||
|
import operator
|
||
|
|
||
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

try:  # Python 2
    # Rebind ``range`` to ``xrange`` where that name exists. Where it does
    # not, the lookup raises NameError and ``range`` is left untouched.
    range = xrange  # pytype: disable=name-error
except NameError:
    pass
|
||
|
|
||
|
|
||
|
def parse_args():
    """Build this tool's argument parser and return the parsed arguments.

    The parser comes from ``srt_tools.utils.basic_parser`` and is extended
    with a ``-t``/``--ms`` option whose value is converted to a
    ``datetime.timedelta`` (available as ``args.ms``).
    """
    usage_examples = {
        "Remove duplicated subtitles within 5 seconds of each other": (
            "srt deduplicate -i duplicated.srt"
        ),
        "Remove duplicated subtitles within 500 milliseconds of each other": (
            "srt deduplicate -t 500 -i duplicated.srt"
        ),
        "Remove duplicated subtitles regardless of temporal proximity": (
            "srt deduplicate -t 0 -i duplicated.srt"
        ),
    }
    arg_parser = srt_tools.utils.basic_parser(
        description=__doc__, examples=usage_examples
    )

    # Used when -t/--ms is not given on the command line.
    default_window = datetime.timedelta(milliseconds=5000)
    window_help = (
        "how many milliseconds distance a subtitle start time must be within "
        "of another to be considered a duplicate (default: 5000ms)"
    )
    arg_parser.add_argument(
        "-t",
        "--ms",
        metavar="MILLISECONDS",
        default=default_window,
        # The command-line string is parsed as an integer millisecond count.
        type=lambda value: datetime.timedelta(milliseconds=int(value)),
        help=window_help,
    )

    return arg_parser.parse_args()
|
||
|
|
||
|
|
||
|
def deduplicate_subs(orig_subs, acceptable_diff):
    """Remove subtitles with duplicated content, modifying ``orig_subs`` in place.

    Subtitles are ordered by ``(content, start)`` and each one is compared
    with its immediate predecessor in that order. It is marked as a duplicate
    when both have equal ``content`` and it starts no later than
    ``acceptable_diff`` after the predecessor. A falsy ``acceptable_diff``
    (for example a zero ``timedelta``) disables the time check, so equal
    content alone is enough. The earlier-starting subtitle of a pair is the
    one that is kept.

    :param orig_subs: Mutable sequence of subtitle objects exposing
                      ``content``, ``start`` and ``index`` attributes.
    :param acceptable_diff: Maximum distance between start times for two
                            subtitles to count as duplicates; must be
                            addable to ``start``.
    :returns: None; the matching entries are deleted from ``orig_subs``.
    """
    indices_to_remove = set()

    # If we only store the subtitle itself and compare that, it's possible that
    # we'll not only remove the duplicate, but also the _original_ subtitle if
    # they have the same sub index/times/etc.
    #
    # As such, we need to also store the index in the original subs list that
    # this entry belongs to for each subtitle prior to sorting.
    sorted_subs = sorted(
        enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)
    )

    for subs in srt_tools.utils.sliding_window(sorted_subs, width=2, inclusive=False):
        cur_idx, cur_sub = subs[0]
        next_idx, next_sub = subs[1]

        if cur_sub.content == next_sub.content and (
            not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start
        ):
            log.debug(
                "Marking l%d/s%d for removal, duplicate of l%d/s%d",
                next_idx,
                next_sub.index,
                cur_idx,
                cur_sub.index,
            )
            indices_to_remove.add(next_idx)

    # Sets iterate in arbitrary order, so a running "offset" is only correct
    # if the indices happen to come out ascending. Deleting from the highest
    # index down never shifts an entry that is still waiting to be deleted.
    for idx in sorted(indices_to_remove, reverse=True):
        del orig_subs[idx]
|
||
|
|
||
|
|
||
|
def main():
    """Command-line entry point: read subtitles, drop duplicates, write them out."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)

    srt_tools.utils.set_basic_args(args)

    # Materialise the input so duplicates can be deleted from it in place.
    subtitles = list(args.input)
    deduplicate_subs(subtitles, args.ms)

    composed = srt_tools.utils.compose_suggest_on_fail(
        subtitles, strict=args.strict
    )

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        # Retry with the text explicitly encoded using the configured encoding.
        args.output.write(composed.encode(args.encoding))
|
||
|
|
||
|
|
||
|
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":  # pragma: no cover
    main()
|