#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Command-line entry point for synchronizing subtitles with a reference."""
import argparse
from datetime import datetime
import logging
import os
import shutil
import subprocess
import sys

import numpy as np

from .aligners import FFTAligner, MaxScoreAligner, FailedToFindAlignmentException
from .constants import *
from .ffmpeg_utils import ffmpeg_bin_path
from .sklearn_shim import Pipeline
from .speech_transformers import (
    VideoSpeechTransformer,
    DeserializeSpeechTransformer,
    make_subtitle_speech_pipeline
)
from .subtitle_parser import make_subtitle_parser
from .subtitle_transformers import SubtitleMerger, SubtitleShifter
from .version import get_version

logger = logging.getLogger(__name__)


def override(args, **kwargs):
    """Return a dict copy of ``args.__dict__`` updated with ``kwargs``.

    The namespace itself is left untouched.
    """
    args_dict = dict(args.__dict__)
    args_dict.update(kwargs)
    return args_dict


def _ref_format(ref_fname):
    """Return the last three characters of ``ref_fname`` (used as its format)."""
    return ref_fname[-3:]


def make_test_case(args, npy_savename, sync_was_successful):
    """Bundle log, input/output subtitles and reference data into an archive.

    A temporary directory named ``<reference>.<timestamp>`` is populated,
    archived with the first supported format from a preference list, and
    then removed again.

    Returns 0 on success and 1 if no archive format is available.
    Raises ``ValueError`` if ``npy_savename`` is None.
    """
    if npy_savename is None:
        raise ValueError('need non-null npy_savename')
    tar_dir = '{}.{}'.format(
        args.reference,
        datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    )
    logger.info('creating test archive {}.tar.gz...'.format(tar_dir))
    os.mkdir(tar_dir)
    try:
        log_path = 'ffsubsync.log'
        if args.log_dir_path and os.path.isdir(args.log_dir_path):
            log_path = os.path.join(args.log_dir_path, log_path)
        shutil.copy(log_path, tar_dir)
        shutil.copy(args.srtin[0], tar_dir)
        if sync_was_successful:
            shutil.move(args.srtout, tar_dir)
        if _ref_format(args.reference) in SUBTITLE_EXTENSIONS:
            shutil.copy(args.reference, tar_dir)
        elif args.serialize_speech or args.reference == npy_savename:
            # the serialized speech must survive, so copy instead of move
            shutil.copy(npy_savename, tar_dir)
        else:
            shutil.move(npy_savename, tar_dir)
        # A comprehension (rather than zip(*...)[0]) also copes with an
        # empty list of archive formats instead of raising IndexError.
        supported_formats = {name for name, _ in shutil.get_archive_formats()}
        preferred_formats = ['gztar', 'bztar', 'xztar', 'zip', 'tar']
        for archive_format in preferred_formats:
            if archive_format in supported_formats:
                shutil.make_archive(tar_dir, archive_format, os.curdir, tar_dir)
                break
        else:
            logger.error('failed to create test archive; no formats supported '
                         '(this should not happen)')
            return 1
        logger.info('...done')
    finally:
        shutil.rmtree(tar_dir)
    return 0


def get_srt_pipe_maker(args, srtin):
    """Return a factory mapping a scale factor to a subtitle speech pipeline.

    The subtitle format is taken from the extension of ``srtin``; when
    ``srtin`` is None the format defaults to ``'srt'``. A single parser
    (created with ``caching=True``) is shared by all pipelines the factory
    creates.
    """
    if srtin is None:
        srtin_format = 'srt'
    else:
        srtin_format = os.path.splitext(srtin)[-1][1:]
    parser = make_subtitle_parser(fmt=srtin_format, caching=True, **args.__dict__)
    return lambda scale_factor: make_subtitle_speech_pipeline(
        **override(args, scale_factor=scale_factor, parser=parser)
    )


def get_framerate_ratios_to_try(args):
    """Return the candidate framerate ratios to try during alignment.

    Empty when ``args.no_fix_framerate`` is set; otherwise every entry of
    ``FRAMERATE_RATIOS`` followed by their reciprocals, plus a trailing
    ``None`` when ``args.gss`` is set.
    """
    if args.no_fix_framerate:
        return []
    framerate_ratios = list(np.concatenate([
        np.array(FRAMERATE_RATIOS), 1. / np.array(FRAMERATE_RATIOS)
    ]))
    if args.gss:
        framerate_ratios.append(None)
    return framerate_ratios


def try_sync(args, reference_pipe, result):
    """Align every input subtitle file against the reference and write output.

    On success ``result`` receives ``offset_seconds`` and
    ``framerate_scale_factor`` (values of the last processed input).
    ``result['sync_was_successful']`` is set unless an unexpected exception
    propagates.

    Returns True on success, False if no alignment could be found.
    Any exception other than ``FailedToFindAlignmentException`` is logged
    and re-raised.
    """
    sync_was_successful = True
    try:
        logger.info('extracting speech segments from %s...',
                    'stdin' if not args.srtin else 'subtitles file(s) {}'.format(args.srtin))
        if not args.srtin:
            args.srtin = [None]
        for srtin in args.srtin:
            srtout = srtin if args.overwrite_input else args.srtout
            srt_pipe_maker = get_srt_pipe_maker(args, srtin)
            framerate_ratios = get_framerate_ratios_to_try(args)
            srt_pipes = [srt_pipe_maker(1.)] + [srt_pipe_maker(rat) for rat in framerate_ratios]
            for srt_pipe in srt_pipes:
                # callables are left unfitted here
                if callable(srt_pipe):
                    continue
                srt_pipe.fit(srtin)
            if not args.skip_infer_framerate_ratio and hasattr(reference_pipe[-1], 'num_frames'):
                inferred_framerate_ratio_from_length = (
                    float(reference_pipe[-1].num_frames) / srt_pipes[0][-1].num_frames
                )
                logger.info('inferred frameratio ratio: %.3f' % inferred_framerate_ratio_from_length)
                srt_pipes.append(srt_pipe_maker(inferred_framerate_ratio_from_length).fit(srtin))
                logger.info('...done')
            logger.info('computing alignments...')
            if args.skip_sync:
                best_score = 0.
                best_srt_pipe = srt_pipes[0]
                if callable(best_srt_pipe):
                    best_srt_pipe = best_srt_pipe(1.0).fit(srtin)
                offset_samples = 0
            else:
                (best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
                    FFTAligner, srtin, SAMPLE_RATE, args.max_offset_seconds
                ).fit_transform(
                    reference_pipe.transform(args.reference),
                    srt_pipes,
                )
            logger.info('...done')
            offset_seconds = offset_samples / float(SAMPLE_RATE) + args.apply_offset_seconds
            scale_step = best_srt_pipe.named_steps['scale']
            logger.info('score: %.3f', best_score)
            logger.info('offset seconds: %.3f', offset_seconds)
            logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
            output_steps = [('shift', SubtitleShifter(offset_seconds))]
            if args.merge_with_reference:
                output_steps.append(
                    ('merge', SubtitleMerger(reference_pipe.named_steps['parse'].subs_))
                )
            output_pipe = Pipeline(output_steps)
            out_subs = output_pipe.fit_transform(scale_step.subs_)
            if args.output_encoding != 'same':
                out_subs = out_subs.set_encoding(args.output_encoding)
            logger.info('writing output to {}'.format(srtout or 'stdout'))
            out_subs.write_file(srtout)
    except FailedToFindAlignmentException as e:
        sync_was_successful = False
        logger.error(e)
    except Exception as e:
        # Unexpected failure: log it and let it propagate. A bare ``raise``
        # replaces the former ``raise``/``return`` pair inside ``finally``.
        logger.error(e)
        raise
    else:
        result['offset_seconds'] = offset_seconds
        result['framerate_scale_factor'] = scale_step.scale_factor
    result['sync_was_successful'] = sync_was_successful
    return sync_was_successful


def make_reference_pipe(args):
    """Build the pipeline that extracts speech from ``args.reference``.

    The pipeline type depends on the reference format: a subtitle speech
    pipeline for subtitle extensions, a deserializing pipeline for
    ``npy``/``npz``, and a video speech extraction pipeline otherwise.
    """
    ref_format = _ref_format(args.reference)
    if ref_format in SUBTITLE_EXTENSIONS:
        if args.vad is not None:
            logger.warning('Vad specified, but reference was not a movie')
        return make_subtitle_speech_pipeline(
            fmt=ref_format,
            **override(
                args,
                encoding=args.reference_encoding or DEFAULT_ENCODING
            )
        )
    elif ref_format in ('npy', 'npz'):
        if args.vad is not None:
            logger.warning('Vad specified, but reference was not a movie')
        return Pipeline([
            ('deserialize', DeserializeSpeechTransformer(args.non_speech_label))
        ])
    else:
        vad = args.vad or DEFAULT_VAD
        if args.reference_encoding is not None:
            logger.warning('Reference srt encoding specified, but reference was a video file')
        ref_stream = args.reference_stream
        # allow the leading `0:` of the stream specifier to be omitted
        if ref_stream is not None and not ref_stream.startswith('0:'):
            ref_stream = '0:' + ref_stream
        return Pipeline([
            ('speech_extract', VideoSpeechTransformer(
                vad=vad,
                sample_rate=SAMPLE_RATE,
                frame_rate=args.frame_rate,
                non_speech_label=args.non_speech_label,
                start_seconds=args.start_seconds,
                ffmpeg_path=args.ffmpeg_path,
                ref_stream=ref_stream,
                vlc_mode=args.vlc_mode,
                gui_mode=args.gui_mode
            )),
        ])


def extract_subtitles_from_reference(args):
    """Extract a subtitle stream from the reference by invoking ffmpeg.

    ``args.extract_subs_from_stream`` may be given as ``0:s:N``, ``s:N`` or
    just ``N``; all are normalized to the ``0:s:...`` form before being
    passed to ffmpeg's ``-map`` option. Output goes to ``args.srtout`` or to
    stdout when that is None.

    Returns ffmpeg's return code.
    """
    stream = args.extract_subs_from_stream
    # Check the `s:` prefix first: previously that branch was unreachable,
    # so `s:2` was turned into the invalid specifier `0:s:s:2`.
    if stream.startswith('s:'):
        stream = '0:{}'.format(stream)
    elif not stream.startswith('0:s:'):
        stream = '0:s:{}'.format(stream)
    ffmpeg_args = [ffmpeg_bin_path('ffmpeg', args.gui_mode, ffmpeg_resources_path=args.ffmpeg_path)]
    ffmpeg_args.extend([
        '-y',
        '-nostdin',
        '-loglevel', 'fatal',
        '-i', args.reference,
        '-map', '{}'.format(stream),
        '-f', 'srt',
    ])
    if args.srtout is None:
        ffmpeg_args.append('-')
    else:
        ffmpeg_args.append(args.srtout)
    logger.info('attempting to extract subtitles to {} ...'.format(
        'stdout' if args.srtout is None else args.srtout))
    retcode = subprocess.call(ffmpeg_args)
    if retcode == 0:
        logger.info('...done')
    else:
        logger.error('ffmpeg unable to extract subtitles from reference; return code %d', retcode)
    return retcode


def validate_args(args):
    """Check the parsed arguments for inconsistent combinations.

    Raises ``ValueError`` describing the first problem found. As a side
    effect, the module logger level is set to CRITICAL in VLC mode.
    """
    if args.vlc_mode:
        logger.setLevel(logging.CRITICAL)
    # `-i` is optional, in which case argparse leaves srtin as None; treat
    # that as "no inputs" rather than crashing in len().
    num_inputs = len(args.srtin) if args.srtin else 0
    if num_inputs > 1 and not args.overwrite_input:
        raise ValueError('cannot specify multiple input srt files without overwriting')
    if num_inputs > 1 and args.make_test_case:
        raise ValueError('cannot specify multiple input srt files for test cases')
    if num_inputs > 1 and args.gui_mode:
        raise ValueError('cannot specify multiple input srt files in GUI mode')
    if args.make_test_case and not args.gui_mode:  # this validation not necessary for gui mode
        if not args.srtin or args.srtout is None:
            raise ValueError('need to specify input and output srt files for test cases')
    if args.overwrite_input:
        if args.extract_subs_from_stream is not None:
            raise ValueError('input overwriting not allowed for extracting subtitles from reference')
        if not args.srtin:
            raise ValueError(
                'need to specify input srt if --overwrite-input is specified since we cannot overwrite stdin'
            )
        if args.srtout is not None:
            raise ValueError(
                'overwrite input set but output file specified; refusing to run in case this was not intended'
            )
    if args.extract_subs_from_stream is not None:
        if args.make_test_case:
            raise ValueError('test case is for sync and not subtitle extraction')
        if args.srtin:
            raise ValueError('stream specified for reference subtitle extraction; -i flag for sync input not allowed')


def validate_file_permissions(args):
    """Verify that the files to be read/written are accessible.

    Raises ``ValueError`` if the reference or an input subtitle file is not
    readable, or if an existing output / serialized-speech file is not
    writable.
    """
    error_string_template = 'unable to {action} {file}; try ensuring file exists and has correct permissions'
    if not os.access(args.reference, os.R_OK):
        raise ValueError(error_string_template.format(action='read reference', file=args.reference))
    # srtin is None when `-i` was not given
    for srtin in args.srtin or []:
        if srtin is not None and not os.access(srtin, os.R_OK):
            raise ValueError(error_string_template.format(action='read input subtitles', file=srtin))
    if args.srtout is not None and os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
        raise ValueError(error_string_template.format(action='write output subtitles', file=args.srtout))
    if args.make_test_case or args.serialize_speech:
        npy_savename = os.path.splitext(args.reference)[0] + '.npz'
        if os.path.exists(npy_savename) and not os.access(npy_savename, os.W_OK):
            raise ValueError('unable to write test case file archive %s (try checking permissions)' % npy_savename)


def run(args):
    """Run validation, reference extraction and (optionally) synchronization.

    Returns a dict with the keys ``retval`` (0 on success), ``offset_seconds``,
    ``framerate_scale_factor`` and ``sync_was_successful``.
    """
    result = {
        'retval': 0,
        'offset_seconds': None,
        'framerate_scale_factor': None,
        'sync_was_successful': None
    }
    try:
        validate_args(args)
    except ValueError as e:
        logger.error(e)
        result['retval'] = 1
        return result
    # `-i` may be omitted (None) or given without values ([]).
    first_srtin = args.srtin[0] if args.srtin else None
    if args.gui_mode and args.srtout is None and first_srtin is not None:
        args.srtout = '{}.synced.srt'.format(os.path.splitext(first_srtin)[0])
    try:
        validate_file_permissions(args)
    except ValueError as e:
        logger.error(e)
        result['retval'] = 1
        return result
    ref_format = _ref_format(args.reference)
    if args.merge_with_reference and ref_format not in SUBTITLE_EXTENSIONS:
        logger.error('merging synced output with reference only valid '
                     'when reference composed of subtitles')
        result['retval'] = 1
        return result
    log_handler = None
    log_path = None
    if args.make_test_case:
        # capture the log to a file so it can be added to the test archive
        log_path = 'ffsubsync.log'
        if args.log_dir_path and os.path.isdir(args.log_dir_path):
            log_path = os.path.join(args.log_dir_path, log_path)
        log_handler = logging.FileHandler(log_path)
        logger.addHandler(log_handler)
    if args.extract_subs_from_stream is not None:
        result['retval'] = extract_subtitles_from_reference(args)
        return result
    reference_pipe = make_reference_pipe(args)
    logger.info("extracting speech segments from reference '%s'...", args.reference)
    reference_pipe.fit(args.reference)
    logger.info('...done')
    npy_savename = None
    if args.make_test_case or args.serialize_speech:
        logger.info('serializing speech...')
        npy_savename = os.path.splitext(args.reference)[0] + '.npz'
        np.savez_compressed(npy_savename, speech=reference_pipe.transform(args.reference))
        logger.info('...done')
    if first_srtin is None:
        logger.info('unsynchronized subtitle file not specified; skipping synchronization')
        return result
    sync_was_successful = try_sync(args, reference_pipe, result)
    if log_handler is not None and log_path is not None:
        assert args.make_test_case
        # close the handler first so the log file is complete when archived
        log_handler.close()
        logger.removeHandler(log_handler)
        try:
            result['retval'] += make_test_case(args, npy_savename, sync_was_successful)
        finally:
            os.remove(log_path)
    return result


def add_main_args_for_cli(parser):
    """Register the primary arguments (reference, input/output, etc.)."""
    parser.add_argument(
        'reference',
        help='Reference (video, subtitles, or a numpy array with VAD speech) to which to synchronize input subtitles.'
    )
    parser.add_argument('-i', '--srtin', nargs='*', help='Input subtitles file (default=stdin).')
    parser.add_argument('-o', '--srtout', help='Output subtitles file (default=stdout).')
    parser.add_argument('--merge-with-reference', '--merge', action='store_true',
                        help='Merge reference subtitles with synced output subtitles.')
    parser.add_argument('--make-test-case', '--create-test-case', action='store_true',
                        help='If specified, serialize reference speech to a numpy array, '
                             'and create an archive with input/output subtitles '
                             'and serialized speech.')
    parser.add_argument(
        '--reference-stream', '--refstream', '--reference-track', '--reftrack',
        default=None,
        help='Which stream/track in the video file to use as reference, '
             'formatted according to ffmpeg conventions. For example, 0:s:0 '
             'uses the first subtitle track; 0:a:3 would use the third audio track. '
             'You can also drop the leading `0:`; i.e. use s:0 or a:3, respectively. '
             'Example: `ffs ref.mkv -i in.srt -o out.srt --reference-stream s:2`'
    )


def add_cli_only_args(parser):
    """Register the remaining command-line options on ``parser``."""
    parser.add_argument('-v', '--version', action='version',
                        version='{package} {version}'.format(package=__package__, version=get_version()))
    parser.add_argument('--overwrite-input', action='store_true',
                        help='If specified, will overwrite the input srt instead of writing the output to a new file.')
    parser.add_argument('--encoding', default=DEFAULT_ENCODING,
                        help='What encoding to use for reading input subtitles '
                             '(default=%s).' % DEFAULT_ENCODING)
    parser.add_argument('--max-subtitle-seconds', type=float, default=DEFAULT_MAX_SUBTITLE_SECONDS,
                        help='Maximum duration for a subtitle to appear on-screen '
                             '(default=%.3f seconds).' % DEFAULT_MAX_SUBTITLE_SECONDS)
    parser.add_argument('--start-seconds', type=int, default=DEFAULT_START_SECONDS,
                        help='Start time for processing '
                             '(default=%d seconds).' % DEFAULT_START_SECONDS)
    parser.add_argument('--max-offset-seconds', type=float, default=DEFAULT_MAX_OFFSET_SECONDS,
                        help='The max allowed offset seconds for any subtitle segment '
                             '(default=%d seconds).' % DEFAULT_MAX_OFFSET_SECONDS)
    parser.add_argument('--apply-offset-seconds', type=float, default=DEFAULT_APPLY_OFFSET_SECONDS,
                        help='Apply a predefined offset in seconds to all subtitle segments '
                             '(default=%d seconds).' % DEFAULT_APPLY_OFFSET_SECONDS)
    parser.add_argument('--frame-rate', type=int, default=DEFAULT_FRAME_RATE,
                        help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
    parser.add_argument('--skip-infer-framerate-ratio', action='store_true',
                        help='If set, do not try to infer framerate ratio based on duration ratio.')
    parser.add_argument('--non-speech-label', type=float, default=DEFAULT_NON_SPEECH_LABEL,
                        help='Label to use for frames detected as non-speech (default=%f)' % DEFAULT_NON_SPEECH_LABEL)
    parser.add_argument('--output-encoding', default='utf-8',
                        help='What encoding to use for writing output subtitles '
                             '(default=utf-8). Can indicate "same" to use same '
                             'encoding as that of the input.')
    parser.add_argument('--reference-encoding',
                        help='What encoding to use for reading / writing reference subtitles '
                             '(if applicable, default=infer).')
    parser.add_argument('--vad', choices=['subs_then_webrtc', 'webrtc', 'subs_then_auditok', 'auditok'],
                        default=None,
                        help='Which voice activity detector to use for speech extraction '
                             '(if using video / audio as a reference, default={}).'.format(DEFAULT_VAD))
    parser.add_argument('--no-fix-framerate', action='store_true',
                        help='If specified, subsync will not attempt to correct a framerate '
                             'mismatch between reference and subtitles.')
    parser.add_argument('--serialize-speech', action='store_true',
                        help='If specified, serialize reference speech to a numpy array.')
    parser.add_argument('--extract-subs-from-stream', default=None,
                        help='If specified, do not attempt sync; instead, just extract subtitles'
                             ' from the specified stream using the reference.')
    parser.add_argument(
        '--ffmpeg-path', '--ffmpegpath', default=None,
        help='Where to look for ffmpeg and ffprobe. Uses the system PATH by default.'
    )
    parser.add_argument('--log-dir-path', default=None,
                        help='Where to save ffsubsync.log file (must be an existing '
                             'directory).')
    # hidden options: suppressed from --help output
    parser.add_argument('--vlc-mode', action='store_true', help=argparse.SUPPRESS)
    parser.add_argument('--gui-mode', action='store_true', help=argparse.SUPPRESS)
    parser.add_argument('--skip-sync', action='store_true', help=argparse.SUPPRESS)
    parser.add_argument('--gss', action='store_true', help=argparse.SUPPRESS)


def make_parser():
    """Create the argument parser with all main and CLI-only arguments."""
    parser = argparse.ArgumentParser(description='Synchronize subtitles with video.')
    add_main_args_for_cli(parser)
    add_cli_only_args(parser)
    return parser


def main():
    """Parse command-line arguments, run, and return the exit status."""
    parser = make_parser()
    args = parser.parse_args()
    return run(args)['retval']


if __name__ == "__main__":
    sys.exit(main())