|
|
|
#!/usr/bin/env python
|
|
|
|
# encoding: utf-8
|
|
|
|
"""
|
|
|
|
`auditok` -- An Audio Activity Detection tool
|
|
|
|
|
|
|
|
`auditok` is a program that can be used for Audio/Acoustic
|
|
|
|
activity detection. It can read audio data from audio files as well
|
|
|
|
as from the microphone or standard input.
|
|
|
|
|
|
|
|
@author: Mohamed El Amine SEHILI
|
|
|
|
@copyright: 2015-2021 Mohamed El Amine SEHILI
|
|
|
|
@license: MIT
|
|
|
|
@contact: amine.sehili@gmail.com
|
|
|
|
@deffield updated: 01 Mar 2021
|
|
|
|
"""
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
from argparse import ArgumentParser
|
|
|
|
import time
|
|
|
|
import threading
|
|
|
|
|
|
|
|
from auditok import __version__, AudioRegion
|
|
|
|
from .util import AudioDataSource
|
|
|
|
from .exceptions import EndOfProcessing, AudioEncodingWarning
|
|
|
|
from .io import player_for
|
|
|
|
from .cmdline_util import make_logger, make_kwargs, initialize_workers
|
|
|
|
from . import workers
|
|
|
|
|
|
|
|
|
|
|
|
# Empty export list: `from <this module> import *` brings in no names.
__all__ = []
|
|
|
|
# presumably the date this module was first written -- TODO confirm
__date__ = "2015-11-23"
|
|
|
|
# Date of the last update; same date as the "updated" field of the module
# docstring (01 Mar 2021).
__updated__ = "2021-03-01"
|
|
|
|
|
|
|
|
|
|
|
|
def _build_parser(program_name):
    """Build the command-line argument parser.

    Parameters
    ----------
    program_name : str
        Name shown as the program name in usage and help messages.

    Returns
    -------
    ArgumentParser
        Parser with every input/output, tokenization, audio-parameter and
        action option registered.
    """
    parser = ArgumentParser(
        prog=program_name, description="An Audio Tokenization tool"
    )
    parser.add_argument(
        "--version", "-v", action="version", version=__version__
    )

    group = parser.add_argument_group("Input-Output options")
    group.add_argument(
        dest="input",
        help="Input audio or video file. Use '-' for stdin "
        "[default: read from microphone using pyaudio]",
        metavar="input",
        nargs="?",
        default=None,
    )
    group.add_argument(
        "-I",
        "--input-device-index",
        dest="input_device_index",
        help="Audio device index [default: %(default)s]. "
        "Optional and only effective when using PyAudio",
        type=int,
        default=None,
        metavar="INT",
    )
    group.add_argument(
        "-F",
        "--audio-frame-per-buffer",
        dest="frame_per_buffer",
        help="Audio frame per buffer [default: %(default)s]. "
        "Optional and only effective when using PyAudio",
        type=int,
        default=1024,
        metavar="INT",
    )
    group.add_argument(
        "-f",
        "--input-format",
        dest="input_format",
        type=str,
        default=None,
        help="Input audio file format. If not given, guess format from "
        "extension. If input file name has no extension, guess format "
        "from file header (requires pydub). If none of the previous is "
        "true, raise an error",
        metavar="STRING",
    )
    group.add_argument(
        "-M",
        "--max-read",
        dest="max_read",
        type=float,
        default=None,
        help="Maximum data (in seconds) to read from microphone or file "
        "[default: read until the end of file/stream]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-L",
        "--large-file",
        dest="large_file",
        action="store_true",
        default=False,
        help="Whether input file should be treated as a large file. "
        "If True, data will be read from file on demand, otherwise all "
        "audio data is loaded to memory before tokenization.",
    )
    group.add_argument(
        "-O",
        "--save-stream",
        dest="save_stream",
        type=str,
        default=None,
        help="Save acquired audio data (from file or microphone) to disk."
        " If omitted no data will be saved. [default: omitted]",
        metavar="FILE",
    )
    group.add_argument(
        "-o",
        "--save-detections-as",
        dest="save_detections_as",
        type=str,
        default=None,
        help="File name format for detections. "
        "The following placeholders can be used to build output file name "
        "for each detection: {id} (sequential, starts from 1), {start}, "
        "{end} and {duration}. Time placeholders are in seconds. "
        "Example: 'Event_{id}_{start}-{end}_{duration:.3f}.wav'",
        metavar="STRING",
    )
    group.add_argument(
        "-T",
        "--output-format",
        dest="output_format",
        type=str,
        default=None,
        help="Audio format used to save detections and/or main stream. "
        "If not supplied, then it will: (1. be guessed from extension or "
        "(2. use raw format",
        metavar="STRING",
    )
    group.add_argument(
        "-u",
        "--use-channel",
        dest="use_channel",
        type=str,
        default=None,
        help="Which channel to use for tokenization when input stream is "
        "multi-channel (0 is the first channel). Default is None, meaning "
        "that all channels will be considered for tokenization (i.e., get "
        "any valid audio event regardless of the channel it occurs in). "
        "This value can also be 'mix' (alias 'avg' or 'average') and "
        "means mix down all audio channels into one channel (i.e. compute "
        "average channel) and use the resulting channel for tokenization. "
        "Whatever option is used, saved audio events will contain the same"
        " number of channels as input stream. "
        "[Default: None, use all channels]",
        metavar="INT/STRING",
    )

    group = parser.add_argument_group(
        "Tokenization options", "Set tokenizer options."
    )
    group.add_argument(
        "-a",
        "--analysis-window",
        dest="analysis_window",
        default=0.01,
        type=float,
        help="Size of analysis window in seconds [default: %(default)s "
        "(10ms)]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-n",
        "--min-duration",
        dest="min_duration",
        type=float,
        default=0.2,
        help="Min duration of a valid audio event in seconds "
        "[default: %(default)s]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-m",
        "--max-duration",
        dest="max_duration",
        type=float,
        default=5,
        help="Max duration of a valid audio event in seconds "
        "[default: %(default)s]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-s",
        "--max-silence",
        dest="max_silence",
        type=float,
        default=0.3,
        help="Max duration of a consecutive silence within a valid audio "
        "event in seconds [default: %(default)s]",
        metavar="FLOAT",
    )
    group.add_argument(
        "-d",
        "--drop-trailing-silence",
        dest="drop_trailing_silence",
        action="store_true",
        default=False,
        help="Drop trailing silence from a detection [default: keep "
        "trailing silence]",
    )
    group.add_argument(
        "-R",
        "--strict-min-duration",
        dest="strict_min_duration",
        action="store_true",
        default=False,
        help="Reject an event shorter than --min-duration even if it's "
        "adjacent to the latest valid event that reached max-duration "
        "[default: keep such events]",
    )
    group.add_argument(
        "-e",
        "--energy-threshold",
        dest="energy_threshold",
        type=float,
        default=50,
        help="Log energy threshold for detection [default: %(default)s]",
        metavar="FLOAT",
    )

    group = parser.add_argument_group(
        "Audio parameters",
        "Define audio parameters if data is read from a "
        "headerless file (raw or stdin) or you want to use "
        "different microphone parameters.",
    )
    group.add_argument(
        "-r",
        "--rate",
        dest="sampling_rate",
        type=int,
        default=16000,
        help="Sampling rate of audio data [default: %(default)s]",
        metavar="INT",
    )
    group.add_argument(
        "-c",
        "--channels",
        dest="channels",
        type=int,
        default=1,
        help="Number of channels of audio data [default: %(default)s]",
        metavar="INT",
    )
    group.add_argument(
        "-w",
        "--width",
        dest="sample_width",
        type=int,
        default=2,
        help="Number of bytes per audio sample [default: %(default)s]",
        metavar="INT",
    )

    group = parser.add_argument_group(
        "Do something with audio events",
        "Use these options to print, play back or plot detections.",
    )
    group.add_argument(
        "-C",
        "--command",
        dest="command",
        type=str,
        help="Command to call when an audio detection occurs. Use '{file}' "
        "as a placeholder for the temporary wav file that will contain "
        "event's data (e.g., \"-C 'du -h {file}'\" to print out file size "
        " or \"-C 'play -q {file}'\" to play audio with sox)",
        metavar="STRING",
    )
    group.add_argument(
        "-E",
        "--echo",
        dest="echo",
        action="store_true",
        default=False,
        help="Play back each detection immediately using pyaudio",
    )
    group.add_argument(
        "-B",
        "--progress-bar",
        dest="progress_bar",
        action="store_true",
        default=False,
        help="Show a progress bar when playing audio",
    )
    group.add_argument(
        "-p",
        "--plot",
        dest="plot",
        action="store_true",
        default=False,
        help="Plot and show audio signal and detections (requires "
        "matplotlib)",
    )
    group.add_argument(
        "--save-image",
        dest="save_image",
        type=str,
        help="Save plotted audio signal and detections as a picture or a "
        "PDF file (requires matplotlib)",
        metavar="FILE",
    )
    group.add_argument(
        "--printf",
        dest="printf",
        type=str,
        default="{id} {start} {end}",
        help="Print audio events information, one per line, using this "
        "format. Format can contain text with the following placeholders: "
        "{id} (sequential, starts from 1), {start}, {end}, {duration} and "
        "{timestamp}. The first 3 time placeholders are in seconds and "
        "their format can be set using --time-format argument. "
        "{timestamp} is the system timestamp (date and time) of the event "
        "and can be set using --timestamp-format argument.\n"
        "Example: '[{id}]: {start} -> {end} -- {timestamp}'",
        metavar="STRING",
    )
    group.add_argument(
        "--time-format",
        dest="time_format",
        type=str,
        default="%S",
        help="Format used to print {start}, {end} and {duration} "
        "placeholders used with --printf [default= %(default)s]. The "
        "following formats are accepted:\n"
        "%%S: absolute time in seconds. %%I: absolute time in ms. If at "
        "least one of (%%h, %%m, %%s, %%i) is used, convert time into "
        "hours, minutes, seconds and millis (e.g. %%h:%%m:%%s.%%i). Only "
        "supplied fields are printed. Note that %%S and %%I can only be "
        "used alone",
        metavar="STRING",
    )
    group.add_argument(
        "--timestamp-format",
        dest="timestamp_format",
        type=str,
        default="%Y/%m/%d %H:%M:%S",
        help="Format used to print {timestamp}. Should be a format "
        "accepted by 'datetime' standard module. Default: "
        "'%%Y/%%m/%%d %%H:%%M:%%S'",
    )

    # General options, attached to the parser itself rather than to a group.
    parser.add_argument(
        "-q",
        "--quiet",
        dest="quiet",
        action="store_true",
        default=False,
        help="Do not print any information about detections [default: "
        "print 'id', 'start' and 'end' of each detection]",
    )
    parser.add_argument(
        "-D",
        "--debug",
        dest="debug",
        action="store_true",
        default=False,
        help="Print processing operations to STDOUT",
    )
    parser.add_argument(
        "--debug-file",
        dest="debug_file",
        type=str,
        default=None,
        help="Print processing operations to FILE",
        metavar="FILE",
    )
    return parser


def main(argv=None):
    """Run the `auditok` command-line program.

    Parses the command line, creates the reader/observer workers and the
    tokenizer worker, starts them, then waits until every other thread has
    finished or the user interrupts the program.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments without the program name. When None,
        ``sys.argv[1:]`` is used.

    Returns
    -------
    int
        0 once processing has ended or has been interrupted with Ctrl-C.

    Raises
    ------
    SystemExit
        Raised by argparse for ``--help``, ``--version`` and invalid
        arguments.
    """
    program_name = os.path.basename(sys.argv[0])
    if argv is None:
        argv = sys.argv[1:]

    # Bound before the `try` so the handler below can safely test it even when
    # the interruption happens before the tokenizer worker is created.
    tokenizer_worker = None
    try:
        parser = _build_parser(program_name)
        args = parser.parse_args(argv)
        logger = make_logger(args.debug, args.debug_file)
        kwargs = make_kwargs(args)
        reader, observers = initialize_workers(
            logger=logger, **kwargs.io, **kwargs.miscellaneous
        )
        tokenizer_worker = workers.TokenizerWorker(
            reader, observers, logger=logger, **kwargs.split
        )
        tokenizer_worker.start_all()

        # Poll once per second. threading.enumerate() lists all alive
        # threads, so a single remaining thread means only the caller is
        # still running and processing is over.
        while True:
            time.sleep(1)
            if len(threading.enumerate()) == 1:
                raise EndOfProcessing

    except (KeyboardInterrupt, EndOfProcessing):
        # A non-None tokenizer_worker implies `reader` and `args` are bound,
        # because they are assigned earlier in the `try` body.
        if tokenizer_worker is not None:
            tokenizer_worker.stop_all()

            if isinstance(reader, workers.StreamSaverWorker):
                reader.join()
                try:
                    reader.save_stream()
                except AudioEncodingWarning as ae_warn:
                    # Reported on stderr instead of aborting the shutdown.
                    print(str(ae_warn), file=sys.stderr)

            if args.plot or args.save_image is not None:
                # Imported here so the plotting module is only loaded when
                # a plot or an image was actually requested.
                from .plotting import plot

                reader.rewind()
                record = AudioRegion(
                    reader.data, reader.sr, reader.sw, reader.ch
                )
                detections = (
                    (det.start, det.end) for det in tokenizer_worker.detections
                )
                plot(
                    record,
                    detections=detections,
                    energy_threshold=args.energy_threshold,
                    show=True,
                    save_as=args.save_image,
                )
        return 0
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: run the command-line program with the process
    # arguments and use main()'s return value as the exit status.
    exit_status = main(None)
    sys.exit(exit_status)
|