From c23ce4a4d1929f9b007b64e81f21fcf881e041fb Mon Sep 17 00:00:00 2001 From: morpheus65535 Date: Wed, 23 Feb 2022 22:01:11 -0500 Subject: [PATCH] Reverted auditok to ffsubsync supported version --- libs/auditok/__init__.py | 10 +- libs/auditok/cmdline.py | 1155 ++++++++++++++-------- libs/auditok/cmdline_util.py | 126 --- libs/auditok/core.py | 1656 +++++++------------------------- libs/auditok/dataset.py | 24 +- libs/auditok/exceptions.py | 42 +- libs/auditok/io.py | 1264 ++++++++----------------- libs/auditok/plotting.py | 150 --- libs/auditok/signal.py | 179 ---- libs/auditok/signal_numpy.py | 30 - libs/auditok/util.py | 1734 +++++++++++++++------------------- libs/auditok/workers.py | 427 --------- libs/version.txt | 2 +- 13 files changed, 2226 insertions(+), 4573 deletions(-) delete mode 100755 libs/auditok/cmdline_util.py delete mode 100755 libs/auditok/plotting.py delete mode 100644 libs/auditok/signal.py delete mode 100644 libs/auditok/signal_numpy.py delete mode 100755 libs/auditok/workers.py diff --git a/libs/auditok/__init__.py b/libs/auditok/__init__.py index edd336cc3..4ea697b77 100644 --- a/libs/auditok/__init__.py +++ b/libs/auditok/__init__.py @@ -2,16 +2,20 @@ :author: Amine SEHILI -2015-2021 +2015-2016 :License: -This package is published under the MIT license. +This package is published under GNU GPL Version 3. """ +from __future__ import absolute_import from .core import * from .io import * from .util import * +from . import dataset from .exceptions import * -__version__ = "0.2.0" +__version__ = "0.1.5" + + diff --git a/libs/auditok/cmdline.py b/libs/auditok/cmdline.py index 7e7450762..2f830fbe2 100755 --- a/libs/auditok/cmdline.py +++ b/libs/auditok/cmdline.py @@ -1,428 +1,789 @@ #!/usr/bin/env python # encoding: utf-8 -""" -`auditok` -- An Audio Activity Detection tool +''' +auditok.auditok -- Audio Activity Detection tool + +auditok.auditok is a program that can be used for Audio/Acoustic activity detection. +It can read audio data from audio files as well as from built-in device(s) or standard input -`auditok` is a program that can be used for Audio/Acoustic -activity detection. It can read audio data from audio files as well -as from the microphone or standard input. @author: Mohamed El Amine SEHILI -@copyright: 2015-2021 Mohamed El Amine SEHILI -@license: MIT + +@copyright: 2015 Mohamed El Amine SEHILI + +@license: GPL v3 + @contact: amine.sehili@gmail.com -@deffield updated: 01 Mar 2021 -""" +@deffield updated: 02 Dec 2015 +''' import sys import os -from argparse import ArgumentParser + +from optparse import OptionParser, OptionGroup +from threading import Thread +import tempfile +import wave import time import threading +import logging -from auditok import __version__, AudioRegion -from .util import AudioDataSource -from .exceptions import EndOfProcessing, AudioEncodingWarning -from .io import player_for -from .cmdline_util import make_logger, make_kwargs, initialize_workers -from . 
import workers +try: + import future + from queue import Queue, Empty +except ImportError: + if sys.version_info >= (3, 0): + from queue import Queue, Empty + else: + from Queue import Queue, Empty +try: + from pydub import AudioSegment + WITH_PYDUB = True +except ImportError: + WITH_PYDUB = False + + +from .core import StreamTokenizer +from .io import PyAudioSource, BufferAudioSource, StdinAudioSource, player_for +from .util import ADSFactory, AudioEnergyValidator +from auditok import __version__ as version __all__ = [] -__date__ = "2015-11-23" -__updated__ = "2021-03-01" +__version__ = version +__date__ = '2015-11-23' +__updated__ = '2015-12-02' + +DEBUG = 0 +TESTRUN = 1 +PROFILE = 0 + +LOGGER_NAME = "AUDITOK_LOGGER" + +class AudioFileFormatError(Exception): + pass + +class TimeFormatError(Exception): + pass + +def file_to_audio_source(filename, filetype=None, **kwargs): + + lower_fname = filename.lower() + rawdata = False + + if filetype is not None: + filetype = filetype.lower() + + if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")): + + srate = kwargs.pop("sampling_rate", None) + if srate is None: + srate = kwargs.pop("sr", None) + + swidth = kwargs.pop("sample_width", None) + if swidth is None: + swidth = kwargs.pop("sw", None) + + ch = kwargs.pop("channels", None) + if ch is None: + ch = kwargs.pop("ch", None) + + if None in (swidth, srate, ch): + raise Exception("All audio parameters are required for raw data") + + data = open(filename).read() + rawdata = True + + # try first with pydub + if WITH_PYDUB: + + use_channel = kwargs.pop("use_channel", None) + if use_channel is None: + use_channel = kwargs.pop("uc", None) + + if use_channel is None: + use_channel = 1 + else: + try: + use_channel = int(use_channel) + except ValueError: + pass + + if not isinstance(use_channel, (int)) and not use_channel.lower() in ["left", "right", "mix"] : + raise ValueError("channel must be an integer or one of 'left', 'right' or 'mix'") + + asegment = None + + if rawdata: + asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch) + if filetype in("wave", "wav") or (filetype is None and lower_fname.endswith(".wav")): + asegment = AudioSegment.from_wav(filename) + elif filetype == "mp3" or (filetype is None and lower_fname.endswith(".mp3")): + asegment = AudioSegment.from_mp3(filename) + elif filetype == "ogg" or (filetype is None and lower_fname.endswith(".ogg")): + asegment = AudioSegment.from_ogg(filename) + elif filetype == "flv" or (filetype is None and lower_fname.endswith(".flv")): + asegment = AudioSegment.from_flv(filename) + else: + asegment = AudioSegment.from_file(filename) + + if asegment.channels > 1: + + if isinstance(use_channel, int): + if use_channel > asegment.channels: + raise ValueError("Can not use channel '{0}', audio file has only {1} channels".format(use_channel, asegment.channels)) + else: + asegment = asegment.split_to_mono()[use_channel - 1] + else: + ch_lower = use_channel.lower() + + if ch_lower == "mix": + asegment = asegment.set_channels(1) + + elif use_channel.lower() == "left": + asegment = asegment.split_to_mono()[0] + + elif use_channel.lower() == "right": + asegment = asegment.split_to_mono()[1] + + return BufferAudioSource(data_buffer = asegment._data, + sampling_rate = asegment.frame_rate, + sample_width = asegment.sample_width, + channels = asegment.channels) + # fall back to standard python + else: + if rawdata: + if ch != 1: + raise ValueError("Cannot handle multi-channel audio without pydub") + return 
BufferAudioSource(data, srate, swidth, ch) + + if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")): + + wfp = wave.open(filename) + + ch = wfp.getnchannels() + if ch != 1: + wfp.close() + raise ValueError("Cannot handle multi-channel audio without pydub") + + srate = wfp.getframerate() + swidth = wfp.getsampwidth() + data = wfp.readframes(wfp.getnframes()) + wfp.close() + return BufferAudioSource(data, srate, swidth, ch) + + raise AudioFileFormatError("Cannot read audio file format") + + +def save_audio_data(data, filename, filetype=None, **kwargs): + + lower_fname = filename.lower() + if filetype is not None: + filetype = filetype.lower() + + # save raw data + if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")): + fp = open(filename, "w") + fp.write(data) + fp.close() + return + + # save other types of data + # requires all audio parameters + srate = kwargs.pop("sampling_rate", None) + if srate is None: + srate = kwargs.pop("sr", None) + + swidth = kwargs.pop("sample_width", None) + if swidth is None: + swidth = kwargs.pop("sw", None) + + ch = kwargs.pop("channels", None) + if ch is None: + ch = kwargs.pop("ch", None) + + if None in (swidth, srate, ch): + raise Exception("All audio parameters are required to save no raw data") + + if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")): + # use standard python's wave module + fp = wave.open(filename, "w") + fp.setnchannels(ch) + fp.setsampwidth(swidth) + fp.setframerate(srate) + fp.writeframes(data) + fp.close() + + elif WITH_PYDUB: + + asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch) + asegment.export(filename, format=filetype) + + else: + raise AudioFileFormatError("cannot write file format {0} (file name: {1})".format(filetype, filename)) + + +def plot_all(signal, sampling_rate, energy_as_amp, detections=[], show=True, save_as=None): + + import matplotlib.pyplot as plt + import numpy as np + t = np.arange(0., np.ceil(float(len(signal))) / sampling_rate, 1./sampling_rate ) + if len(t) > len(signal): + t = t[: len(signal) - len(t)] + + for start, end in detections: + p = plt.axvspan(start, end, facecolor='g', ec = 'r', lw = 2, alpha=0.4) + + line = plt.axhline(y=energy_as_amp, lw=1, ls="--", c="r", label="Energy threshold as normalized amplitude") + plt.plot(t, signal) + legend = plt.legend(["Detection threshold"], bbox_to_anchor=(0., 1.02, 1., .102), loc=1, fontsize=16) + ax = plt.gca().add_artist(legend) + + plt.xlabel("Time (s)", fontsize=24) + plt.ylabel("Amplitude (normalized)", fontsize=24) + + if save_as is not None: + plt.savefig(save_as, dpi=120) + + if show: + plt.show() + + +def seconds_to_str_fromatter(_format): + """ + Accepted format directives: %i %s %m %h + """ + # check directives are correct + + if _format == "%S": + def _fromatter(seconds): + return "{:.2f}".format(seconds) + + elif _format == "%I": + def _fromatter(seconds): + return "{0}".format(int(seconds * 1000)) + + else: + _format = _format.replace("%h", "{hrs:02d}") + _format = _format.replace("%m", "{mins:02d}") + _format = _format.replace("%s", "{secs:02d}") + _format = _format.replace("%i", "{millis:03d}") + + try: + i = _format.index("%") + raise TimeFormatError("Unknow time format directive '{0}'".format(_format[i:i+2])) + except ValueError: + pass + + def _fromatter(seconds): + millis = int(seconds * 1000) + hrs, millis = divmod(millis, 3600000) + mins, millis = divmod(millis, 60000) + secs, millis = divmod(millis, 1000) + return 
_format.format(hrs=hrs, mins=mins, secs=secs, millis=millis) + + return _fromatter + + + +class Worker(Thread): + + def __init__(self, timeout=0.2, debug=False, logger=None): + self.timeout = timeout + self.debug = debug + self.logger = logger + + if self.debug and self.logger is None: + self.logger = logging.getLogger(LOGGER_NAME) + self.logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler(sys.stdout) + self.logger.addHandler(handler) + + self._inbox = Queue() + self._stop_request = Queue() + Thread.__init__(self) + + + def debug_message(self, message): + self.logger.debug(message) + + def _stop_requested(self): + + try: + message = self._stop_request.get_nowait() + if message == "stop": + return True + + except Empty: + return False + + def stop(self): + self._stop_request.put("stop") + self.join() + + def send(self, message): + self._inbox.put(message) + + def _get_message(self): + try: + message = self._inbox.get(timeout=self.timeout) + return message + except Empty: + return None + + +class TokenizerWorker(Worker): + + END_OF_PROCESSING = "END_OF_PROCESSING" + + def __init__(self, ads, tokenizer, analysis_window, observers): + self.ads = ads + self.tokenizer = tokenizer + self.analysis_window = analysis_window + self.observers = observers + self._inbox = Queue() + self.count = 0 + Worker.__init__(self) + + def run(self): + + def notify_observers(data, start, end): + audio_data = b''.join(data) + self.count += 1 + + start_time = start * self.analysis_window + end_time = (end+1) * self.analysis_window + duration = (end - start + 1) * self.analysis_window + + # notify observers + for observer in self.observers: + observer.notify({"id" : self.count, + "audio_data" : audio_data, + "start" : start, + "end" : end, + "start_time" : start_time, + "end_time" : end_time, + "duration" : duration} + ) + + self.ads.open() + self.tokenizer.tokenize(data_source=self, callback=notify_observers) + for observer in self.observers: + observer.notify(TokenizerWorker.END_OF_PROCESSING) + + def add_observer(self, observer): + self.observers.append(observer) + + def remove_observer(self, observer): + self.observers.remove(observer) + + def read(self): + if self._stop_requested(): + return None + else: + return self.ads.read() + + +class PlayerWorker(Worker): + + def __init__(self, player, timeout=0.2, debug=False, logger=None): + self.player = player + Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + if message is not None: + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + start_time = message.pop("start_time", None) + end_time = message.pop("end_time", None) + dur = message.pop("duration", None) + _id = message.pop("id", None) + + if audio_data is not None: + if self.debug: + self.debug_message("[PLAY]: Detection {id} played (start:{start}, end:{end}, dur:{dur})".format(id=_id, + start="{:5.2f}".format(start_time), end="{:5.2f}".format(end_time), dur="{:5.2f}".format(dur))) + self.player.play(audio_data) + + def notify(self, message): + self.send(message) + + +class CommandLineWorker(Worker): + + def __init__(self, command, timeout=0.2, debug=False, logger=None): + self.command = command + Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + if message is not None: + if message == 
TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + _id = message.pop("id", None) + if audio_data is not None: + raw_audio_file = tempfile.NamedTemporaryFile(delete=False) + raw_audio_file.write(audio_data) + cmd = self.command.replace("$", raw_audio_file.name) + if self.debug: + self.debug_message("[CMD ]: Detection {id} command: {cmd}".format(id=_id, cmd=cmd)) + os.system(cmd) + os.unlink(raw_audio_file.name) + + def notify(self, message): + self.send(message) + + +class TokenSaverWorker(Worker): + + def __init__(self, name_format, filetype, timeout=0.2, debug=False, logger=None, **kwargs): + self.name_format = name_format + self.filetype = filetype + self.kwargs = kwargs + Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + if message is not None: + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + start_time = message.pop("start_time", None) + end_time = message.pop("end_time", None) + _id = message.pop("id", None) + if audio_data is not None and len(audio_data) > 0: + fname = self.name_format.format(N=_id, start = "{:.2f}".format(start_time), end = "{:.2f}".format(end_time)) + try: + if self.debug: + self.debug_message("[SAVE]: Detection {id} saved as {fname}".format(id=_id, fname=fname)) + save_audio_data(audio_data, fname, filetype=self.filetype, **self.kwargs) + except Exception as e: + sys.stderr.write(str(e) + "\n") + + def notify(self, message): + self.send(message) + + +class LogWorker(Worker): + + def __init__(self, print_detections=False, output_format="{start} {end}", + time_formatter=seconds_to_str_fromatter("%S"), timeout=0.2, debug=False, logger=None): + + self.print_detections = print_detections + self.output_format = output_format + self.time_formatter = time_formatter + self.detections = [] + Worker.__init__(self, timeout=timeout, debug=debug, logger=logger) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + + if message is not None: + + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + _id = message.pop("id", None) + start = message.pop("start", None) + end = message.pop("end", None) + start_time = message.pop("start_time", None) + end_time = message.pop("end_time", None) + if audio_data is not None and len(audio_data) > 0: + + if self.debug: + self.debug_message("[DET ]: Detection {id} (start:{start}, end:{end})".format(id=_id, + start="{:5.2f}".format(start_time), + end="{:5.2f}".format(end_time))) + + if self.print_detections: + print(self.output_format.format(id = _id, + start = self.time_formatter(start_time), + end = self.time_formatter(end_time))) + + self.detections.append((_id, start, end, start_time, end_time)) + + + def notify(self, message): + self.send(message) + def main(argv=None): + '''Command line options.''' + program_name = os.path.basename(sys.argv[0]) + program_version = version + program_build_date = "%s" % __updated__ + + program_version_string = '%%prog %s (%s)' % (program_version, program_build_date) + #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse + program_longdesc = '''''' # optional - give further explanation about what the program does + program_license = "Copyright 2015 Mohamed El Amine SEHILI \ + Licensed under the General Public License (GPL) Version 3 
\nhttp://www.gnu.org/licenses/" + if argv is None: argv = sys.argv[1:] try: - parser = ArgumentParser( - prog=program_name, description="An Audio Tokenization tool" - ) - parser.add_argument( - "--version", "-v", action="version", version=__version__ - ) - group = parser.add_argument_group("Input-Output options") - group.add_argument( - dest="input", - help="Input audio or video file. Use '-' for stdin " - "[default: read from microphone using pyaudio]", - metavar="input", - nargs="?", - default=None, - ) - group.add_argument( - "-I", - "--input-device-index", - dest="input_device_index", - help="Audio device index [default: %(default)s]. " - "Optional and only effective when using PyAudio", - type=int, - default=None, - metavar="INT", - ) - group.add_argument( - "-F", - "--audio-frame-per-buffer", - dest="frame_per_buffer", - help="Audio frame per buffer [default: %(default)s]. " - "Optional and only effective when using PyAudio", - type=int, - default=1024, - metavar="INT", - ) - group.add_argument( - "-f", - "--input-format", - dest="input_format", - type=str, - default=None, - help="Input audio file format. If not given, guess format from " - "extension. If output file name has no extension, guess format " - "from file header (requires pydub). If none of the previous is " - "true, raise an error", - metavar="STRING", - ) - group.add_argument( - "-M", - "--max-read", - dest="max_read", - type=float, - default=None, - help="Maximum data (in seconds) to read from microphone or file " - "[default: read until the end of file/stream]", - metavar="FLOAT", - ) - group.add_argument( - "-L", - "--large-file", - dest="large_file", - action="store_true", - default=False, - help="Whether input file should be treated as a large file. " - "If True, data will be read from file on demand, otherwise all " - "audio data is loaded to memory before tokenization.", - ) - group.add_argument( - "-O", - "--save-stream", - dest="save_stream", - type=str, - default=None, - help="Save acquired audio data (from file or microphone) to disk." - " If omitted no data will be saved. [default: omitted]", - metavar="FILE", - ) - group.add_argument( - "-o", - "--save-detections-as", - dest="save_detections_as", - type=str, - default=None, - help="File name format for detections." - "The following placeholders can be used to build output file name " - "for each detection: {id} (sequential, starts from 1), {start}, " - "{end} and {duration}. Time placeholders are in seconds. " - "Example: 'Event_{id}_{start}-{end}_{duration:.3f}.wav'", - metavar="STRING", - ) - group.add_argument( - "-T", - "--output-format", - dest="output_format", - type=str, - default=None, - help="Audio format used to save detections and/or main stream. " - "If not supplied, then it will: (1. be guessed from extension or " - "(2. use raw format", - metavar="STRING", - ) - group.add_argument( - "-u", - "--use-channel", - dest="use_channel", - type=str, - default=None, - help="Which channel to use for tokenization when input stream is " - "multi-channel (0 is the first channel). Default is None, meaning " - "that all channels will be considered for tokenization (i.e., get " - "any valid audio event regardless of the channel it occurs in). " - "This value can also be 'mix' (alias 'avg' or 'average') and " - "means mix down all audio channels into one channel (i.e. compute " - "average channel) and use the resulting channel for tokenization. " - "Whatever option is used, saved audio events will contain the same" - " number of channels as input stream. 
" - "[Default: None, use all channels]", - metavar="INT/STRING", - ) - - group = parser.add_argument_group( - "Tokenization options", "Set tokenizer options." - ) - group.add_argument( - "-a", - "--analysis-window", - dest="analysis_window", - default=0.01, - type=float, - help="Size of analysis window in seconds [default: %(default)s " - "(10ms)]", - metavar="FLOAT", - ) - group.add_argument( - "-n", - "--min-duration", - dest="min_duration", - type=float, - default=0.2, - help="Min duration of a valid audio event in seconds " - "[default: %(default)s]", - metavar="FLOAT", - ) - group.add_argument( - "-m", - "--max-duration", - dest="max_duration", - type=float, - default=5, - help="Max duration of a valid audio event in seconds " - "[default: %(default)s]", - metavar="FLOAT", - ) - group.add_argument( - "-s", - "--max-silence", - dest="max_silence", - type=float, - default=0.3, - help="Max duration of a consecutive silence within a valid audio " - "event in seconds [default: %(default)s]", - metavar="FLOAT", - ) - group.add_argument( - "-d", - "--drop-trailing-silence", - dest="drop_trailing_silence", - action="store_true", - default=False, - help="Drop trailing silence from a detection [default: keep " - "trailing silence]", - ) - group.add_argument( - "-R", - "--strict-min-duration", - dest="strict_min_duration", - action="store_true", - default=False, - help="Reject an event shorter than --min-duration even if it's " - "adjacent to the latest valid event that reached max-duration " - "[default: keep such events]", - ) - group.add_argument( - "-e", - "--energy-threshold", - dest="energy_threshold", - type=float, - default=50, - help="Log energy threshold for detection [default: %(default)s]", - metavar="FLOAT", - ) - - group = parser.add_argument_group( - "Audio parameters", - "Define audio parameters if data is read from a " - "headerless file (raw or stdin) or you want to use " - "different microphone parameters.", - ) - group.add_argument( - "-r", - "--rate", - dest="sampling_rate", - type=int, - default=16000, - help="Sampling rate of audio data [default: %(default)s]", - metavar="INT", - ) - group.add_argument( - "-c", - "--channels", - dest="channels", - type=int, - default=1, - help="Number of channels of audio data [default: %(default)s]", - metavar="INT", - ) - group.add_argument( - "-w", - "--width", - dest="sample_width", - type=int, - default=2, - help="Number of bytes per audio sample [default: %(default)s]", - metavar="INT", - ) - - group = parser.add_argument_group( - "Do something with audio events", - "Use these options to print, play back or plot detections.", - ) - group.add_argument( - "-C", - "--command", - dest="command", - type=str, - help="Command to call when an audio detection occurs. 
Use '{file}' " - "as a placeholder for the temporary wav file that will contain " - "event's data (e.g., \"-C 'du -h {file}'\" to print out file size " - " or \"-C 'play -q {file}'\" to play audio with sox)", - metavar="STRING", - ) - group.add_argument( - "-E", - "--echo", - dest="echo", - action="store_true", - default=False, - help="Play back each detection immediately using pyaudio", - ) - group.add_argument( - "-B", - "--progress-bar", - dest="progress_bar", - action="store_true", - default=False, - help="Show a progress bar when playing audio", - ) - group.add_argument( - "-p", - "--plot", - dest="plot", - action="store_true", - default=False, - help="Plot and show audio signal and detections (requires " - "matplotlib)", - ) - group.add_argument( - "--save-image", - dest="save_image", - type=str, - help="Save plotted audio signal and detections as a picture or a " - "PDF file (requires matplotlib)", - metavar="FILE", - ) - group.add_argument( - "--printf", - dest="printf", - type=str, - default="{id} {start} {end}", - help="Print audio events information, one per line, using this " - "format. Format can contain text with the following placeholders: " - "{id} (sequential, starts from 1), {start}, {end}, {duration} and " - "{timestamp}. The first 3 time placeholders are in seconds and " - "their format can be set using --time-format argument. " - "{timestamp} is the system timestamp (date and time) of the event " - "and can be set using --timestamp-format argument.\n" - "Example: '[{id}]: {start} -> {end} -- {timestamp}'", - metavar="STRING", - ) - group.add_argument( - "--time-format", - dest="time_format", - type=str, - default="%S", - help="Format used to print {start}, {end} and {duration} " - "placeholders used with --printf [default= %(default)s]. The " - "following formats are accepted:\n" - "%%S: absolute time in seconds. %%I: absolute time in ms. If at " - "least one of (%%h, %%m, %%s, %%i) is used, convert time into " - "hours, minutes, seconds and millis (e.g. %%h:%%m:%%s.%%i). Only " - "supplied fields are printed. Note that %%S and %%I can only be " - "used alone", - metavar="STRING", - ) - group.add_argument( - "--timestamp-format", - dest="timestamp_format", - type=str, - default="%Y/%m/%d %H:%M:%S", - help="Format used to print {timestamp}. Should be a format " - "accepted by 'datetime' standard module. 
Default: " - "'%%Y/%%m/%%d %%H:%%M:%%S'", - ) - parser.add_argument( - "-q", - "--quiet", - dest="quiet", - action="store_true", - default=False, - help="Do not print any information about detections [default: " - "print 'id', 'start' and 'end' of each detection]", - ) - parser.add_argument( - "-D", - "--debug", - dest="debug", - action="store_true", - default=False, - help="Print processing operations to STDOUT", - ) - parser.add_argument( - "--debug-file", - dest="debug_file", - type=str, - default=None, - help="Print processing operations to FILE", - metavar="FILE", - ) - - args = parser.parse_args(argv) - logger = make_logger(args.debug, args.debug_file) - kwargs = make_kwargs(args) - reader, observers = initialize_workers( - logger=logger, **kwargs.io, **kwargs.miscellaneous - ) - tokenizer_worker = workers.TokenizerWorker( - reader, observers, logger=logger, **kwargs.split - ) - tokenizer_worker.start_all() + # setup option parser + parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) + + group = OptionGroup(parser, "[Input-Output options]") + group.add_option("-i", "--input", dest="input", help="Input audio or video file. Use - for stdin [default: read from microphone using pyaudio]", metavar="FILE") + group.add_option("-t", "--input-type", dest="input_type", help="Input audio file type. Mandatory if file name has no extension [default: %default]", type=str, default=None, metavar="String") + group.add_option("-M", "--max_time", dest="max_time", help="Max data (in seconds) to read from microphone/file [default: read until the end of file/stream]", type=float, default=None, metavar="FLOAT") + group.add_option("-O", "--output-main", dest="output_main", help="Save main stream as. If omitted main stream will not be saved [default: omitted]", type=str, default=None, metavar="FILE") + group.add_option("-o", "--output-tokens", dest="output_tokens", help="Output file name format for detections. Use {N} and {start} and {end} to build file names, example: 'Det_{N}_{start}-{end}.wav'", type=str, default=None, metavar="STRING") + group.add_option("-T", "--output-type", dest="output_type", help="Audio type used to save detections and/or main stream. If not supplied will: (1). guess from extension or (2). use wav format", type=str, default=None, metavar="STRING") + group.add_option("-u", "--use-channel", dest="use_channel", help="Choose channel to use from a multi-channel audio file (requires pydub). 'left', 'right' and 'mix' are accepted values. [Default: 1 (i.e. 
1st or left channel)]", type=str, default="1", metavar="STRING") + parser.add_option_group(group) + + + group = OptionGroup(parser, "[Tokenization options]", "Set tokenizer options and energy threshold.") + group.add_option("-a", "--analysis-window", dest="analysis_window", help="Size of analysis window in seconds [default: %default (10ms)]", type=float, default=0.01, metavar="FLOAT") + group.add_option("-n", "--min-duration", dest="min_duration", help="Min duration of a valid audio event in seconds [default: %default]", type=float, default=0.2, metavar="FLOAT") + group.add_option("-m", "--max-duration", dest="max_duration", help="Max duration of a valid audio event in seconds [default: %default]", type=float, default=5, metavar="FLOAT") + group.add_option("-s", "--max-silence", dest="max_silence", help="Max duration of a consecutive silence within a valid audio event in seconds [default: %default]", type=float, default=0.3, metavar="FLOAT") + group.add_option("-d", "--drop-trailing-silence", dest="drop_trailing_silence", help="Drop trailing silence from a detection [default: keep trailing silence]", action="store_true", default=False) + group.add_option("-e", "--energy-threshold", dest="energy_threshold", help="Log energy threshold for detection [default: %default]", type=float, default=50, metavar="FLOAT") + parser.add_option_group(group) + + + group = OptionGroup(parser, "[Audio parameters]", "Define audio parameters if data is read from a headerless file (raw or stdin) or you want to use different microphone parameters.") + group.add_option("-r", "--rate", dest="sampling_rate", help="Sampling rate of audio data [default: %default]", type=int, default=16000, metavar="INT") + group.add_option("-c", "--channels", dest="channels", help="Number of channels of audio data [default: %default]", type=int, default=1, metavar="INT") + group.add_option("-w", "--width", dest="sample_width", help="Number of bytes per audio sample [default: %default]", type=int, default=2, metavar="INT") + parser.add_option_group(group) + + group = OptionGroup(parser, "[Do something with detections]", "Use these options to print, play or plot detections.") + group.add_option("-C", "--command", dest="command", help="Command to call when an audio detection occurs. Use $ to represent the file name to use with the command (e.g. -C 'du -h $')", default=None, type=str, metavar="STRING") + group.add_option("-E", "--echo", dest="echo", help="Play back each detection immediately using pyaudio [default: do not play]", action="store_true", default=False) + group.add_option("-p", "--plot", dest="plot", help="Plot and show audio signal and detections (requires matplotlib)", action="store_true", default=False) + group.add_option("", "--save-image", dest="save_image", help="Save plotted audio signal and detections as a picture or a PDF file (requires matplotlib)", type=str, default=None, metavar="FILE") + group.add_option("", "--printf", dest="printf", help="print detections one per line using a user supplied format (e.g. '[{id}]: {start} -- {end}'). Available keywords {id}, {start} and {end}", type=str, default="{id} {start} {end}", metavar="STRING") + group.add_option("", "--time-format", dest="time_format", help="format used to print {start} and {end}. [Default= %default]. %S: absolute time in sec. %I: absolute time in ms. If at least one of (%h, %m, %s, %i) is used, convert time into hours, minutes, seconds and millis (e.g. %h:%m:%s.%i). 
Only required fields are printed", type=str, default="%S", metavar="STRING") + parser.add_option_group(group) + + parser.add_option("-q", "--quiet", dest="quiet", help="Do not print any information about detections [default: print 'id', 'start' and 'end' of each detection]", action="store_true", default=False) + parser.add_option("-D", "--debug", dest="debug", help="Print processing operations to STDOUT", action="store_true", default=False) + parser.add_option("", "--debug-file", dest="debug_file", help="Print processing operations to FILE", type=str, default=None, metavar="FILE") + + + # process options + (opts, args) = parser.parse_args(argv) + + if opts.input == "-": + asource = StdinAudioSource(sampling_rate = opts.sampling_rate, + sample_width = opts.sample_width, + channels = opts.channels) + #read data from a file + elif opts.input is not None: + asource = file_to_audio_source(filename=opts.input, filetype=opts.input_type, uc=opts.use_channel) + + # read data from microphone via pyaudio + else: + try: + asource = PyAudioSource(sampling_rate = opts.sampling_rate, + sample_width = opts.sample_width, + channels = opts.channels) + except Exception: + sys.stderr.write("Cannot read data from audio device!\n") + sys.stderr.write("You should either install pyaudio or read data from STDIN\n") + sys.exit(2) + + logger = logging.getLogger(LOGGER_NAME) + logger.setLevel(logging.DEBUG) + + handler = logging.StreamHandler(sys.stdout) + if opts.quiet or not opts.debug: + # only critical messages will be printed + handler.setLevel(logging.CRITICAL) + else: + handler.setLevel(logging.DEBUG) + + logger.addHandler(handler) + + if opts.debug_file is not None: + logger.setLevel(logging.DEBUG) + opts.debug = True + handler = logging.FileHandler(opts.debug_file, "w") + fmt = logging.Formatter('[%(asctime)s] | %(message)s') + handler.setFormatter(fmt) + handler.setLevel(logging.DEBUG) + logger.addHandler(handler) + + record = opts.output_main is not None or opts.plot or opts.save_image is not None + + ads = ADSFactory.ads(audio_source = asource, block_dur = opts.analysis_window, max_time = opts.max_time, record = record) + validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=opts.energy_threshold) + + + if opts.drop_trailing_silence: + mode = StreamTokenizer.DROP_TRAILING_SILENCE + else: + mode = 0 + + analysis_window_per_second = 1. 
/ opts.analysis_window + tokenizer = StreamTokenizer(validator=validator, min_length=opts.min_duration * analysis_window_per_second, + max_length=int(opts.max_duration * analysis_window_per_second), + max_continuous_silence=opts.max_silence * analysis_window_per_second, + mode = mode) + + + observers = [] + tokenizer_worker = None + + if opts.output_tokens is not None: + + try: + # check user format is correct + fname = opts.output_tokens.format(N=0, start=0, end=0) + + # find file type for detections + tok_type = opts.output_type + if tok_type is None: + tok_type = os.path.splitext(opts.output_tokens)[1][1:] + if tok_type == "": + tok_type = "wav" + + token_saver = TokenSaverWorker(name_format=opts.output_tokens, filetype=tok_type, + debug=opts.debug, logger=logger, sr=asource.get_sampling_rate(), + sw=asource.get_sample_width(), + ch=asource.get_channels()) + observers.append(token_saver) + + except Exception: + sys.stderr.write("Wrong format for detections file name: '{0}'\n".format(opts.output_tokens)) + sys.exit(2) + + if opts.echo: + try: + player = player_for(asource) + player_worker = PlayerWorker(player=player, debug=opts.debug, logger=logger) + observers.append(player_worker) + except Exception: + sys.stderr.write("Cannot get an audio player!\n") + sys.stderr.write("You should either install pyaudio or supply a command (-C option) to play audio\n") + sys.exit(2) + + if opts.command is not None and len(opts.command) > 0: + cmd_worker = CommandLineWorker(command=opts.command, debug=opts.debug, logger=logger) + observers.append(cmd_worker) + + if not opts.quiet or opts.plot is not None or opts.save_image is not None: + oformat = opts.printf.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "\r") + converter = seconds_to_str_fromatter(opts.time_format) + log_worker = LogWorker(print_detections = not opts.quiet, output_format=oformat, + time_formatter=converter, logger=logger, debug=opts.debug) + observers.append(log_worker) + + tokenizer_worker = TokenizerWorker(ads, tokenizer, opts.analysis_window, observers) + + def _save_main_stream(): + # find file type + main_type = opts.output_type + if main_type is None: + main_type = os.path.splitext(opts.output_main)[1][1:] + if main_type == "": + main_type = "wav" + ads.close() + ads.rewind() + data = ads.get_audio_source().get_data_buffer() + if len(data) > 0: + save_audio_data(data=data, filename=opts.output_main, filetype=main_type, sr=asource.get_sampling_rate(), + sw = asource.get_sample_width(), + ch = asource.get_channels()) + + def _plot(): + import numpy as np + ads.close() + ads.rewind() + data = ads.get_audio_source().get_data_buffer() + signal = AudioEnergyValidator._convert(data, asource.get_sample_width()) + detections = [(det[3] , det[4]) for det in log_worker.detections] + max_amplitude = 2**(asource.get_sample_width() * 8 - 1) - 1 + energy_as_amp = np.sqrt(np.exp(opts.energy_threshold * np.log(10) / 10)) / max_amplitude + plot_all(signal / max_amplitude, asource.get_sampling_rate(), energy_as_amp, detections, show = opts.plot, save_as = opts.save_image) + + + # start observer threads + for obs in observers: + obs.start() + # start tokenization thread + tokenizer_worker.start() + while True: time.sleep(1) if len(threading.enumerate()) == 1: - raise EndOfProcessing - - except (KeyboardInterrupt, EndOfProcessing): + break + + tokenizer_worker = None + + if opts.output_main is not None: + _save_main_stream() + if opts.plot or opts.save_image is not None: + _plot() + + return 0 + + except KeyboardInterrupt: + if 
tokenizer_worker is not None: - tokenizer_worker.stop_all() - - if isinstance(reader, workers.StreamSaverWorker): - reader.join() - try: - reader.save_stream() - except AudioEncodingWarning as ae_warn: - print(str(ae_warn), file=sys.stderr) - - if args.plot or args.save_image is not None: - from .plotting import plot - - reader.rewind() - record = AudioRegion( - reader.data, reader.sr, reader.sw, reader.ch - ) - detections = ( - (det.start, det.end) for det in tokenizer_worker.detections - ) - plot( - record, - detections=detections, - energy_threshold=args.energy_threshold, - show=True, - save_as=args.save_image, - ) + tokenizer_worker.stop() + for obs in observers: + obs.stop() + + if opts.output_main is not None: + _save_main_stream() + if opts.plot or opts.save_image is not None: + _plot() + return 0 + except Exception as e: + sys.stderr.write(program_name + ": " + str(e) + "\n") + sys.stderr.write("for help use -h\n") + + return 2 if __name__ == "__main__": - sys.exit(main(None)) + if DEBUG: + sys.argv.append("-h") + if TESTRUN: + import doctest + doctest.testmod() + if PROFILE: + import cProfile + import pstats + profile_filename = 'auditok.auditok_profile.txt' + cProfile.run('main()', profile_filename) + statsfile = open("profile_stats.txt", "wb") + p = pstats.Stats(profile_filename, stream=statsfile) + stats = p.strip_dirs().sort_stats('cumulative') + stats.print_stats() + statsfile.close() + sys.exit(0) + sys.exit(main()) diff --git a/libs/auditok/cmdline_util.py b/libs/auditok/cmdline_util.py deleted file mode 100755 index bde72aa36..000000000 --- a/libs/auditok/cmdline_util.py +++ /dev/null @@ -1,126 +0,0 @@ -import sys -import logging -from collections import namedtuple -from . import workers -from .util import AudioDataSource -from .io import player_for - -_AUDITOK_LOGGER = "AUDITOK_LOGGER" -KeywordArguments = namedtuple( - "KeywordArguments", ["io", "split", "miscellaneous"] -) - - -def make_kwargs(args_ns): - if args_ns.save_stream is None: - record = args_ns.plot or (args_ns.save_image is not None) - else: - record = False - try: - use_channel = int(args_ns.use_channel) - except (ValueError, TypeError): - use_channel = args_ns.use_channel - - io_kwargs = { - "input": args_ns.input, - "audio_format": args_ns.input_format, - "max_read": args_ns.max_read, - "block_dur": args_ns.analysis_window, - "sampling_rate": args_ns.sampling_rate, - "sample_width": args_ns.sample_width, - "channels": args_ns.channels, - "use_channel": use_channel, - "save_stream": args_ns.save_stream, - "save_detections_as": args_ns.save_detections_as, - "export_format": args_ns.output_format, - "large_file": args_ns.large_file, - "frames_per_buffer": args_ns.frame_per_buffer, - "input_device_index": args_ns.input_device_index, - "record": record, - } - - split_kwargs = { - "min_dur": args_ns.min_duration, - "max_dur": args_ns.max_duration, - "max_silence": args_ns.max_silence, - "drop_trailing_silence": args_ns.drop_trailing_silence, - "strict_min_dur": args_ns.strict_min_duration, - "energy_threshold": args_ns.energy_threshold, - } - - miscellaneous = { - "echo": args_ns.echo, - "progress_bar": args_ns.progress_bar, - "command": args_ns.command, - "quiet": args_ns.quiet, - "printf": args_ns.printf, - "time_format": args_ns.time_format, - "timestamp_format": args_ns.timestamp_format, - } - return KeywordArguments(io_kwargs, split_kwargs, miscellaneous) - - -def make_logger(stderr=False, file=None, name=_AUDITOK_LOGGER): - if not stderr and file is None: - return None - logger = logging.getLogger(name) - 
logger.setLevel(logging.INFO) - if stderr: - handler = logging.StreamHandler(sys.stderr) - handler.setLevel(logging.INFO) - logger.addHandler(handler) - - if file is not None: - handler = logging.FileHandler(file, "w") - fmt = logging.Formatter("[%(asctime)s] | %(message)s") - handler.setFormatter(fmt) - handler.setLevel(logging.INFO) - logger.addHandler(handler) - return logger - - -def initialize_workers(logger=None, **kwargs): - observers = [] - reader = AudioDataSource(source=kwargs["input"], **kwargs) - if kwargs["save_stream"] is not None: - reader = workers.StreamSaverWorker( - reader, - filename=kwargs["save_stream"], - export_format=kwargs["export_format"], - ) - reader.start() - - if kwargs["save_detections_as"] is not None: - worker = workers.RegionSaverWorker( - kwargs["save_detections_as"], - kwargs["export_format"], - logger=logger, - ) - observers.append(worker) - - if kwargs["echo"]: - player = player_for(reader) - worker = workers.PlayerWorker( - player, progress_bar=kwargs["progress_bar"], logger=logger - ) - observers.append(worker) - - if kwargs["command"] is not None: - worker = workers.CommandLineWorker( - command=kwargs["command"], logger=logger - ) - observers.append(worker) - - if not kwargs["quiet"]: - print_format = ( - kwargs["printf"] - .replace("\\n", "\n") - .replace("\\t", "\t") - .replace("\\r", "\r") - ) - worker = workers.PrintWorker( - print_format, kwargs["time_format"], kwargs["timestamp_format"] - ) - observers.append(worker) - - return reader, observers diff --git a/libs/auditok/core.py b/libs/auditok/core.py index af00dc7af..47441d2b7 100644 --- a/libs/auditok/core.py +++ b/libs/auditok/core.py @@ -1,1267 +1,264 @@ """ -.. autosummary:: - :toctree: generated/ - - load - split - AudioRegion - StreamTokenizer -""" -import os -import math -from .util import AudioReader, DataValidator, AudioEnergyValidator -from .io import check_audio_data, to_file, player_for, get_audio_source -from .exceptions import TooSamllBlockDuration - -try: - from . import signal_numpy as signal -except ImportError: - from . import signal - -__all__ = ["load", "split", "AudioRegion", "StreamTokenizer"] - - -DEFAULT_ANALYSIS_WINDOW = 0.05 -DEFAULT_ENERGY_THRESHOLD = 50 -_EPSILON = 1e-10 - - -def load(input, skip=0, max_read=None, **kwargs): - """Load audio data from a source and return it as an :class:`AudioRegion`. - - Parameters - ---------- - input : None, str, bytes, AudioSource - source to read audio data from. If `str`, it should be a path to a - valid audio file. If `bytes`, it is used as raw audio data. If it is - "-", raw data will be read from stdin. If None, read audio data from - the microphone using PyAudio. If of type `bytes` or is a path to a - raw audio file then `sampling_rate`, `sample_width` and `channels` - parameters (or their alias) are required. If it's an - :class:`AudioSource` object it's used directly to read data. - skip : float, default: 0 - amount, in seconds, of audio data to skip from source. If read from - a microphone, `skip` must be 0, otherwise a `ValueError` is raised. - max_read : float, default: None - amount, in seconds, of audio data to read from source. If read from - microphone, `max_read` should not be None, otherwise a `ValueError` is - raised. - audio_format, fmt : str - type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only - be used if `input` is a string path to an audio file. If not given, - audio type will be guessed from file name extension or from file - header. 
- sampling_rate, sr : int - sampling rate of audio data. Required if `input` is a raw audio file, - a `bytes` object or None (i.e., read from microphone). - sample_width, sw : int - number of bytes used to encode one audio sample, typically 1, 2 or 4. - Required for raw data, see `sampling_rate`. - channels, ch : int - number of channels of audio data. Required for raw data, see - `sampling_rate`. - large_file : bool, default: False - If True, AND if `input` is a path to a *wav* of a *raw* audio file - (and **only** these two formats) then audio file is not fully loaded to - memory in order to create the region (but the portion of data needed to - create the region is of course loaded to memory). Set to True if - `max_read` is significantly smaller then the size of a large audio file - that shouldn't be entirely loaded to memory. - - Returns - ------- - region: AudioRegion - - Raises - ------ - ValueError - raised if `input` is None (i.e., read data from microphone) and `skip` - != 0 or `input` is None `max_read` is None (meaning that when reading - from the microphone, no data should be skipped, and maximum amount of - data to read should be explicitly provided). - """ - return AudioRegion.load(input, skip, max_read, **kwargs) - - -def split( - input, - min_dur=0.2, - max_dur=5, - max_silence=0.3, - drop_trailing_silence=False, - strict_min_dur=False, - **kwargs -): - """ - Split audio data and return a generator of AudioRegions - - Parameters - ---------- - input : str, bytes, AudioSource, AudioReader, AudioRegion or None - input audio data. If str, it should be a path to an existing audio file. - "-" is interpreted as standard input. If bytes, input is considered as - raw audio data. If None, read audio from microphone. - Every object that is not an `AudioReader` will be transformed into an - `AudioReader` before processing. If it is an `str` that refers to a raw - audio file, `bytes` or None, audio parameters should be provided using - kwargs (i.e., `samplig_rate`, `sample_width` and `channels` or their - alias). - If `input` is str then audio format will be guessed from file extension. - `audio_format` (alias `fmt`) kwarg can also be given to specify audio - format explicitly. If none of these options is available, rely on - backend (currently only pydub is supported) to load data. - min_dur : float, default: 0.2 - minimun duration in seconds of a detected audio event. By using large - values for `min_dur`, very short audio events (e.g., very short 1-word - utterances like 'yes' or 'no') can be mis detected. Using very short - values might result in a high number of short, unuseful audio events. - max_dur : float, default: 5 - maximum duration in seconds of a detected audio event. If an audio event - lasts more than `max_dur` it will be truncated. If the continuation of a - truncated audio event is shorter than `min_dur` then this continuation - is accepted as a valid audio event if `strict_min_dur` is False. - Otherwise it is rejected. - max_silence : float, default: 0.3 - maximum duration of continuous silence within an audio event. There - might be many silent gaps of this duration within one audio event. If - the continuous silence happens at the end of the event than it's kept as - part of the event if `drop_trailing_silence` is False (default). - drop_trailing_silence : bool, default: False - Whether to remove trailing silence from detected events. To avoid abrupt - cuts in speech, trailing silence should be kept, therefore this - parameter should be False. 
- strict_min_dur : bool, default: False - strict minimum duration. Do not accept an audio event if it is shorter - than `min_dur` even if it is contiguous to the latest valid event. This - happens if the the latest detected event had reached `max_dur`. - - Other Parameters - ---------------- - analysis_window, aw : float, default: 0.05 (50 ms) - duration of analysis window in seconds. A value between 0.01 (10 ms) and - 0.1 (100 ms) should be good for most use-cases. - audio_format, fmt : str - type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be - used if `input` is a string path to an audio file. If not given, audio - type will be guessed from file name extension or from file header. - sampling_rate, sr : int - sampling rate of audio data. Required if `input` is a raw audio file, is - a bytes object or None (i.e., read from microphone). - sample_width, sw : int - number of bytes used to encode one audio sample, typically 1, 2 or 4. - Required for raw data, see `sampling_rate`. - channels, ch : int - number of channels of audio data. Required for raw data, see - `sampling_rate`. - use_channel, uc : {None, "mix"} or int - which channel to use for split if `input` has multiple audio channels. - Regardless of which channel is used for splitting, returned audio events - contain data from *all* channels, just as `input`. - The following values are accepted: - - - None (alias "any"): accept audio activity from any channel, even if - other channels are silent. This is the default behavior. - - - "mix" ("avg" or "average"): mix down all channels (i.e. compute - average channel) and split the resulting channel. - - - int (0 <=, > `channels`): use one channel, specified by integer id, - for split. - - large_file : bool, default: False - If True, AND if `input` is a path to a *wav* of a *raw* audio file - (and only these two formats) then audio data is lazily loaded to memory - (i.e., one analysis window a time). Otherwise the whole file is loaded - to memory before split. Set to True if the size of the file is larger - than available memory. - max_read, mr : float, default: None, read until end of stream - maximum data to read from source in seconds. - validator, val : callable, DataValidator - custom data validator. If `None` (default), an `AudioEnergyValidor` is - used with the given energy threshold. Can be a callable or an instance - of `DataValidator` that implements `is_valid`. In either case, it'll be - called with with a window of audio data as the first parameter. - energy_threshold, eth : float, default: 50 - energy threshold for audio activity detection. Audio regions that have - enough windows of with a signal energy equal to or above this threshold - are considered valid audio events. Here we are referring to this amount - as the energy of the signal but to be more accurate, it is the log - energy of computed as: `20 * log10(sqrt(dot(x, x) / len(x)))` (see - :class:`AudioEnergyValidator` and - :func:`calculate_energy_single_channel`). If `validator` is given, this - argument is ignored. - - Yields - ------ - AudioRegion - a generator of detected :class:`AudioRegion` s. 
- """ - if min_dur <= 0: - raise ValueError("'min_dur' ({}) must be > 0".format(min_dur)) - if max_dur <= 0: - raise ValueError("'max_dur' ({}) must be > 0".format(max_dur)) - if max_silence < 0: - raise ValueError("'max_silence' ({}) must be >= 0".format(max_silence)) - - if isinstance(input, AudioReader): - source = input - analysis_window = source.block_dur - else: - analysis_window = kwargs.get( - "analysis_window", kwargs.get("aw", DEFAULT_ANALYSIS_WINDOW) - ) - if analysis_window <= 0: - raise ValueError( - "'analysis_window' ({}) must be > 0".format(analysis_window) - ) - - params = kwargs.copy() - params["max_read"] = params.get("max_read", params.get("mr")) - params["audio_format"] = params.get("audio_format", params.get("fmt")) - if isinstance(input, AudioRegion): - params["sampling_rate"] = input.sr - params["sample_width"] = input.sw - params["channels"] = input.ch - input = bytes(input) - try: - source = AudioReader(input, block_dur=analysis_window, **params) - except TooSamllBlockDuration as exc: - err_msg = "Too small 'analysis_windows' ({0}) for sampling rate " - err_msg += "({1}). Analysis windows should at least be 1/{1} to " - err_msg += "cover one single data sample" - raise ValueError(err_msg.format(exc.block_dur, exc.sampling_rate)) - - validator = kwargs.get("validator", kwargs.get("val")) - if validator is None: - energy_threshold = kwargs.get( - "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD) - ) - use_channel = kwargs.get("use_channel", kwargs.get("uc")) - validator = AudioEnergyValidator( - energy_threshold, source.sw, source.ch, use_channel=use_channel - ) - mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0 - if strict_min_dur: - mode |= StreamTokenizer.STRICT_MIN_LENGTH - min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil) - max_length = _duration_to_nb_windows( - max_dur, analysis_window, math.floor, _EPSILON - ) - max_continuous_silence = _duration_to_nb_windows( - max_silence, analysis_window, math.floor, _EPSILON - ) - - err_msg = "({0} sec.) results in {1} analysis window(s) " - err_msg += "({1} == {6}({0} / {2})) which is {5} the number " - err_msg += "of analysis window(s) for 'max_dur' ({3} == floor({4} / {2}))" - if min_length > max_length: - err_msg = "'min_dur' " + err_msg - raise ValueError( - err_msg.format( - min_dur, - min_length, - analysis_window, - max_length, - max_dur, - "higher than", - "ceil", - ) - ) - - if max_continuous_silence >= max_length: - err_msg = "'max_silence' " + err_msg - raise ValueError( - err_msg.format( - max_silence, - max_continuous_silence, - analysis_window, - max_length, - max_dur, - "higher or equal to", - "floor", - ) - ) - - tokenizer = StreamTokenizer( - validator, min_length, max_length, max_continuous_silence, mode=mode - ) - source.open() - token_gen = tokenizer.tokenize(source, generator=True) - region_gen = ( - _make_audio_region( - token[0], - token[1], - source.block_dur, - source.sr, - source.sw, - source.ch, - ) - for token in token_gen - ) - return region_gen - - -def _duration_to_nb_windows( - duration, analysis_window, round_fn=round, epsilon=0 -): - """ - Converts a given duration into a positive integer of analysis windows. - if `duration / analysis_window` is not an integer, the result will be - rounded to the closest bigger integer. If `duration == 0`, returns `0`. - If `duration < analysis_window`, returns 1. - `duration` and `analysis_window` can be in seconds or milliseconds but - must be in the same unit. 
- - Parameters - ---------- - duration : float - a given duration in seconds or ms. - analysis_window: float - size of analysis window, in the same unit as `duration`. - round_fn : callable - function called to round the result. Default: `round`. - epsilon : float - small value to add to the division result before rounding. - E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with - `round_fn=math.floor` returns `2` instead of `3`. Adding a small value - to `0.3 / 0.1` avoids this error. - - Returns - ------- - nb_windows : int - minimum number of `analysis_window`'s to cover `durartion`. That means - that `analysis_window * nb_windows >= duration`. - """ - if duration < 0 or analysis_window <= 0: - err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0" - raise ValueError(err_msg.format(duration, analysis_window)) - if duration == 0: - return 0 - return int(round_fn(duration / analysis_window + epsilon)) - - -def _make_audio_region( - data_frames, - start_frame, - frame_duration, - sampling_rate, - sample_width, - channels, -): - """ - Helper function to create an `AudioRegion` from parameters returned by - tokenization object. It takes care of setting up region `start` and `end` - in metadata. - - Parameters - ---------- - frame_duration: float - duration of analysis window in seconds - start_frame : int - index of the fisrt analysis window - samling_rate : int - sampling rate of audio data - sample_width : int - number of bytes of one audio sample - channels : int - number of channels of audio data - - Returns - ------- - audio_region : AudioRegion - AudioRegion whose start time is calculeted as: - `1000 * start_frame * frame_duration` - """ - start = start_frame * frame_duration - data = b"".join(data_frames) - duration = len(data) / (sampling_rate * sample_width * channels) - meta = {"start": start, "end": start + duration} - return AudioRegion(data, sampling_rate, sample_width, channels, meta) - - -def _read_chunks_online(max_read, **kwargs): - """ - Helper function to read audio data from an online blocking source - (i.e., microphone). Used to build an `AudioRegion` and can intercept - KeyboardInterrupt so that reading stops as soon as this exception is - raised. Makes building `AudioRegion`s on [i]python sessions and jupyter - notebooks more user friendly. - - Parameters - ---------- - max_read : float - maximum amount of data to read in seconds. - kwargs : - audio parameters (sampling_rate, sample_width and channels). - - See also - -------- - `AudioRegion.build` - """ - reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs) - reader.open() - data = [] - try: - while True: - frame = reader.read() - if frame is None: - break - data.append(frame) - except KeyboardInterrupt: - # Stop data acquisition from microphone when pressing - # Ctrl+C on a [i]python session or a notebook - pass - reader.close() - return ( - b"".join(data), - reader.sampling_rate, - reader.sample_width, - reader.channels, - ) - - -def _read_offline(input, skip=0, max_read=None, **kwargs): - """ - Helper function to read audio data from an offline (i.e., file). Used to - build `AudioRegion`s. - - Parameters - ---------- - input : str, bytes - path to audio file (if str), or a bytes object representing raw audio - data. - skip : float, default 0 - amount of data to skip from the begining of audio source. - max_read : float, default: None - maximum amount of audio data to read. Default: None, means read until - end of stream. 
- kwargs : - audio parameters (sampling_rate, sample_width and channels). - - See also - -------- - `AudioRegion.build` - - """ - audio_source = get_audio_source(input, **kwargs) - audio_source.open() - if skip is not None and skip > 0: - skip_samples = round(skip * audio_source.sampling_rate) - audio_source.read(skip_samples) - if max_read is not None: - if max_read < 0: - max_read = None - else: - max_read = round(max_read * audio_source.sampling_rate) - data = audio_source.read(max_read) - audio_source.close() - return ( - data, - audio_source.sampling_rate, - audio_source.sample_width, - audio_source.channels, - ) - - -def _check_convert_index(index, types, err_msg): - if not isinstance(index, slice) or index.step is not None: - raise TypeError(err_msg) - start = index.start if index.start is not None else 0 - stop = index.stop - for index in (start, stop): - if index is not None and not isinstance(index, types): - raise TypeError(err_msg) - return start, stop - - -class _SecondsView: - """A class to create a view of `AudioRegion` that can be sliced using - indices in seconds. - """ - - def __init__(self, region): - self._region = region - - def __getitem__(self, index): - err_msg = "Slicing AudioRegion by seconds requires indices of type " - err_msg += "'int' or 'float' without a step (e.g. region.sec[7.5:10])" - start_s, stop_s = _check_convert_index(index, (int, float), err_msg) - sr = self._region.sampling_rate - start_sample = int(start_s * sr) - stop_sample = None if stop_s is None else round(stop_s * sr) - return self._region[start_sample:stop_sample] - - @property - def len(self): - """ - Return region duration in seconds. - """ - return self._region.duration - - -class _MillisView(_SecondsView): - """A class to create a view of `AudioRegion` that can be sliced using - indices in milliseconds. - """ - - def __getitem__(self, index): - err_msg = ( - "Slicing AudioRegion by milliseconds requires indices of type " - ) - err_msg += "'int' without a step (e.g. region.sec[500:1500])" - start_ms, stop_ms = _check_convert_index(index, (int), err_msg) - start_sec = start_ms / 1000 - stop_sec = None if stop_ms is None else stop_ms / 1000 - index = slice(start_sec, stop_sec) - return super(_MillisView, self).__getitem__(index) - - def __len__(self): - """ - Return region duration in milliseconds. - """ - return round(self._region.duration * 1000) - - @property - def len(self): - """ - Return region duration in milliseconds. - """ - return len(self) - - -class _AudioRegionMetadata(dict): - """A class to store `AudioRegion`'s metadata.""" - - def __getattr__(self, name): - if name in self: - return self[name] - else: - err_msg = "AudioRegion metadata has no entry '{}'" - raise AttributeError(err_msg.format(name)) - - def __setattr__(self, name, value): - self[name] = value - - def __str__(self): - return "\n".join("{}: {}".format(k, v) for k, v in self.items()) - - def __repr__(self): - return str(self) - - -class AudioRegion(object): - """ - AudioRegion encapsulates raw audio data and provides an interface to - perform simple operations on it. Use `AudioRegion.load` to build an - `AudioRegion` from different types of objects. - - Parameters - ---------- - data : bytes - raw audio data as a bytes object - sampling_rate : int - sampling rate of audio data - sample_width : int - number of bytes of one audio sample - channels : int - number of channels of audio data - meta : dict, default: None - any collection of elements used to build metadata for - this `AudioRegion`. 
Meta data can be accessed via `region.meta.key` - if `key` is a valid python attribute name, or via `region.meta[key]` - if not. Note that the :func:`split` function (or the - :meth:`AudioRegion.split` method) returns `AudioRegions` with a ``start`` - and a ``stop`` meta values that indicate the location in seconds of the - region in original audio data. - - See also - -------- - AudioRegion.load - - """ - - def __init__(self, data, sampling_rate, sample_width, channels, meta=None): - check_audio_data(data, sample_width, channels) - self._data = data - self._sampling_rate = sampling_rate - self._sample_width = sample_width - self._channels = channels - self._samples = None - self.splitp = self.split_and_plot - - if meta is not None: - self._meta = _AudioRegionMetadata(meta) - else: - self._meta = None - - self._seconds_view = _SecondsView(self) - self.sec = self.seconds - self.s = self.seconds - - self._millis_view = _MillisView(self) - self.ms = self.millis - - @property - def meta(self): - return self._meta - - @meta.setter - def meta(self, new_meta): - """Meta data of audio region.""" - self._meta = _AudioRegionMetadata(new_meta) - - @classmethod - def load(cls, input, skip=0, max_read=None, **kwargs): - """ - Create an `AudioRegion` by loading data from `input`. See :func:`load` - for parameters descripion. - - Returns - ------- - region: AudioRegion - - Raises - ------ - ValueError - raised if `input` is None and `skip` != 0 or `max_read` is None. - """ - if input is None: - if skip > 0: - raise ValueError( - "'skip' should be 0 when reading from microphone" - ) - if max_read is None or max_read < 0: - raise ValueError( - "'max_read' should not be None when reading from " - "microphone" - ) - data, sampling_rate, sample_width, channels = _read_chunks_online( - max_read, **kwargs - ) - else: - data, sampling_rate, sample_width, channels = _read_offline( - input, skip=skip, max_read=max_read, **kwargs - ) - - return cls(data, sampling_rate, sample_width, channels) - - @property - def seconds(self): - """ - A view to slice audio region by seconds (using ``region.seconds[start:end]``). - """ - return self._seconds_view - - @property - def millis(self): - """A view to slice audio region by milliseconds (using ``region.millis[start:end]``).""" - return self._millis_view - - @property - def duration(self): - """ - Returns region duration in seconds. - """ - return len(self._data) / ( - self.sampling_rate * self.sample_width * self.channels - ) - - @property - def sampling_rate(self): - """Samling rate of audio data.""" - return self._sampling_rate - - @property - def sr(self): - """Samling rate of audio data, alias for `sampling_rate`.""" - return self._sampling_rate - - @property - def sample_width(self): - """Number of bytes per sample, one channel considered.""" - return self._sample_width - - @property - def sw(self): - """Number of bytes per sample, alias for `sampling_rate`.""" - return self._sample_width - - @property - def channels(self): - """Number of channels of audio data.""" - return self._channels - - @property - def ch(self): - """Number of channels of audio data, alias for `channels`.""" - return self._channels - - def play(self, progress_bar=False, player=None, **progress_bar_kwargs): - """ - Play audio region. - - Parameters - ---------- - progress_bar : bool, default: False - whether to use a progress bar while playing audio. Default: False. - `progress_bar` requires `tqdm`, if not installed, no progress bar - will be shown. 
- player : AudioPalyer, default: None - audio player to use. if None (default), use `player_for()` - to get a new audio player. - progress_bar_kwargs : kwargs - keyword arguments to pass to `tqdm` progress_bar builder (e.g., - use `leave=False` to clean up the screen when play finishes). - """ - if player is None: - player = player_for(self) - player.play( - self._data, progress_bar=progress_bar, **progress_bar_kwargs - ) - - def save(self, file, audio_format=None, exists_ok=True, **audio_parameters): - """ - Save audio region to file. - - Parameters - ---------- - file : str - path to output audio file. May contain `{duration}` placeholder - as well as any place holder that this region's metadata might - contain (e.g., regions returned by `split` contain metadata with - `start` and `end` attributes that can be used to build output file - name as `{meta.start}` and `{meta.end}`. See examples using - placeholders with formatting. - - audio_format : str, default: None - format used to save audio data. If None (default), format is guessed - from file name's extension. If file name has no extension, audio - data is saved as a raw (headerless) audio file. - exists_ok : bool, default: True - If True, overwrite `file` if a file with the same name exists. - If False, raise an `IOError` if `file` exists. - audio_parameters: dict - any keyword arguments to be passed to audio saving backend. - - Returns - ------- - file: str - name of output file with replaced placehoders. - Raises - IOError if `file` exists and `exists_ok` is False. - - - Examples - -------- - >>> region = AudioRegion(b'\\0' * 2 * 24000, - >>> sampling_rate=16000, - >>> sample_width=2, - >>> channels=1) - >>> region.meta.start = 2.25 - >>> region.meta.end = 2.25 + region.duration - >>> region.save('audio_{meta.start}-{meta.end}.wav') - >>> audio_2.25-3.75.wav - >>> region.save('region_{meta.start:.3f}_{duration:.3f}.wav') - audio_2.250_1.500.wav - """ - if isinstance(file, str): - file = file.format(duration=self.duration, meta=self.meta) - if not exists_ok and os.path.exists(file): - raise FileExistsError("file '{file}' exists".format(file=file)) - to_file( - self._data, - file, - audio_format, - sr=self.sr, - sw=self.sw, - ch=self.ch, - audio_parameters=audio_parameters, - ) - return file - - def split( - self, - min_dur=0.2, - max_dur=5, - max_silence=0.3, - drop_trailing_silence=False, - strict_min_dur=False, - **kwargs - ): - """Split audio region. See :func:`auditok.split()` for a comprehensive - description of split parameters. - See Also :meth:`AudioRegio.split_and_plot`. - """ - if kwargs.get("max_read", kwargs.get("mr")) is not None: - warn_msg = "'max_read' (or 'mr') should not be used with " - warn_msg += "AudioRegion.split_and_plot(). You should rather " - warn_msg += "slice audio region before calling this method" - raise RuntimeWarning(warn_msg) - return split( - self, - min_dur=min_dur, - max_dur=max_dur, - max_silence=max_silence, - drop_trailing_silence=drop_trailing_silence, - strict_min_dur=strict_min_dur, - **kwargs - ) - - def plot( - self, - scale_signal=True, - show=True, - figsize=None, - save_as=None, - dpi=120, - theme="auditok", - ): - """Plot audio region, one sub-plot for each channel. - - Parameters - ---------- - scale_signal : bool, default: True - if true, scale signal by subtracting its mean and dividing by its - standard deviation before plotting. - show : bool - whether to show plotted signal right after the call. 
- figsize : tuple, default: None - width and height of the figure to pass to `matplotlib`. - save_as : str, default None. - if provided, also save plot to file. - dpi : int, default: 120 - plot dpi to pass to `matplotlib`. - theme : str or dict, default: "auditok" - plot theme to use. Currently only "auditok" theme is implemented. To - provide you own them see :attr:`auditok.plotting.AUDITOK_PLOT_THEME`. - """ - try: - from auditok.plotting import plot - - plot( - self, - scale_signal=scale_signal, - show=show, - figsize=figsize, - save_as=save_as, - dpi=dpi, - theme=theme, - ) - except ImportError: - raise RuntimeWarning("Plotting requires matplotlib") - - def split_and_plot( - self, - min_dur=0.2, - max_dur=5, - max_silence=0.3, - drop_trailing_silence=False, - strict_min_dur=False, - scale_signal=True, - show=True, - figsize=None, - save_as=None, - dpi=120, - theme="auditok", - **kwargs - ): - """Split region and plot signal and detections. Alias: :meth:`splitp`. - See :func:`auditok.split()` for a comprehensive description of split - parameters. Also see :meth:`plot` for plot parameters. - """ - try: - from auditok.plotting import plot - - regions = self.split( - min_dur=min_dur, - max_dur=max_dur, - max_silence=max_silence, - drop_trailing_silence=drop_trailing_silence, - strict_min_dur=strict_min_dur, - **kwargs - ) - regions = list(regions) - detections = ((reg.meta.start, reg.meta.end) for reg in regions) - eth = kwargs.get( - "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD) - ) - plot( - self, - scale_signal=scale_signal, - detections=detections, - energy_threshold=eth, - show=show, - figsize=figsize, - save_as=save_as, - dpi=dpi, - theme=theme, - ) - return regions - except ImportError: - raise RuntimeWarning("Plotting requires matplotlib") +This module gathers processing (i.e. tokenization) classes. - def __array__(self): - return self.samples - - @property - def samples(self): - """Audio region as arrays of samples, one array per channel.""" - if self._samples is None: - self._samples = signal.to_array( - self._data, self.sample_width, self.channels - ) - return self._samples - - def __len__(self): - """ - Return region length in number of samples. - """ - return len(self._data) // (self.sample_width * self.channels) - - @property - def len(self): - """ - Return region length in number of samples. - """ - return len(self) - - def __bytes__(self): - return self._data - - def __str__(self): - return ( - "AudioRegion(duration={:.3f}, " - "sampling_rate={}, sample_width={}, channels={})".format( - self.duration, self.sr, self.sw, self.ch - ) - ) - - def __repr__(self): - return str(self) - - def __add__(self, other): - """ - Concatenates this region and `other` and return a new region. - Both regions must have the same sampling rate, sample width - and number of channels. If not, raises a `ValueError`. 
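# Editor's note: a short usage sketch of the region arithmetic defined in
# this class (assuming the 0.2.0 API that this hunk removes): `+`
# concatenates regions with identical audio parameters, `*` repeats, and
# `/` splits into equal sub-regions.
from auditok import AudioRegion

one_sec = AudioRegion(b"\0" * 32000, sampling_rate=16000, sample_width=2, channels=1)
assert (one_sec + one_sec).duration == 2.0          # concatenation
assert sum([one_sec, one_sec]).duration == 2.0      # __radd__ lets sum() start at 0
assert (one_sec * 3).duration == 3.0                # repetition
assert [r.duration for r in one_sec / 2] == [0.5, 0.5]  # equal sub-regions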
- """ - if not isinstance(other, AudioRegion): - raise TypeError( - "Can only concatenate AudioRegion, " - 'not "{}"'.format(type(other)) - ) - if other.sr != self.sr: - raise ValueError( - "Can only concatenate AudioRegions of the same " - "sampling rate ({} != {})".format(self.sr, other.sr) - ) - if other.sw != self.sw: - raise ValueError( - "Can only concatenate AudioRegions of the same " - "sample width ({} != {})".format(self.sw, other.sw) - ) - if other.ch != self.ch: - raise ValueError( - "Can only concatenate AudioRegions of the same " - "number of channels ({} != {})".format(self.ch, other.ch) - ) - data = self._data + other._data - return AudioRegion(data, self.sr, self.sw, self.ch) - - def __radd__(self, other): - """ - Concatenates `other` and this region. `other` should be an - `AudioRegion` with the same audio parameters as this region - but can exceptionally be `0` to make it possible to concatenate - many regions with `sum`. - """ - if other == 0: - return self - return other.add(self) +Class summary +============= - def __mul__(self, n): - if not isinstance(n, int): - err_msg = "Can't multiply AudioRegion by a non-int of type '{}'" - raise TypeError(err_msg.format(type(n))) - data = self._data * n - return AudioRegion(data, self.sr, self.sw, self.ch) - - def __rmul__(self, n): - return self * n - - def __truediv__(self, n): - if not isinstance(n, int) or n <= 0: - raise TypeError("AudioRegion can only be divided by a positive int") - samples_per_sub_region, rest = divmod(len(self), n) - onset = 0 - sub_regions = [] - while onset < len(self): - offset = 0 - if rest > 0: - offset = 1 - rest -= 1 - offset += onset + samples_per_sub_region - sub_regions.append(self[onset:offset]) - onset = offset - return sub_regions - - def __eq__(self, other): - if other is self: - return True - if not isinstance(other, AudioRegion): - return False - return ( - (self._data == other._data) - and (self.sr == other.sr) - and (self.sw == other.sw) - and (self.ch == other.ch) - ) - - def __getitem__(self, index): - err_msg = "Slicing AudioRegion by samples requires indices of type " - err_msg += "'int' without a step (e.g. region.sec[1600:3200])" - start_sample, stop_sample = _check_convert_index(index, (int), err_msg) - - bytes_per_sample = self.sample_width * self.channels - len_samples = len(self._data) // bytes_per_sample +.. autosummary:: - if start_sample < 0: - start_sample = max(start_sample + len_samples, 0) - onset = start_sample * bytes_per_sample + StreamTokenizer +""" - if stop_sample is not None: - if stop_sample < 0: - stop_sample = max(stop_sample + len_samples, 0) - offset = index.stop * bytes_per_sample - else: - offset = None +from auditok.util import DataValidator - data = self._data[onset:offset] - return AudioRegion(data, self.sr, self.sw, self.ch) +__all__ = ["StreamTokenizer"] -class StreamTokenizer: +class StreamTokenizer(): """ Class for stream tokenizers. It implements a 4-state automaton scheme to extract sub-sequences of interest on the fly. - - Parameters - ---------- - validator : callable, DataValidator (must implement `is_valid`) - called with each data frame read from source. Should take one positional - argument and return True or False for valid and invalid frames - respectively. - - min_length : int - Minimum number of frames of a valid token. This includes all - tolerated non valid frames within the token. - - max_length : int - Maximum number of frames of a valid token. This includes all - tolerated non valid frames within the token. 
- 
-    max_continuous_silence : int
-        Maximum number of consecutive non-valid frames within a token.
-        Note that, within a valid token, there may be many tolerated
-        *silent* regions that contain each a number of non valid frames up
-        to `max_continuous_silence`
-
-    init_min : int
-        Minimum number of consecutive valid frames that must be
-        **initially** gathered before any sequence of non valid frames can
-        be tolerated. This option is not always needed, it can be used to
-        drop non-valid tokens as early as possible. **Default = 0** means
-        that the option is by default ineffective.
-
-    init_max_silence : int
-        Maximum number of tolerated consecutive non-valid frames if the
-        number of already gathered valid frames has not yet reached
-        'init_min'. This argument is normally used if `init_min` is used.
-        **Default = 0**, by default this argument is not taken into
-        consideration.
-
-    mode : int
-        mode can be one of the following:
-
-        -1 `StreamTokenizer.NORMAL` : do not drop trailing silence, and
-        accept a token shorter than `min_length` if it is the continuation
-        of the latest delivered token.
-
-        -2 `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered
-        because `max_length` is reached, and token `i+1` is immediately
-        adjacent to token `i` (i.e. token `i` ends at frame `k` and token
-        `i+1` starts at frame `k+1`) then accept token `i+1` only if it has
-        a size of at least `min_length`. The default behavior is to accept
-        token `i+1` even if it is shorter than `min_length` (provided that
-        the above conditions are fulfilled of course).
-
-        -3 `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing
-        non-valid frames from a token to be delivered if and only if it
-        is not **truncated**. This can be a bit tricky. A token is actually
-        delivered if:
-
-        - `max_continuous_silence` is reached.
-
-        - Its length reaches `max_length`. This is referred to as a
-          **truncated** token.
-
-        In the current implementation, a `StreamTokenizer`'s decision is only
-        based on already seen data and on incoming data. Thus, if a token is
-        truncated at a non-valid but tolerated frame (`max_length` is reached
-        but `max_continuous_silence` not yet) any trailing silence will be kept
-        because it can potentially be part of a valid token (if `max_length` was
-        bigger). But if `max_continuous_silence` is reached before
-        `max_length`, the delivered token will not be considered as truncated
-        but a result of *normal* end of detection (i.e. no more valid data).
-        In that case the trailing silence can be removed if you use the
-        `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
-
-        -4 `(StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE)`:
-        use both options. That means: first remove trailing silence, then
-        check if the token still has a length of at least `min_length`.
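# Editor's note: the mode values above are plain bit flags, so combining
# them is a bitwise OR; a minimal sketch (valid for both versions of this
# class, since the constants keep the same values):
from auditok.core import StreamTokenizer

mode = StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE
assert mode & StreamTokenizer.STRICT_MIN_LENGTH      # 2
assert mode & StreamTokenizer.DROP_TRAILING_SILENCE  # 4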
- - - - - Examples - -------- - - In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is - accepted although it is shorter than `min_length` (3), because it - immediately follows the latest delivered token: - - >>> from auditok.core import StreamTokenizer - >>> from StringDataSource, DataValidator - - >>> class UpperCaseChecker(DataValidator): - >>> def is_valid(self, frame): - return frame.isupper() - >>> dsource = StringDataSource("aaaAAAABBbbb") - >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(), - min_length=3, - max_length=4, - max_continuous_silence=0) - >>> tokenizer.tokenize(dsource) - [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)] - - - The following tokenizer will however reject the 'BB' token: - - >>> dsource = StringDataSource("aaaAAAABBbbb") - >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(), - min_length=3, max_length=4, - max_continuous_silence=0, - mode=StreamTokenizer.STRICT_MIN_LENGTH) - >>> tokenizer.tokenize(dsource) - [(['A', 'A', 'A', 'A'], 3, 6)] - - - - >>> tokenizer = StreamTokenizer( - >>> validator=UpperCaseChecker(), - >>> min_length=3, - >>> max_length=6, - >>> max_continuous_silence=3, - >>> mode=StreamTokenizer.DROP_TRAILING_SILENCE - >>> ) - >>> dsource = StringDataSource("aaaAAAaaaBBbbbb") - >>> tokenizer.tokenize(dsource) - [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)] - - The first token is delivered with its tailing silence because it is - truncated while the second one has its tailing frames removed. - - Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be: - - .. code:: python - - [ - (['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), - (['B', 'B', 'b', 'b', 'b'], 9, 13) - ] - + + :Parameters: + + `validator` : + instance of `DataValidator` that implements `is_valid` method. + + `min_length` : *(int)* + Minimum number of frames of a valid token. This includes all \ + tolerated non valid frames within the token. + + `max_length` : *(int)* + Maximum number of frames of a valid token. This includes all \ + tolerated non valid frames within the token. + + `max_continuous_silence` : *(int)* + Maximum number of consecutive non-valid frames within a token. + Note that, within a valid token, there may be many tolerated \ + *silent* regions that contain each a number of non valid frames up to \ + `max_continuous_silence` + + `init_min` : *(int, default=0)* + Minimum number of consecutive valid frames that must be **initially** \ + gathered before any sequence of non valid frames can be tolerated. This + option is not always needed, it can be used to drop non-valid tokens as + early as possible. **Default = 0** means that the option is by default + ineffective. + + `init_max_silence` : *(int, default=0)* + Maximum number of tolerated consecutive non-valid frames if the \ + number already gathered valid frames has not yet reached 'init_min'. + This argument is normally used if `init_min` is used. **Default = 0**, + by default this argument is not taken into consideration. + + `mode` : *(int, default=0)* + `mode` can be: + + 1. `StreamTokenizer.STRICT_MIN_LENGTH`: + if token *i* is delivered because `max_length` + is reached, and token *i+1* is immediately adjacent to + token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts + at frame *k+1*) then accept token *i+1* only of it has a size of at + least `min_length`. The default behavior is to accept token *i+1* + event if it is shorter than `min_length` (given that the above conditions + are fulfilled of course). 
+ + :Examples: + + In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is + accepted although it is shorter than `min_length` (3), because it immediately + follows the latest delivered token: + + .. code:: python + + from auditok import StreamTokenizer, StringDataSource, DataValidator + + class UpperCaseChecker(DataValidator): + def is_valid(self, frame): + return frame.isupper() + + + dsource = StringDataSource("aaaAAAABBbbb") + tokenizer = StreamTokenizer(validator=UpperCaseChecker(), + min_length=3, + max_length=4, + max_continuous_silence=0) + + tokenizer.tokenize(dsource) + + + :output: + + .. code:: python + + [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)] + + + The following tokenizer will however reject the 'BB' token: + + .. code:: python + + dsource = StringDataSource("aaaAAAABBbbb") + tokenizer = StreamTokenizer(validator=UpperCaseChecker(), + min_length=3, max_length=4, + max_continuous_silence=0, + mode=StreamTokenizer.STRICT_MIN_LENGTH) + tokenizer.tokenize(dsource) + + :output: + + .. code:: python + + [(['A', 'A', 'A', 'A'], 3, 6)] + + + 2. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all tailing non-valid frames + from a token to be delivered if and only if it is not **truncated**. + This can be a bit tricky. A token is actually delivered if: + + - a. `max_continuous_silence` is reached + + :or: + + - b. Its length reaches `max_length`. This is called a **truncated** token + + In the current implementation, a `StreamTokenizer`'s decision is only based on already seen + data and on incoming data. Thus, if a token is truncated at a non-valid but tolerated + frame (`max_length` is reached but `max_continuous_silence` not yet) any tailing + silence will be kept because it can potentially be part of valid token (if `max_length` + was bigger). But if `max_continuous_silence` is reached before `max_length`, the delivered + token will not be considered as truncated but a result of *normal* end of detection + (i.e. no more valid data). In that case the tailing silence can be removed if you use + the `StreamTokenizer.DROP_TRAILING_SILENCE` mode. + + :Example: + + .. code:: python + + tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3, + max_length=6, max_continuous_silence=3, + mode=StreamTokenizer.DROP_TRAILING_SILENCE) + + dsource = StringDataSource("aaaAAAaaaBBbbbb") + tokenizer.tokenize(dsource) + + :output: + + .. code:: python + + [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)] + + The first token is delivered with its tailing silence because it is truncated + while the second one has its tailing frames removed. + + Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be: + + .. code:: python + + [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)] + + + + 3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`: + use both options. That means: first remove tailing silence, then ckeck if the + token still has at least a length of `min_length`. 
""" - + + SILENCE = 0 POSSIBLE_SILENCE = 1 - POSSIBLE_NOISE = 2 + POSSIBLE_NOISE = 2 NOISE = 3 - NORMAL = 0 + STRICT_MIN_LENGTH = 2 DROP_TRAILING_SILENCE = 4 - - def __init__( - self, - validator, - min_length, - max_length, - max_continuous_silence, - init_min=0, - init_max_silence=0, - mode=0, - ): - if callable(validator): - self._is_valid = validator - elif isinstance(validator, DataValidator): - self._is_valid = validator.is_valid - else: - raise TypeError( - "'validator' must be a callable or an instance of " - "DataValidator" - ) - + # alias + DROP_TAILING_SILENCE = 4 + + def __init__(self, validator, + min_length, max_length, max_continuous_silence, + init_min=0, init_max_silence=0, + mode=0): + + if not isinstance(validator, DataValidator): + raise TypeError("'validator' must be an instance of 'DataValidator'") + if max_length <= 0: - raise ValueError( - "'max_length' must be > 0 (value={0})".format(max_length) - ) - + raise ValueError("'max_length' must be > 0 (value={0})".format(max_length)) + if min_length <= 0 or min_length > max_length: - err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})" - raise ValueError(err_msg.format(min_length)) - + raise ValueError("'min_length' must be > 0 and <= 'max_length' (value={0})".format(min_length)) + if max_continuous_silence >= max_length: - err_msg = "'max_continuous_silence' must be < 'max_length' " - err_msg += "(value={0})" - raise ValueError(err_msg.format(max_continuous_silence)) - + raise ValueError("'max_continuous_silence' must be < 'max_length' (value={0})".format(max_continuous_silence)) + if init_min >= max_length: - raise ValueError( - "'init_min' must be < 'max_length' (value={0})".format( - max_continuous_silence - ) - ) - + raise ValueError("'init_min' must be < 'max_length' (value={0})".format(max_continuous_silence)) + self.validator = validator self.min_length = min_length self.max_length = max_length self.max_continuous_silence = max_continuous_silence self.init_min = init_min self.init_max_silent = init_max_silence - self._set_mode(mode) + + self._mode = None + self.set_mode(mode) + self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0 + self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 + self._deliver = None self._tokens = None self._state = None self._data = None self._contiguous_token = False + self._init_count = 0 self._silence_length = 0 self._start_frame = 0 self._current_frame = 0 - - def _set_mode(self, mode): - strict_min_and_drop_trailing = StreamTokenizer.STRICT_MIN_LENGTH - strict_min_and_drop_trailing |= StreamTokenizer.DROP_TRAILING_SILENCE - if mode not in [ - StreamTokenizer.NORMAL, - StreamTokenizer.STRICT_MIN_LENGTH, - StreamTokenizer.DROP_TRAILING_SILENCE, - strict_min_and_drop_trailing, - ]: + + def set_mode(self, mode): + """ + :Parameters: + + `mode` : *(int)* + New mode, must be one of: + + + - `StreamTokenizer.STRICT_MIN_LENGTH` + + - `StreamTokenizer.DROP_TRAILING_SILENCE` + + - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE` + + - `0` + + See `StreamTokenizer.__init__` for more information about the mode. 
+ """ + + if not mode in [self.STRICT_MIN_LENGTH, self.DROP_TRAILING_SILENCE, + self.STRICT_MIN_LENGTH | self.DROP_TRAILING_SILENCE, 0]: + raise ValueError("Wrong value for mode") + self._mode = mode self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0 - self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 - + self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 + + + def get_mode(self): + """ + Return the current mode. To check whether a specific mode is activated use + the bitwise 'and' operator `&`. Example: + + .. code:: python + + if mode & self.STRICT_MIN_LENGTH != 0: + do_something() + """ + return self._mode + def _reinitialize(self): self._contiguous_token = False self._data = [] @@ -1269,114 +266,112 @@ class StreamTokenizer: self._state = self.SILENCE self._current_frame = -1 self._deliver = self._append_token - - def tokenize(self, data_source, callback=None, generator=False): + + + def tokenize(self, data_source, callback=None): """ - Read data from `data_source`, one frame a time, and process the read - frames in order to detect sequences of frames that make up valid - tokens. - + Read data from `data_source`, one frame a time, and process the read frames in + order to detect sequences of frames that make up valid tokens. + :Parameters: - `data_source` : instance of the :class:`DataSource` class that - implements a `read` method. 'read' should return a slice of - signal, i.e. frame (of whatever type as long as it can be - processed by validator) and None if there is no more signal. - + `data_source` : instance of the :class:`DataSource` class that implements a `read` method. + 'read' should return a slice of signal, i.e. frame (of whatever \ + type as long as it can be processed by validator) and None if \ + there is no more signal. + `callback` : an optional 3-argument function. - If a `callback` function is given, it will be called each time - a valid token is found. - - + If a `callback` function is given, it will be called each time a valid token + is found. + + :Returns: - A list of tokens if `callback` is None. Each token is tuple with the - following elements: - + A list of tokens if `callback` is None. Each token is tuple with the following elements: + .. code python - + (data, start, end) - - where `data` is a list of read frames, `start`: index of the first - frame in the original data and `end` : index of the last frame. + + where `data` is a list of read frames, `start`: index of the first frame in the + original data and `end` : index of the last frame. 
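# Editor's note: a usage sketch of the callback form described above; the
# callback receives the same (data, start, end) triple that would otherwise
# be collected into the returned list. UpperCaseChecker mirrors the
# validator used in the class docstring examples.
from auditok.core import StreamTokenizer
from auditok.util import StringDataSource, DataValidator

class UpperCaseChecker(DataValidator):
    def is_valid(self, frame):
        return frame.isupper()

def on_token(data, start, end):
    print("token %r spans frames %d..%d" % ("".join(data), start, end))

tokenizer = StreamTokenizer(UpperCaseChecker(), min_length=3,
                            max_length=4, max_continuous_silence=0)
tokenizer.tokenize(StringDataSource("aaaAAAABBbbb"), callback=on_token)
# prints: token 'AAAA' spans frames 3..6, then token 'BB' spans frames 7..8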
+ """ - token_gen = self._iter_tokens(data_source) - if callback: - for token in token_gen: - callback(*token) - return - if generator: - return token_gen - return list(token_gen) - - def _iter_tokens(self, data_source): + self._reinitialize() + + if callback is not None: + self._deliver = callback + while True: - frame = data_source.read() - self._current_frame += 1 + frame = data_source.read() if frame is None: - token = self._post_process() - if token is not None: - yield token break - token = self._process(frame) - if token is not None: - yield token - - def _process(self, frame): # noqa: C901 - - frame_is_valid = self._is_valid(frame) - + self._current_frame += 1 + self._process(frame) + + self._post_process() + + if callback is None: + _ret = self._tokens + self._tokens = None + return _ret + + + def _process(self, frame): + + frame_is_valid = self.validator.is_valid(frame) + if self._state == self.SILENCE: - + if frame_is_valid: # seems we got a valid frame after a silence self._init_count = 1 self._silence_length = 0 self._start_frame = self._current_frame self._data.append(frame) - - if self._init_count >= self.init_min: + + if self._init_count >= self.init_min: self._state = self.NOISE if len(self._data) >= self.max_length: - return self._process_end_of_detection(True) + self._process_end_of_detection(True) else: self._state = self.POSSIBLE_NOISE - + elif self._state == self.POSSIBLE_NOISE: - + if frame_is_valid: self._silence_length = 0 self._init_count += 1 self._data.append(frame) - if self._init_count >= self.init_min: + if self._init_count >= self.init_min: self._state = self.NOISE if len(self._data) >= self.max_length: - return self._process_end_of_detection(True) - - else: + self._process_end_of_detection(True) + + else: self._silence_length += 1 - if ( - self._silence_length > self.init_max_silent - or len(self._data) + 1 >= self.max_length - ): + if self._silence_length > self.init_max_silent or \ + len(self._data) + 1 >= self.max_length: # either init_max_silent or max_length is reached # before _init_count, back to silence self._data = [] self._state = self.SILENCE else: self._data.append(frame) - + + elif self._state == self.NOISE: - + if frame_is_valid: self._data.append(frame) if len(self._data) >= self.max_length: - return self._process_end_of_detection(True) - - elif self.max_continuous_silence <= 0: - # max token reached at this frame will _deliver if - # _contiguous_token and not _strict_min_length + self._process_end_of_detection(True) + + elif self.max_continuous_silence <= 0 : + # max token reached at this frame will _deliver if _contiguous_token + # and not _strict_min_length + self._process_end_of_detection() self._state = self.SILENCE - return self._process_end_of_detection() + else: # this is the first silent frame following a valid one # and it is tolerated @@ -1384,63 +379,61 @@ class StreamTokenizer: self._data.append(frame) self._state = self.POSSIBLE_SILENCE if len(self._data) == self.max_length: - return self._process_end_of_detection(True) - # don't reset _silence_length because we still + self._process_end_of_detection(True) + # don't reset _silence_length because we still # need to know the total number of silent frames - + + + elif self._state == self.POSSIBLE_SILENCE: - + if frame_is_valid: self._data.append(frame) self._silence_length = 0 self._state = self.NOISE if len(self._data) >= self.max_length: - return self._process_end_of_detection(True) - + self._process_end_of_detection(True) + else: if self._silence_length >= 
self.max_continuous_silence: - self._state = self.SILENCE if self._silence_length < len(self._data): - # _deliver only gathered frames aren't all silent - return self._process_end_of_detection() - self._data = [] + # _deliver only gathered frames aren't all silent + self._process_end_of_detection() + else: + self._data = [] + self._state = self.SILENCE self._silence_length = 0 else: self._data.append(frame) self._silence_length += 1 if len(self._data) >= self.max_length: - return self._process_end_of_detection(True) - # don't reset _silence_length because we still + self._process_end_of_detection(True) + # don't reset _silence_length because we still # need to know the total number of silent frames - + + def _post_process(self): if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE: if len(self._data) > 0 and len(self._data) > self._silence_length: - return self._process_end_of_detection() - + self._process_end_of_detection() + + def _process_end_of_detection(self, truncated=False): - - if ( - not truncated - and self._drop_trailing_silence - and self._silence_length > 0 - ): + + if not truncated and self._drop_tailing_silence and self._silence_length > 0: # happens if max_continuous_silence is reached # or max_length is reached at a silent frame - self._data = self._data[0 : -self._silence_length] - - if (len(self._data) >= self.min_length) or ( - len(self._data) > 0 - and not self._strict_min_length - and self._contiguous_token - ): - - start_frame = self._start_frame - end_frame = self._start_frame + len(self._data) - 1 - data = self._data - self._data = [] - token = (data, start_frame, end_frame) - + self._data = self._data[0: - self._silence_length] + + if (len(self._data) >= self.min_length) or \ + (len(self._data) > 0 and \ + not self._strict_min_length and self._contiguous_token): + + + + _end_frame = self._start_frame + len(self._data) - 1 + self._deliver(self._data, self._start_frame, _end_frame) + if truncated: # next token (if any) will start at _current_frame + 1 self._start_frame = self._current_frame + 1 @@ -1448,11 +441,12 @@ class StreamTokenizer: self._contiguous_token = True else: self._contiguous_token = False - return token else: - self._contiguous_token = False - + self._contiguous_token = False + self._data = [] - + + + def _append_token(self, data, start, end): self._tokens.append((data, start, end)) diff --git a/libs/auditok/dataset.py b/libs/auditok/dataset.py index 98dc5d1d4..1a3a7af5c 100644 --- a/libs/auditok/dataset.py +++ b/libs/auditok/dataset.py @@ -1,31 +1,19 @@ """ -This module contains links to audio files that can be used for test purposes. - -.. autosummary:: - :toctree: generated/ - - one_to_six_arabic_16000_mono_bc_noise - was_der_mensch_saet_mono_44100_lead_trail_silence +This module contains links to audio files you can use for test purposes. 
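# Editor's note: a one-liner showing how these bundled sample files are
# meant to be used; each module attribute is an absolute path to a wave
# file shipped with the package.
from auditok import dataset

print(dataset.one_to_six_arabic_16000_mono_bc_noise)  # absolute path to the bundled .wav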
""" import os -__all__ = [ - "one_to_six_arabic_16000_mono_bc_noise", - "was_der_mensch_saet_mono_44100_lead_trail_silence", -] +__all__ = ["one_to_six_arabic_16000_mono_bc_noise", "was_der_mensch_saet_mono_44100_lead_trail_silence"] _current_dir = os.path.dirname(os.path.realpath(__file__)) one_to_six_arabic_16000_mono_bc_noise = "{cd}{sep}data{sep}1to6arabic_\ -16000_mono_bc_noise.wav".format( - cd=_current_dir, sep=os.path.sep -) +16000_mono_bc_noise.wav".format(cd=_current_dir, sep=os.path.sep) """A wave file that contains a pronunciation of Arabic numbers from 1 to 6""" + was_der_mensch_saet_mono_44100_lead_trail_silence = "{cd}{sep}data{sep}was_\ der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_trail_\ -silence.wav".format( - cd=_current_dir, sep=os.path.sep -) -"""A wave file that contains a sentence with a long leading and trailing silence""" +silence.wav".format(cd=_current_dir, sep=os.path.sep) +""" A wave file that contains a sentence between long leading and trailing periods of silence""" \ No newline at end of file diff --git a/libs/auditok/exceptions.py b/libs/auditok/exceptions.py index 7bc5054ee..0026a9d89 100644 --- a/libs/auditok/exceptions.py +++ b/libs/auditok/exceptions.py @@ -1,41 +1,9 @@ +""" +November 2015 +@author: Amine SEHILI +""" + class DuplicateArgument(Exception): pass -class TooSamllBlockDuration(ValueError): - """Raised when block_dur results in a block_size smaller than one sample.""" - - def __init__(self, message, block_dur, sampling_rate): - self.block_dur = block_dur - self.sampling_rate = sampling_rate - super(TooSamllBlockDuration, self).__init__(message) - - -class TimeFormatError(Exception): - """Raised when a duration formatting directive is unknown.""" - - -class EndOfProcessing(Exception): - """Raised within command line script's main function to jump to - postprocessing code.""" - - -class AudioIOError(Exception): - """Raised when a compressed audio file cannot be loaded or when trying - to read from a not yet open AudioSource""" - - -class AudioParameterError(AudioIOError): - """Raised when one audio parameter is missing when loading raw data or - saving data to a format other than raw. Also raised when an audio - parameter has a wrong value.""" - - -class AudioEncodingError(Exception): - """Raised if audio data can not be encoded in the provided format""" - - -class AudioEncodingWarning(RuntimeWarning): - """Raised if audio data can not be encoded in the provided format - but saved as wav. - """ diff --git a/libs/auditok/io.py b/libs/auditok/io.py index b5fb61a76..665ab274d 100644 --- a/libs/auditok/io.py +++ b/libs/auditok/io.py @@ -1,1021 +1,499 @@ """ Module for low-level audio input-output operations. -.. autosummary:: - :toctree: generated/ +Class summary +============= - AudioSource - Rewindable - BufferAudioSource - WaveAudioSource - PyAudioSource - StdinAudioSource - PyAudioPlayer - from_file - to_file - player_for -""" -import os -import sys -import wave -import warnings -from abc import ABC, abstractmethod -from functools import partial -from .exceptions import AudioIOError, AudioParameterError +.. autosummary:: -try: - from pydub import AudioSegment + AudioSource + Rewindable + BufferAudioSource + WaveAudioSource + PyAudioSource + StdinAudioSource + PyAudioPlayer + - _WITH_PYDUB = True -except ImportError: - _WITH_PYDUB = False +Function summary +================ -try: - from tqdm import tqdm as _tqdm +.. 
autosummary:: - DEFAULT_BAR_FORMAT_TQDM = "|" + "{bar}" + "|" + "[{elapsed}/{duration}]" - DEFAULT_NCOLS_TQDM = 30 - DEFAULT_NCOLS_TQDM = 30 - DEFAULT_MIN_INTERVAL_TQDM = 0.05 - _WITH_TQDM = True -except ImportError: - _WITH_TQDM = False + from_file + player_for +""" +from abc import ABCMeta, abstractmethod +import wave +import sys -__all__ = [ - "AudioSource", - "Rewindable", - "BufferAudioSource", - "RawAudioSource", - "WaveAudioSource", - "PyAudioSource", - "StdinAudioSource", - "PyAudioPlayer", - "from_file", - "to_file", - "player_for", -] +__all__ = ["AudioSource", "Rewindable", "BufferAudioSource", "WaveAudioSource", + "PyAudioSource", "StdinAudioSource", "PyAudioPlayer", "from_file", "player_for"] -DEFAULT_SAMPLING_RATE = 16000 +DEFAULT_SAMPLE_RATE = 16000 DEFAULT_SAMPLE_WIDTH = 2 DEFAULT_NB_CHANNELS = 1 -def check_audio_data(data, sample_width, channels): - sample_size_bytes = int(sample_width * channels) - nb_samples = len(data) // sample_size_bytes - if nb_samples * sample_size_bytes != len(data): - raise AudioParameterError( - "The length of audio data must be an integer " - "multiple of `sample_width * channels`" - ) - - -def _guess_audio_format(fmt, filename): - if fmt is None: - extension = os.path.splitext(filename.lower())[1][1:] - if extension: - fmt = extension - else: - return None - fmt = fmt.lower() - if fmt == "wave": - fmt = "wav" - return fmt - - -def _get_audio_parameters(param_dict): - """ - Get audio parameters from a dictionary of parameters. An audio parameter can - have a long name or a short name. If the long name is present, the short - name will be ignored. If neither is present then `AudioParameterError` is - raised. - - Expected parameters are: - - - `sampling_rate`, `sr` : int, sampling rate. - - - `sample_width`, `sw` : int, sample size in bytes. - - - `channels`, `ch` : int, number of channels. - - Returns - ------- - audio_parameters : tuple - a tuple for audio parameters as (sampling_rate, sample_width, channels). - """ - err_message = ( - "'{ln}' (or '{sn}') must be a positive integer, found: '{val}'" - ) - parameters = [] - for (long_name, short_name) in ( - ("sampling_rate", "sr"), - ("sample_width", "sw"), - ("channels", "ch"), - ): - param = param_dict.get(long_name, param_dict.get(short_name)) - if param is None or not isinstance(param, int) or param <= 0: - raise AudioParameterError( - err_message.format(ln=long_name, sn=short_name, val=param) - ) - parameters.append(param) - sampling_rate, sample_width, channels = parameters - return sampling_rate, sample_width, channels - - -class AudioSource(ABC): - """ +class AudioSource(): + """ Base class for audio source objects. - - Subclasses should implement methods to open/close and audio stream + + Subclasses should implement methods to open/close and audio stream and read the desired amount of audio samples. - - Parameters - ---------- - sampling_rate : int - number of samples per second of audio data. - sample_width : int - size in bytes of one audio sample. Possible values: 1, 2 or 4. - channels : int - number of channels of audio data. - """ - - def __init__( - self, sampling_rate, sample_width, channels, - ): - - if sample_width not in (1, 2, 4): - raise AudioParameterError( - "Sample width must be one of: 1, 2 or 4 (bytes)" - ) - - self._sampling_rate = sampling_rate - self._sample_width = sample_width - self._channels = channels - + + :Parameters: + + `sampling_rate` : int + Number of samples per second of audio stream. Default = 16000. 
+ + `sample_width` : int + Size in bytes of one audio sample. Possible values : 1, 2, 4. + Default = 2. + + `channels` : int + Number of channels of audio stream. The current version supports + only mono audio streams (i.e. one channel). + """ + + __metaclass__ = ABCMeta + + def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE, + sample_width = DEFAULT_SAMPLE_WIDTH, + channels = DEFAULT_NB_CHANNELS): + + if not sample_width in (1, 2, 4): + raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)") + + if channels != 1: + raise ValueError("Only mono audio is currently handled") + + self.sampling_rate = sampling_rate + self.sample_width = sample_width + self.channels = channels + @abstractmethod def is_open(self): - """Return True if audio source is open, False otherwise.""" - + """ Return True if audio source is open, False otherwise """ + @abstractmethod def open(self): - """Open audio source.""" - + """ Open audio source """ + @abstractmethod def close(self): - """Close audio source.""" - + """ Close audio source """ + @abstractmethod def read(self, size): """ Read and return `size` audio samples at most. - - Parameters - ----------- - size : int - Number of samples to read. - - Returns - ------- - data : bytes - Audio data as a bytes object of length `N * sample_width * channels` - where `N` equals: - - - `size` if `size` <= remaining samples - - - remaining samples if `size` > remaining samples - """ - - @property - def sampling_rate(self): - """Number of samples per second of audio stream.""" - return self._sampling_rate - - @property - def sr(self): - """Number of samples per second of audio stream (alias for - `sampling_rate)`.""" - return self._sampling_rate - - @property - def sample_width(self): - """Number of bytes used to represent one audio sample.""" - return self._sample_width - - @property - def sw(self): - """Number of bytes used to represent one audio sample (alias for - `sample_width`).""" - return self._sample_width - - @property - def channels(self): - """Number of channels in audio stream.""" - return self._channels - - @property - def ch(self): - """Number of channels in audio stream (alias for `channels`).""" + + :Parameters: + + `size` : int + the number of samples to read. + + :Returns: + + Audio data as a string of length 'N' * 'smaple_width' * 'channels', where 'N' is: + + - `size` if `size` < 'left_samples' + + - 'left_samples' if `size` > 'left_samples' + + """ + + def get_sampling_rate(self): + """ Return the number of samples per second of audio stream """ + return self.sampling_rate + + def get_sample_width(self): + """ Return the number of bytes used to represent one audio sample """ + return self.sample_width + + def get_channels(self): + """ Return the number of channels of this audio source """ return self.channels + -class Rewindable(AudioSource): +class Rewindable(): """ Base class for rewindable audio streams. - - Subclasses should implement a method to return back to the start of an the - stream (`rewind`), as well as a property getter/setter named `position` that - reads/sets stream position expressed in number of samples. + Subclasses should implement methods to return to the beginning of an + audio stream as well as method to move to an absolute audio position + expressed in time or in number of samples. 
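# Editor's note: a minimal, hypothetical subclass sketch showing the
# interface described above; `SilenceSource` is not part of auditok. It
# follows the reverted convention that read(size) returns bytes, or None
# at end of stream, and raises IOError when the source is not open.
from auditok.io import AudioSource

class SilenceSource(AudioSource):
    """Serve `total` samples of silence, then None."""

    def __init__(self, total, sampling_rate=16000, sample_width=2, channels=1):
        AudioSource.__init__(self, sampling_rate, sample_width, channels)
        self._left = total
        self._open = False

    def is_open(self):
        return self._open

    def open(self):
        self._open = True

    def close(self):
        self._open = False

    def read(self, size):
        if not self._open:
            raise IOError("Stream is not open")
        if self._left <= 0:
            return None
        n = min(size, self._left)
        self._left -= n
        return b"\x00" * (n * self.sample_width * self.channels)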
""" - + + __metaclass__ = ABCMeta + @abstractmethod def rewind(self): - """Go back to the beginning of audio stream.""" - - @property + """ Go back to the beginning of audio stream """ + pass + @abstractmethod - def position(self): - """Return stream position in number of samples.""" - - @position.setter + def get_position(self): + """ Return the total number of already read samples """ + @abstractmethod - def position(self, position): - """Set stream position in number of samples.""" - - @property - def position_s(self): - """Return stream position in seconds.""" - return self.position / self.sampling_rate - - @position_s.setter - def position_s(self, position_s): - """Set stream position in seconds.""" - self.position = int(self.sampling_rate * position_s) - - @property - def position_ms(self): - """Return stream position in milliseconds.""" - return (self.position * 1000) // self.sampling_rate - - @position_ms.setter - def position_ms(self, position_ms): - """Set stream position in milliseconds.""" - if not isinstance(position_ms, int): - raise ValueError("position_ms should be an int") - self.position = int(self.sampling_rate * position_ms / 1000) + def get_time_position(self): + """ Return the total duration in seconds of already read data """ + + @abstractmethod + def set_position(self, position): + """ Move to an absolute position + + :Parameters: + + `position` : int + number of samples to skip from the start of the stream + """ + + @abstractmethod + def set_time_position(self, time_position): + """ Move to an absolute position expressed in seconds + + :Parameters: + + `time_position` : float + seconds to skip from the start of the stream + """ + pass + -class BufferAudioSource(Rewindable): +class BufferAudioSource(AudioSource, Rewindable): """ - An `AudioSource` that encapsulates and reads data from a memory buffer. - - This class implements the `Rewindable` interface. - Parameters - ---------- - data : bytes - audio data - sampling_rate : int, default: 16000 - number of samples per second of audio data. - sample_width : int, default: 2 - size in bytes of one audio sample. Possible values: 1, 2 or 4. - channels : int, default: 1 - number of channels of audio data. + An :class:`AudioSource` that encapsulates and reads data from a memory buffer. + It implements methods from :class:`Rewindable` and is therefore a navigable :class:`AudioSource`. 
""" - - def __init__( - self, data, sampling_rate=16000, sample_width=2, channels=1, - ): + + def __init__(self, data_buffer, + sampling_rate = DEFAULT_SAMPLE_RATE, + sample_width = DEFAULT_SAMPLE_WIDTH, + channels = DEFAULT_NB_CHANNELS): + + if len(data_buffer) % (sample_width * channels) !=0: + raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)") + AudioSource.__init__(self, sampling_rate, sample_width, channels) - check_audio_data(data, sample_width, channels) - self._data = data - self._sample_size_all_channels = sample_width * channels - self._current_position_bytes = 0 + self._buffer = data_buffer + self._index = 0 + self._left = 0 if self._buffer is None else len(self._buffer) self._is_open = False - + def is_open(self): return self._is_open - + def open(self): self._is_open = True - + def close(self): self._is_open = False self.rewind() - + def read(self, size): if not self._is_open: - raise AudioIOError("Stream is not open") - if size is None or size < 0: - offset = None - else: - bytes_to_read = self._sample_size_all_channels * size - offset = self._current_position_bytes + bytes_to_read - data = self._data[self._current_position_bytes : offset] - if data: - self._current_position_bytes += len(data) + raise IOError("Stream is not open") + + if self._left > 0: + + to_read = size * self.sample_width * self.channels + if to_read > self._left: + to_read = self._left + + data = self._buffer[self._index: self._index + to_read] + self._index += to_read + self._left -= to_read + return data + return None - - @property - def data(self): - """Get raw audio data as a `bytes` object.""" - return self._data - + + def get_data_buffer(self): + """ Return all audio data as one string buffer. """ + return self._buffer + + def set_data(self, data_buffer): + """ Set new data for this audio stream. 
+ + :Parameters: + + `data_buffer` : str, basestring, Bytes + a string buffer with a length multiple of (sample_width * channels) + """ + if len(data_buffer) % (self.sample_width * self.channels) !=0: + raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)") + self._buffer = data_buffer + self._index = 0 + self._left = 0 if self._buffer is None else len(self._buffer) + + def append_data(self, data_buffer): + """ Append data to this audio stream + + :Parameters: + + `data_buffer` : str, basestring, Bytes + a buffer with a length multiple of (sample_width * channels) + """ + + if len(data_buffer) % (self.sample_width * self.channels) !=0: + raise ValueError("length of data_buffer must be a multiple of (sample_width * channels)") + + self._buffer += data_buffer + self._left += len(data_buffer) + + def rewind(self): - self.position = 0 - - @property - def position(self): - """Get stream position in number of samples""" - return self._current_position_bytes // self._sample_size_all_channels - - @position.setter - def position(self, position): - """Set stream position in number of samples.""" - position *= self._sample_size_all_channels + self.set_position(0) + + def get_position(self): + return self._index / self.sample_width + + def get_time_position(self): + return float(self._index) / (self.sample_width * self.sampling_rate) + + def set_position(self, position): if position < 0: - position += len(self.data) - if position < 0 or position > len(self.data): - raise IndexError("Position out of range") - self._current_position_bytes = position + raise ValueError("position must be >= 0") + + if self._buffer is None: + self._index = 0 + self._left = 0 + return + + position *= self.sample_width + self._index = position if position < len(self._buffer) else len(self._buffer) + self._left = len(self._buffer) - self._index - @property - def position_ms(self): - """Get stream position in milliseconds.""" - return (self._current_position_bytes * 1000) // ( - self._sample_size_all_channels * self.sampling_rate - ) - @position_ms.setter - def position_ms(self, position_ms): - """Set stream position in milliseconds.""" - if not isinstance(position_ms, int): - raise ValueError("position_ms should be an int") - self.position = int(self.sampling_rate * position_ms / 1000) + def set_time_position(self, time_position): # time in seconds + position = int(self.sampling_rate * time_position) + self.set_position(position) -class FileAudioSource(AudioSource): - """ - Base class `AudioSource`s that read audio data from a file. - Parameters - ---------- - sampling_rate : int, default: 16000 - number of samples per second of audio data. - sample_width : int, default: 2 - size in bytes of one audio sample. Possible values: 1, 2 or 4. - channels : int, default: 1 - number of channels of audio data. +class WaveAudioSource(AudioSource): """ - - def __init__(self, sampling_rate, sample_width, channels): - AudioSource.__init__(self, sampling_rate, sample_width, channels) + A class for an `AudioSource` that reads data from a wave file. 
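# Editor's note: the position accessors above all reduce to the same
# arithmetic; a sketch assuming 16 kHz, 16-bit (2-byte) mono audio:
sampling_rate, sample_width = 16000, 2
byte_index = 64000                                    # _index into the buffer
assert byte_index / sample_width == 32000             # samples read so far
assert byte_index / float(sample_width * sampling_rate) == 2.0  # seconds
assert int(sampling_rate * 2.0) == 32000              # set_time_position(2.0)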
+ + :Parameters: + + `filename` : + path to a valid wave file + """ + + def __init__(self, filename): + + self._filename = filename self._audio_stream = None - - def __del__(self): - if self.is_open(): - self.close() - + + stream = wave.open(self._filename) + AudioSource.__init__(self, stream.getframerate(), + stream.getsampwidth(), + stream.getnchannels()) + stream.close() + + def is_open(self): return self._audio_stream is not None - + + def open(self): + if(self._audio_stream is None): + self._audio_stream = wave.open(self._filename) + + def close(self): if self._audio_stream is not None: self._audio_stream.close() self._audio_stream = None - - @abstractmethod - def _read_from_stream(self, size): - """Read data from stream""" - + + def read(self, size): - if not self.is_open(): - raise AudioIOError("Audio stream is not open") - data = self._read_from_stream(size) - if not data: - return None - return data - - -class RawAudioSource(FileAudioSource): - """ - A class for an `AudioSource` that reads data from a raw (headerless) audio - file. - - This class should be used for large raw audio files to avoid loading the - whole data to memory. - - Parameters - ---------- - filename : str - path to a raw audio file. - sampling_rate : int - Number of samples per second of audio data. - sample_width : int - Size in bytes of one audio sample. Possible values : 1, 2, 4. - channels : int - Number of channels of audio data. - """ - - def __init__(self, file, sampling_rate, sample_width, channels): - FileAudioSource.__init__(self, sampling_rate, sample_width, channels) - self._file = file - self._audio_stream = None - self._sample_size = sample_width * channels - - def open(self): if self._audio_stream is None: - self._audio_stream = open(self._file, "rb") - - def _read_from_stream(self, size): - if size is None or size < 0: - bytes_to_read = None + raise IOError("Stream is not open") else: - bytes_to_read = size * self._sample_size - data = self._audio_stream.read(bytes_to_read) - return data - - -class WaveAudioSource(FileAudioSource): - """ - A class for an `AudioSource` that reads data from a wave file. - - This class should be used for large wave files to avoid loading the whole - data to memory. - - Parameters - ---------- - filename : str - path to a valid wave file. - """ - - def __init__(self, filename): - self._filename = filename - self._audio_stream = None - stream = wave.open(self._filename, "rb") - FileAudioSource.__init__( - self, - stream.getframerate(), - stream.getsampwidth(), - stream.getnchannels(), - ) - stream.close() - - def open(self): - if self._audio_stream is None: - self._audio_stream = wave.open(self._filename) - - def _read_from_stream(self, size): - if size is None or size < 0: - size = -1 - return self._audio_stream.readframes(size) + data = self._audio_stream.readframes(size) + if data is None or len(data) < 1: + return None + return data class PyAudioSource(AudioSource): """ - A class for an `AudioSource` that reads data from built-in microphone using - PyAudio (https://people.csail.mit.edu/hubert/pyaudio/). - - Parameters - ---------- - sampling_rate : int, default: 16000 - number of samples per second of audio data. - sample_width : int, default: 2 - size in bytes of one audio sample. Possible values: 1, 2 or 4. - channels : int, default: 1 - number of channels of audio data. - frames_per_buffer : int, default: 1024 - PyAudio number of frames per buffer. - input_device_index: None or int, default: None - PyAudio index of audio device to read audio data from. 
If None default
-        device is used.
+    A class for an `AudioSource` that reads data from the built-in microphone using PyAudio.
     """
-
-    def __init__(
-        self,
-        sampling_rate=16000,
-        sample_width=2,
-        channels=1,
-        frames_per_buffer=1024,
-        input_device_index=None,
-    ):
-
+
+    def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE,
+                 sample_width = DEFAULT_SAMPLE_WIDTH,
+                 channels = DEFAULT_NB_CHANNELS,
+                 frames_per_buffer = 1024):
+
+
         AudioSource.__init__(self, sampling_rate, sample_width, channels)
         self._chunk_size = frames_per_buffer
-        self.input_device_index = input_device_index
-
+
         import pyaudio
-
         self._pyaudio_object = pyaudio.PyAudio()
-        self._pyaudio_format = self._pyaudio_object.get_format_from_width(
-            self.sample_width
-        )
+        self._pyaudio_format = self._pyaudio_object.get_format_from_width(self.sample_width)
         self._audio_stream = None
 
+
     def is_open(self):
         return self._audio_stream is not None
-
+
     def open(self):
-        self._audio_stream = self._pyaudio_object.open(
-            format=self._pyaudio_format,
-            channels=self.channels,
-            rate=self.sampling_rate,
-            input=True,
-            output=False,
-            input_device_index=self.input_device_index,
-            frames_per_buffer=self._chunk_size,
-        )
-
+        self._audio_stream = self._pyaudio_object.open(format = self._pyaudio_format,
+                                                       channels = self.channels,
+                                                       rate = self.sampling_rate,
+                                                       input = True,
+                                                       output = False,
+                                                       frames_per_buffer = self._chunk_size)
+
+
     def close(self):
         if self._audio_stream is not None:
             self._audio_stream.stop_stream()
             self._audio_stream.close()
             self._audio_stream = None
-
+
+
     def read(self, size):
         if self._audio_stream is None:
             raise IOError("Stream is not open")
+
         if self._audio_stream.is_active():
             data = self._audio_stream.read(size)
             if data is None or len(data) < 1:
                 return None
             return data
+
         return None
+
 
-
-class StdinAudioSource(FileAudioSource):
+class StdinAudioSource(AudioSource):
     """
-    A class for an `AudioSource` that reads data from standard input.
-
-    Parameters
-    ----------
-    sampling_rate : int, default: 16000
-        number of samples per second of audio data.
-    sample_width : int, default: 2
-        size in bytes of one audio sample. Possible values: 1, 2 or 4.
-    channels : int, default: 1
-        number of channels of audio data.
+    A class for an :class:`AudioSource` that reads data from standard input.
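+
+    Example -- a minimal usage sketch (an illustrative addition, not part of the
+    original module; the audio parameters here are assumptions and must match
+    whatever is actually piped in, e.g. ``cat audio.raw | python detect.py``):
+
+    .. code:: python
+
+        src = StdinAudioSource(sampling_rate=16000, sample_width=2, channels=1)
+        src.open()
+        # read 1024 samples, i.e. 1024 * 2 * 1 bytes, from standard input
+        data = src.read(1024)
+        src.close()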
""" - - def __init__( - self, sampling_rate=16000, sample_width=2, channels=1, - ): - FileAudioSource.__init__(self, sampling_rate, sample_width, channels) + + def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE, + sample_width = DEFAULT_SAMPLE_WIDTH, + channels = DEFAULT_NB_CHANNELS): + + AudioSource.__init__(self, sampling_rate, sample_width, channels) self._is_open = False - self._sample_size = sample_width * channels - self._stream = sys.stdin.buffer - + + def is_open(self): return self._is_open - + def open(self): self._is_open = True - + def close(self): self._is_open = False - - def _read_from_stream(self, size): - bytes_to_read = size * self._sample_size - data = self._stream.read(bytes_to_read) - if data: - return data - return None - - -def _make_tqdm_progress_bar(iterable, total, duration, **tqdm_kwargs): - fmt = tqdm_kwargs.get("bar_format", DEFAULT_BAR_FORMAT_TQDM) - fmt = fmt.replace("{duration}", "{:.3f}".format(duration)) - tqdm_kwargs["bar_format"] = fmt - - tqdm_kwargs["ncols"] = tqdm_kwargs.get("ncols", DEFAULT_NCOLS_TQDM) - tqdm_kwargs["mininterval"] = tqdm_kwargs.get( - "mininterval", DEFAULT_MIN_INTERVAL_TQDM - ) - return _tqdm(iterable, total=total, **tqdm_kwargs) - - -class PyAudioPlayer: + + def read(self, size): + if not self._is_open: + raise IOError("Stream is not open") + + to_read = size * self.sample_width * self.channels + data = sys.stdin.read(to_read) + + if data is None or len(data) < 1: + return None + + return data + + +class PyAudioPlayer(): """ A class for audio playback using Pyaudio - (https://people.csail.mit.edu/hubert/pyaudio/). - - Parameters - ---------- - sampling_rate : int, default: 16000 - number of samples per second of audio data. - sample_width : int, default: 2 - size in bytes of one audio sample. Possible values: 1, 2 or 4. - channels : int, default: 1 - number of channels of audio data. 
""" - - def __init__( - self, sampling_rate=16000, sample_width=2, channels=1, - ): - if sample_width not in (1, 2, 4): - raise ValueError("Sample width in bytes must be one of 1, 2 or 4") - + + def __init__(self, sampling_rate = DEFAULT_SAMPLE_RATE, + sample_width = DEFAULT_SAMPLE_WIDTH, + channels = DEFAULT_NB_CHANNELS): + if not sample_width in (1, 2, 4): + raise ValueError("Sample width must be one of: 1, 2 or 4 (bytes)") + self.sampling_rate = sampling_rate self.sample_width = sample_width self.channels = channels - + import pyaudio - self._p = pyaudio.PyAudio() - self.stream = self._p.open( - format=self._p.get_format_from_width(self.sample_width), - channels=self.channels, - rate=self.sampling_rate, - input=False, - output=True, - ) - - def play(self, data, progress_bar=False, **progress_bar_kwargs): - chunk_gen, nb_chunks = self._chunk_data(data) - if progress_bar and _WITH_TQDM: - duration = len(data) / ( - self.sampling_rate * self.sample_width * self.channels - ) - chunk_gen = _make_tqdm_progress_bar( - chunk_gen, - total=nb_chunks, - duration=duration, - **progress_bar_kwargs - ) + self.stream = self._p.open(format = self._p.get_format_from_width(self.sample_width), + channels = self.channels, rate = self.sampling_rate, + input = False, output = True) + + def play(self, data): if self.stream.is_stopped(): self.stream.start_stream() - try: - for chunk in chunk_gen: - self.stream.write(chunk) - except KeyboardInterrupt: - pass + + for chunk in self._chunk_data(data): + self.stream.write(chunk) + self.stream.stop_stream() - - def stop(self): + + def stop(self): if not self.stream.is_stopped(): self.stream.stop_stream() self.stream.close() self._p.terminate() - + def _chunk_data(self, data): # make audio chunks of 100 ms to allow interruption (like ctrl+c) - bytes_1_sec = self.sampling_rate * self.sample_width * self.channels - chunk_size = bytes_1_sec // 10 - # make sure chunk_size is a multiple of sample_width * channels - chunk_size -= chunk_size % (self.sample_width * self.channels) - nb_chunks, rest = divmod(len(data), chunk_size) - if rest > 0: - nb_chunks += 1 - chunk_gen = ( - data[i : i + chunk_size] for i in range(0, len(data), chunk_size) - ) - return chunk_gen, nb_chunks - - -def player_for(source): - """ - Return an `AudioPlayer` compatible with `source` (i.e., has the same - sampling rate, sample width and number of channels). - - Parameters - ---------- - source : AudioSource - An object that has `sampling_rate`, `sample_width` and `sample_width` - attributes. - - Returns - ------- - player : PyAudioPlayer - An audio player that has the same sampling rate, sample width - and number of channels as `source`. - """ - return PyAudioPlayer( - source.sampling_rate, source.sample_width, source.channels - ) - - -def get_audio_source(input=None, **kwargs): - """ - Create and return an AudioSource from input. - - Parameters - ---------- - input : str, bytes, "-" or None (default) - source to read audio data from. If `str`, it should be a path to a valid - audio file. If `bytes`, it is used as raw audio data. If it is "-", - raw data will be read from stdin. If None, read audio data from the - microphone using PyAudio. - kwargs - audio parameters used to build the `AudioSource` object. Depending on - the nature of `input`, theses may be omitted (e.g., when `input` is an - audio file in a popular audio format such as wav, ogg, flac, etc.) 
or - include parameters such as `sampling_rate`, `sample_width`, `channels` - (or their respective short name versions `sr`, `sw` and `ch`) if `input` - is a path to a raw (headerless) audio file, a bytes object for raw audio - data or None (to read data from built-in microphone). See the respective - `AudioSource` classes from more information about possible parameters. - - Returns - ------- - source : AudioSource - audio source created from input parameters - """ - if input == "-": - return StdinAudioSource(*_get_audio_parameters(kwargs)) + chunk_size = int((self.sampling_rate * self.sample_width * self.channels) / 10) + start = 0 + while start < len(data): + yield data[start : start + chunk_size] + start += chunk_size + + +def from_file(filename): + """ + Create an `AudioSource` object using the audio file specified by `filename`. + The appropriate :class:`AudioSource` class is guessed from file's extension. + + :Parameters: + + `filename` : + path to an audio file. + + :Returns: + + an `AudioSource` object that reads data from the given file. + + """ + + if filename.lower().endswith(".wav"): + return WaveAudioSource(filename) + + raise Exception("Can not create an AudioSource object from '%s'" %(filename)) + + +def player_for(audio_source): + """ + Return a :class:`PyAudioPlayer` that can play data from `audio_source`. + + :Parameters: + + `audio_source` : + an `AudioSource` object. + + :Returns: + + `PyAudioPlayer` that has the same sampling rate, sample width and number of channels + as `audio_source`. + """ + + return PyAudioPlayer(audio_source.get_sampling_rate(), + audio_source.get_sample_width(), + audio_source.get_channels()) + + - if isinstance(input, bytes): - return BufferAudioSource(input, *_get_audio_parameters(kwargs)) - - # read data from a file - if input is not None: - return from_file(filename=input, **kwargs) - - # read data from microphone via pyaudio - else: - frames_per_buffer = kwargs.get("frames_per_buffer", 1024) - input_device_index = kwargs.get("input_device_index") - return PyAudioSource( - *_get_audio_parameters(kwargs), - frames_per_buffer=frames_per_buffer, - input_device_index=input_device_index - ) - - -def _load_raw(file, sampling_rate, sample_width, channels, large_file=False): - """ - Load a raw audio file with standard Python. If `large_file` is True, return - a `RawAudioSource` object that reads data lazily from disk, otherwise load - all data to memory and return a `BufferAudioSource` object. - - Parameters - ---------- - file : str - path to a raw audio data file. - sampling_rate : int - sampling rate of audio data. - sample_width : int - size in bytes of one audio sample. - channels : int - number of channels of audio data. - large_file : bool - if True, return a `RawAudioSource` otherwise a `BufferAudioSource` - object. - - Returns - ------- - source : RawAudioSource or BufferAudioSource - an `AudioSource` that reads data from input file. - """ - if None in (sampling_rate, sample_width, channels): - raise AudioParameterError( - "All audio parameters are required for raw audio files" - ) - - if large_file: - return RawAudioSource( - file, - sampling_rate=sampling_rate, - sample_width=sample_width, - channels=channels, - ) - - with open(file, "rb") as fp: - data = fp.read() - return BufferAudioSource( - data, - sampling_rate=sampling_rate, - sample_width=sample_width, - channels=channels, - ) - - -def _load_wave(file, large_file=False): - """ - Load a wave audio file with standard Python. 
If `large_file` is True, return - a `WaveAudioSource` object that reads data lazily from disk, otherwise load - all data to memory and return a `BufferAudioSource` object. - - Parameters - ---------- - file : str - path to a wav audio data file - large_file : bool - if True, return a `WaveAudioSource` otherwise a `BufferAudioSource` - object. - - Returns - ------- - source : WaveAudioSource or BufferAudioSource - an `AudioSource` that reads data from input file. - """ - if large_file: - return WaveAudioSource(file) - with wave.open(file) as fp: - channels = fp.getnchannels() - srate = fp.getframerate() - swidth = fp.getsampwidth() - data = fp.readframes(-1) - return BufferAudioSource( - data, sampling_rate=srate, sample_width=swidth, channels=channels - ) - - -def _load_with_pydub(file, audio_format=None): - """ - Open compressed audio or video file using pydub. If a video file - is passed, its audio track(s) are extracted and loaded. - - Parameters - ---------- - file : str - path to audio file. - audio_format : str, default: None - string, audio/video file format if known (e.g. raw, webm, wav, ogg) - - Returns - ------- - source : BufferAudioSource - an `AudioSource` that reads data from input file. - """ - func_dict = { - "mp3": AudioSegment.from_mp3, - "ogg": AudioSegment.from_ogg, - "flv": AudioSegment.from_flv, - } - open_function = func_dict.get(audio_format, AudioSegment.from_file) - segment = open_function(file) - return BufferAudioSource( - data=segment.raw_data, - sampling_rate=segment.frame_rate, - sample_width=segment.sample_width, - channels=segment.channels, - ) - - -def from_file(filename, audio_format=None, large_file=False, **kwargs): - """ - Read audio data from `filename` and return an `AudioSource` object. - if `audio_format` is None, the appropriate `AudioSource` class is guessed - from file's extension. `filename` can be a compressed audio or video file. - This will require installing `pydub` (https://github.com/jiaaro/pydub). - - The normal behavior is to load all audio data to memory from which a - :class:`BufferAudioSource` object is created. This should be convenient - most of the time unless audio file is very large. In that case, and - in order to load audio data in lazy manner (i.e. read data from disk each - time :func:`AudioSource.read` is called), `large_file` should be True. - - Note that the current implementation supports only wave and raw formats for - lazy audio loading. - - If an audio format is `raw`, the following keyword arguments are required: - - - `sampling_rate`, `sr`: int, sampling rate of audio data. - - `sample_width`, `sw`: int, size in bytes of one audio sample. - - `channels`, `ch`: int, number of channels of audio data. - - See also - -------- - :func:`to_file`. - - Parameters - ---------- - filename : str - path to input audio or video file. - audio_format : str - audio format used to save data (e.g. raw, webm, wav, ogg). - large_file : bool, default: False - if True, audio won't fully be loaded to memory but only when a window - is read from disk. - - - Other Parameters - ---------------- - sampling_rate, sr: int - sampling rate of audio data - sample_width : int - sample width (i.e. number of bytes used to represent one audio sample) - channels : int - number of channels of audio data - - Returns - ------- - audio_source : AudioSource - an :class:`AudioSource` object that reads data from input file. 
- - Raises - ------ - `AudioIOError` - raised if audio data cannot be read in the given - format or if `format` is `raw` and one or more audio parameters are missing. - """ - audio_format = _guess_audio_format(audio_format, filename) - - if audio_format == "raw": - srate, swidth, channels = _get_audio_parameters(kwargs) - return _load_raw(filename, srate, swidth, channels, large_file) - - if audio_format in ["wav", "wave"]: - return _load_wave(filename, large_file) - if large_file: - err_msg = "if 'large_file` is True file format should be raw or wav" - raise AudioIOError(err_msg) - if _WITH_PYDUB: - return _load_with_pydub(filename, audio_format=audio_format) - else: - raise AudioIOError( - "pydub is required for audio formats other than raw or wav" - ) - - -def _save_raw(data, file): - """ - Saves audio data as a headerless (i.e. raw) file. - See also :func:`to_file`. - """ - with open(file, "wb") as fp: - fp.write(data) - - -def _save_wave(data, file, sampling_rate, sample_width, channels): - """ - Saves audio data to a wave file. - See also :func:`to_file`. - """ - if None in (sampling_rate, sample_width, channels): - raise AudioParameterError( - "All audio parameters are required to save wave audio files" - ) - with wave.open(file, "w") as fp: - fp.setframerate(sampling_rate) - fp.setsampwidth(sample_width) - fp.setnchannels(channels) - fp.writeframes(data) - - -def _save_with_pydub( - data, file, audio_format, sampling_rate, sample_width, channels -): - """ - Saves audio data with pydub (https://github.com/jiaaro/pydub). - See also :func:`to_file`. - """ - segment = AudioSegment( - data, - frame_rate=sampling_rate, - sample_width=sample_width, - channels=channels, - ) - with open(file, "wb") as fp: - segment.export(fp, format=audio_format) - - -def to_file(data, file, audio_format=None, **kwargs): - """ - Writes audio data to file. If `audio_format` is `None`, output - audio format will be guessed from extension. If `audio_format` - is `None` and `file` comes without an extension then audio - data will be written as a raw audio file. - - Parameters - ---------- - data : bytes-like - audio data to be written. Can be a `bytes`, `bytearray`, - `memoryview`, `array` or `numpy.ndarray` object. - file : str - path to output audio file. - audio_format : str - audio format used to save data (e.g. raw, webm, wav, ogg) - kwargs: dict - If an audio format other than `raw` is used, the following keyword - arguments are required: - - - `sampling_rate`, `sr`: int, sampling rate of audio data. - - `sample_width`, `sw`: int, size in bytes of one audio sample. - - `channels`, `ch`: int, number of channels of audio data. - - Raises - ------ - `AudioParameterError` if output format is different than raw and one or more - audio parameters are missing. `AudioIOError` if audio data cannot be written - in the desired format. - """ - audio_format = _guess_audio_format(audio_format, file) - if audio_format in (None, "raw"): - _save_raw(data, file) - return - try: - sampling_rate, sample_width, channels = _get_audio_parameters(kwargs) - except AudioParameterError as exc: - err_message = "All audio parameters are required to save formats " - "other than raw. 
Error detail: {}".format(exc) - raise AudioParameterError(err_message) - if audio_format in ("wav", "wave"): - _save_wave(data, file, sampling_rate, sample_width, channels) - elif _WITH_PYDUB: - _save_with_pydub( - data, file, audio_format, sampling_rate, sample_width, channels - ) - else: - err_message = "cannot write file format {} (file name: {})" - raise AudioIOError(err_message.format(audio_format, file)) diff --git a/libs/auditok/plotting.py b/libs/auditok/plotting.py deleted file mode 100755 index eca5877f4..000000000 --- a/libs/auditok/plotting.py +++ /dev/null @@ -1,150 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np - -AUDITOK_PLOT_THEME = { - "figure": {"facecolor": "#482a36", "alpha": 0.2}, - "plot": {"facecolor": "#282a36"}, - "energy_threshold": { - "color": "#e31f8f", - "linestyle": "--", - "linewidth": 1, - }, - "signal": {"color": "#40d970", "linestyle": "-", "linewidth": 1}, - "detections": { - "facecolor": "#777777", - "edgecolor": "#ff8c1a", - "linewidth": 1, - "alpha": 0.75, - }, -} - - -def _make_time_axis(nb_samples, sampling_rate): - sample_duration = 1 / sampling_rate - x = np.linspace(0, sample_duration * (nb_samples - 1), nb_samples) - return x - - -def _plot_line(x, y, theme, xlabel=None, ylabel=None, **kwargs): - color = theme.get("color", theme.get("c")) - ls = theme.get("linestyle", theme.get("ls")) - lw = theme.get("linewidth", theme.get("lw")) - plt.plot(x, y, c=color, ls=ls, lw=lw, **kwargs) - plt.xlabel(xlabel, fontsize=8) - plt.ylabel(ylabel, fontsize=8) - - -def _plot_detections(subplot, detections, theme): - fc = theme.get("facecolor", theme.get("fc")) - ec = theme.get("edgecolor", theme.get("ec")) - ls = theme.get("linestyle", theme.get("ls")) - lw = theme.get("linewidth", theme.get("lw")) - alpha = theme.get("alpha") - for (start, end) in detections: - subplot.axvspan(start, end, fc=fc, ec=ec, ls=ls, lw=lw, alpha=alpha) - - -def plot( - audio_region, - scale_signal=True, - detections=None, - energy_threshold=None, - show=True, - figsize=None, - save_as=None, - dpi=120, - theme="auditok", -): - y = np.asarray(audio_region) - if len(y.shape) == 1: - y = y.reshape(1, -1) - nb_subplots, nb_samples = y.shape - sampling_rate = audio_region.sampling_rate - time_axis = _make_time_axis(nb_samples, sampling_rate) - if energy_threshold is not None: - eth_log10 = energy_threshold * np.log(10) / 10 - amplitude_threshold = np.sqrt(np.exp(eth_log10)) - else: - amplitude_threshold = None - if detections is None: - detections = [] - else: - # End of detection corresponds to the end of the last sample but - # to stay compatible with the time axis of signal plotting we want end - # of detection to correspond to the *start* of the that last sample. 
- detections = [ - (start, end - (1 / sampling_rate)) for (start, end) in detections - ] - if theme == "auditok": - theme = AUDITOK_PLOT_THEME - - fig = plt.figure(figsize=figsize, dpi=dpi) - fig_theme = theme.get("figure", theme.get("fig", {})) - fig_fc = fig_theme.get("facecolor", fig_theme.get("ffc")) - fig_alpha = fig_theme.get("alpha", 1) - fig.patch.set_facecolor(fig_fc) - fig.patch.set_alpha(fig_alpha) - - plot_theme = theme.get("plot", {}) - plot_fc = plot_theme.get("facecolor", plot_theme.get("pfc")) - - if nb_subplots > 2 and nb_subplots % 2 == 0: - nb_rows = nb_subplots // 2 - nb_columns = 2 - else: - nb_rows = nb_subplots - nb_columns = 1 - - for sid, samples in enumerate(y, 1): - ax = fig.add_subplot(nb_rows, nb_columns, sid) - ax.set_facecolor(plot_fc) - if scale_signal: - std = samples.std() - if std > 0: - mean = samples.mean() - std = samples.std() - samples = (samples - mean) / std - max_ = samples.max() - plt.ylim(-1.5 * max_, 1.5 * max_) - if amplitude_threshold is not None: - if scale_signal and std > 0: - amp_th = (amplitude_threshold - mean) / std - else: - amp_th = amplitude_threshold - eth_theme = theme.get("energy_threshold", theme.get("eth", {})) - _plot_line( - [time_axis[0], time_axis[-1]], - [amp_th] * 2, - eth_theme, - label="Detection threshold", - ) - if sid == 1: - legend = plt.legend( - ["Detection threshold"], - facecolor=fig_fc, - framealpha=0.1, - bbox_to_anchor=(0.0, 1.15, 1.0, 0.102), - loc=2, - ) - legend = plt.gca().add_artist(legend) - - signal_theme = theme.get("signal", {}) - _plot_line( - time_axis, - samples, - signal_theme, - xlabel="Time (seconds)", - ylabel="Signal{}".format(" (scaled)" if scale_signal else ""), - ) - detections_theme = theme.get("detections", {}) - _plot_detections(ax, detections, detections_theme) - plt.title("Channel {}".format(sid), fontsize=10) - - plt.xticks(fontsize=8) - plt.yticks(fontsize=8) - plt.tight_layout() - - if save_as is not None: - plt.savefig(save_as, dpi=dpi) - if show: - plt.show() diff --git a/libs/auditok/signal.py b/libs/auditok/signal.py deleted file mode 100644 index 3f00fb9e5..000000000 --- a/libs/auditok/signal.py +++ /dev/null @@ -1,179 +0,0 @@ -""" -Module for basic audio signal processing and array operations. - -.. autosummary:: - :toctree: generated/ - - to_array - extract_single_channel - compute_average_channel - compute_average_channel_stereo - separate_channels - calculate_energy_single_channel - calculate_energy_multichannel -""" -from array import array as array_ -import audioop -import math - -FORMAT = {1: "b", 2: "h", 4: "i"} -_EPSILON = 1e-10 - - -def to_array(data, sample_width, channels): - """Extract individual channels of audio data and return a list of arrays of - numeric samples. This will always return a list of `array.array` objects - (one per channel) even if audio data is mono. - - Parameters - ---------- - data : bytes - raw audio data. - sample_width : int - size in bytes of one audio sample (one channel considered). - - Returns - ------- - samples_arrays : list - list of arrays of audio samples. - """ - fmt = FORMAT[sample_width] - if channels == 1: - return [array_(fmt, data)] - return separate_channels(data, fmt, channels) - - -def extract_single_channel(data, fmt, channels, selected): - samples = array_(fmt, data) - return samples[selected::channels] - - -def compute_average_channel(data, fmt, channels): - """ - Compute and return average channel of multi-channel audio data. 
If the - number of channels is 2, use :func:`compute_average_channel_stereo` (much - faster). This function uses satandard `array` module to convert `bytes` data - into an array of numeric values. - - Parameters - ---------- - data : bytes - multi-channel audio data to mix down. - fmt : str - format (single character) to pass to `array.array` to convert `data` - into an array of samples. This should be "b" if audio data's sample width - is 1, "h" if it's 2 and "i" if it's 4. - channels : int - number of channels of audio data. - - Returns - ------- - mono_audio : bytes - mixed down audio data. - """ - all_channels = array_(fmt, data) - mono_channels = [ - array_(fmt, all_channels[ch::channels]) for ch in range(channels) - ] - avg_arr = array_( - fmt, - (round(sum(samples) / channels) for samples in zip(*mono_channels)), - ) - return avg_arr - - -def compute_average_channel_stereo(data, sample_width): - """Compute and return average channel of stereo audio data. This function - should be used when the number of channels is exactly 2 because in that - case we can use standard `audioop` module which *much* faster then calling - :func:`compute_average_channel`. - - Parameters - ---------- - data : bytes - 2-channel audio data to mix down. - sample_width : int - size in bytes of one audio sample (one channel considered). - - Returns - ------- - mono_audio : bytes - mixed down audio data. - """ - fmt = FORMAT[sample_width] - arr = array_(fmt, audioop.tomono(data, sample_width, 0.5, 0.5)) - return arr - - -def separate_channels(data, fmt, channels): - """Create a list of arrays of audio samples (`array.array` objects), one for - each channel. - - Parameters - ---------- - data : bytes - multi-channel audio data to mix down. - fmt : str - format (single character) to pass to `array.array` to convert `data` - into an array of samples. This should be "b" if audio data's sample width - is 1, "h" if it's 2 and "i" if it's 4. - channels : int - number of channels of audio data. - - Returns - ------- - channels_arr : list - list of audio channels, each as a standard `array.array`. - """ - all_channels = array_(fmt, data) - mono_channels = [ - array_(fmt, all_channels[ch::channels]) for ch in range(channels) - ] - return mono_channels - - -def calculate_energy_single_channel(data, sample_width): - """Calculate the energy of mono audio data. Energy is computed as: - - .. math:: energy = 20 \log(\sqrt({1}/{N}\sum_{i}^{N}{a_i}^2)) % # noqa: W605 - - where `a_i` is the i-th audio sample and `N` is the number of audio samples - in data. - - Parameters - ---------- - data : bytes - single-channel audio data. - sample_width : int - size in bytes of one audio sample. - - Returns - ------- - energy : float - energy of audio signal. - """ - energy_sqrt = max(audioop.rms(data, sample_width), _EPSILON) - return 20 * math.log10(energy_sqrt) - - -def calculate_energy_multichannel(x, sample_width, aggregation_fn=max): - """Calculate the energy of multi-channel audio data. Energy is calculated - channel-wise. An aggregation function is applied to the resulting energies - (default: `max`). Also see :func:`calculate_energy_single_channel`. - - Parameters - ---------- - data : bytes - single-channel audio data. - sample_width : int - size in bytes of one audio sample (one channel considered). - aggregation_fn : callable, default: max - aggregation function to apply to the resulting per-channel energies. - - Returns - ------- - energy : float - aggregated energy of multi-channel audio signal. 
- """ - energies = (calculate_energy_single_channel(xi, sample_width) for xi in x) - return aggregation_fn(energies) diff --git a/libs/auditok/signal_numpy.py b/libs/auditok/signal_numpy.py deleted file mode 100644 index bf5425197..000000000 --- a/libs/auditok/signal_numpy.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy as np -from .signal import ( - compute_average_channel_stereo, - calculate_energy_single_channel, - calculate_energy_multichannel, -) - -FORMAT = {1: np.int8, 2: np.int16, 4: np.int32} - - -def to_array(data, sample_width, channels): - fmt = FORMAT[sample_width] - if channels == 1: - return np.frombuffer(data, dtype=fmt).astype(np.float64) - return separate_channels(data, fmt, channels).astype(np.float64) - - -def extract_single_channel(data, fmt, channels, selected): - samples = np.frombuffer(data, dtype=fmt) - return np.asanyarray(samples[selected::channels], order="C") - - -def compute_average_channel(data, fmt, channels): - array = np.frombuffer(data, dtype=fmt).astype(np.float64) - return array.reshape(-1, channels).mean(axis=1).round().astype(fmt) - - -def separate_channels(data, fmt, channels): - array = np.frombuffer(data, dtype=fmt) - return np.asanyarray(array.reshape(-1, channels).T, order="C") diff --git a/libs/auditok/util.py b/libs/auditok/util.py index f29eb9bf3..d46a8899c 100644 --- a/libs/auditok/util.py +++ b/libs/auditok/util.py @@ -1,624 +1,448 @@ """ -.. autosummary:: - :toctree: generated/ - - AudioEnergyValidator - AudioReader - Recorder - make_duration_formatter - make_channel_selector -""" -from abc import ABC, abstractmethod -import warnings -from functools import partial -from .io import ( - AudioIOError, - AudioSource, - from_file, - BufferAudioSource, - PyAudioSource, - get_audio_source, -) -from .exceptions import ( - DuplicateArgument, - TooSamllBlockDuration, - TimeFormatError, -) - -try: - from . import signal_numpy as signal -except ImportError: - from . import signal - - -__all__ = [ - "make_duration_formatter", - "make_channel_selector", - "DataSource", - "DataValidator", - "StringDataSource", - "ADSFactory", - "AudioDataSource", - "AudioReader", - "Recorder", - "AudioEnergyValidator", -] - - -def make_duration_formatter(fmt): - """ - Make and return a function used to format durations in seconds. Accepted - format directives are: - - - ``%S`` : absolute number of seconds with 3 decimals. This direction should - be used alone. - - ``%i`` : milliseconds - - ``%s`` : seconds - - ``%m`` : minutes - - ``%h`` : hours - - These last 4 directives should all be specified. They can be placed anywhere - in the input string. - - Parameters - ---------- - fmt : str - duration format. - - Returns - ------- - formatter : callable - a function that takes a duration in seconds (float) and returns a string - that corresponds to that duration. - - Raises - ------ - TimeFormatError - if the format contains an unknown directive. - - Examples - -------- - - Using ``%S``: - - .. code:: python - - formatter = make_duration_formatter("%S") - formatter(123.589) - '123.589' - formatter(123) - '123.000' - - Using the other directives: - - .. 
code:: python - - formatter = make_duration_formatter("%h:%m:%s.%i") - formatter(3600+120+3.25) - '01:02:03.250' - - formatter = make_duration_formatter("%h hrs, %m min, %s sec and %i ms") - formatter(3600+120+3.25) - '01 hrs, 02 min, 03 sec and 250 ms' - - # omitting one of the 4 directives might result in a wrong duration - formatter = make_duration_formatter("%m min, %s sec and %i ms") - formatter(3600+120+3.25) - '02 min, 03 sec and 250 ms' - """ - if fmt == "%S": - - def fromatter(seconds): - return "{:.3f}".format(seconds) - - elif fmt == "%I": - - def fromatter(seconds): - return "{0}".format(int(seconds * 1000)) - - else: - fmt = fmt.replace("%h", "{hrs:02d}") - fmt = fmt.replace("%m", "{mins:02d}") - fmt = fmt.replace("%s", "{secs:02d}") - fmt = fmt.replace("%i", "{millis:03d}") - try: - i = fmt.index("%") - raise TimeFormatError( - "Unknown time format directive '{0}'".format(fmt[i : i + 2]) - ) - except ValueError: - pass - - def fromatter(seconds): - millis = int(seconds * 1000) - hrs, millis = divmod(millis, 3600000) - mins, millis = divmod(millis, 60000) - secs, millis = divmod(millis, 1000) - return fmt.format(hrs=hrs, mins=mins, secs=secs, millis=millis) - - return fromatter - - -def make_channel_selector(sample_width, channels, selected=None): - """Create and return a callable used for audio channel selection. The - returned selector can be used as `selector(audio_data)` and returns data - that contains selected channel only. - - Importantly, if `selected` is None or equals "any", `selector(audio_data)` - will separate and return a list of available channels: - `[data_channe_1, data_channe_2, ...].` - - Note also that returned `selector` expects `bytes` format for input data but - does notnecessarily return a `bytes` object. In fact, in order to extract - the desired channel (or compute the average channel if `selected` = "avg"), - it first converts input data into a `array.array` (or `numpy.ndarray`) - object. After channel of interst is selected/computed, it is returned as - such, without any reconversion to `bytes`. This behavior is wanted for - efficiency purposes because returned objects can be directly used as buffers - of bytes. In any case, returned objects can be converted back to `bytes` - using `bytes(obj)`. +Class summary +============= - Exception to this is the special case where `channels` = 1 in which input - data is returned without any processing. - - - Parameters - ---------- - sample_width : int - number of bytes used to encode one audio sample, should be 1, 2 or 4. - channels : int - number of channels of raw audio data that the returned selector should - expect. - selected : int or str, default: None - audio channel to select and return when calling `selector(raw_data)`. It - should be an int >= `-channels` and < `channels`. If one of "mix", - "avg" or "average" is passed then `selector` will return the average - channel of audio data. If None or "any", return a list of all available - channels at each call. - - Returns - ------- - selector : callable - a callable that can be used as `selector(audio_data)` and returns data - that contains channel of interst. - - Raises - ------ - ValueError - if `sample_width` is not one of 1, 2 or 4, or if `selected` has an - unexpected value. 
- """ - fmt = signal.FORMAT.get(sample_width) - if fmt is None: - err_msg = "'sample_width' must be 1, 2 or 4, given: {}" - raise ValueError(err_msg.format(sample_width)) - if channels == 1: - return lambda x: x - - if isinstance(selected, int): - if selected < 0: - selected += channels - if selected < 0 or selected >= channels: - err_msg = "Selected channel must be >= -channels and < channels" - err_msg += ", given: {}" - raise ValueError(err_msg.format(selected)) - return partial( - signal.extract_single_channel, - fmt=fmt, - channels=channels, - selected=selected, - ) +.. autosummary:: - if selected in ("mix", "avg", "average"): - if channels == 2: - # when data is stereo, using audioop when possible is much faster - return partial( - signal.compute_average_channel_stereo, - sample_width=sample_width, - ) + DataSource + StringDataSource + ADSFactory + ADSFactory.AudioDataSource + ADSFactory.ADSDecorator + ADSFactory.OverlapADS + ADSFactory.LimiterADS + ADSFactory.RecorderADS + DataValidator + AudioEnergyValidator - return partial( - signal.compute_average_channel, fmt=fmt, channels=channels - ) +""" - if selected in (None, "any"): - return partial(signal.separate_channels, fmt=fmt, channels=channels) - raise ValueError( - "Selected channel must be an integer, None (alias 'any') or 'average' " - "(alias 'avg' or 'mix')" - ) +from abc import ABCMeta, abstractmethod +import math +from array import array +from .io import Rewindable, from_file, BufferAudioSource, PyAudioSource +from .exceptions import DuplicateArgument +import sys -class DataSource(ABC): +try: + import numpy + _WITH_NUMPY = True +except ImportError as e: + _WITH_NUMPY = False + +try: + from builtins import str + basestring = str +except ImportError as e: + if sys.version_info >= (3, 0): + basestring = str + + + +__all__ = ["DataSource", "DataValidator", "StringDataSource", "ADSFactory", "AudioEnergyValidator"] + + +class DataSource(): """ - Base class for objects passed to :func:`StreamTokenizer.tokenize`. + Base class for objects passed to :func:`auditok.core.StreamTokenizer.tokenize`. Subclasses should implement a :func:`DataSource.read` method. """ - + __metaclass__ = ABCMeta + @abstractmethod def read(self): """ - Read a block (i.e., window) of data read from this source. + Read a piece of data read from this source. If no more data is available, return None. """ - - -class DataValidator(ABC): + + +class DataValidator(): """ - Base class for a validator object used by :class:`.core.StreamTokenizer` - to check if read data is valid. + Base class for a validator object used by :class:`.core.StreamTokenizer` to check + if read data is valid. Subclasses should implement :func:`is_valid` method. """ - + __metaclass__ = ABCMeta + @abstractmethod def is_valid(self, data): """ Check whether `data` is valid """ - -class AudioEnergyValidator(DataValidator): - """ - A validator based on audio signal energy. For an input window of `N` audio - samples (see :func:`AudioEnergyValidator.is_valid`), the energy is computed - as: - - .. math:: energy = 20 \log(\sqrt({1}/{N}\sum_{i}^{N}{a_i}^2)) % # noqa: W605 - - where `a_i` is the i-th audio sample. - - Parameters - ---------- - energy_threshold : float - minimum energy that audio window should have to be valid. - sample_width : int - size in bytes of one audio sample. - channels : int - number of channels of audio data. - use_channel : {None, "any", "mix", "avg", "average"} or int - channel to use for energy computation. 
The following values are - accepted: - - - None (alias "any") : compute energy for each of the channels and return - the maximum value. - - "mix" (alias "avg" or "average") : compute the average channel then - compute its energy. - - int (>= 0 , < `channels`) : compute the energy of the specified channel - and ignore the other ones. - - Returns - ------- - energy : float - energy of the audio window. - """ - - def __init__( - self, energy_threshold, sample_width, channels, use_channel=None - ): - self._sample_width = sample_width - self._selector = make_channel_selector( - sample_width, channels, use_channel - ) - if channels == 1 or use_channel not in (None, "any"): - self._energy_fn = signal.calculate_energy_single_channel - else: - self._energy_fn = signal.calculate_energy_multichannel - self._energy_threshold = energy_threshold - - def is_valid(self, data): - """ - - Parameters - ---------- - data : bytes-like - array of raw audio data - - Returns - ------- - bool - True if the energy of audio data is >= threshold, False otherwise. - """ - log_energy = self._energy_fn(self._selector(data), self._sample_width) - return log_energy >= self._energy_threshold - - class StringDataSource(DataSource): """ - Class that represent a :class:`DataSource` as a string buffer. - Each call to :func:`DataSource.read` returns on character and moves one - step forward. If the end of the buffer is reached, :func:`read` returns - None. - - Parameters - ---------- - data : str - a string object used as data. - + A class that represent a :class:`DataSource` as a string buffer. + Each call to :func:`DataSource.read` returns on character and moves one step forward. + If the end of the buffer is reached, :func:`read` returns None. + + :Parameters: + + `data` : + a basestring object. + """ - + def __init__(self, data): self._data = None self._current = 0 self.set_data(data) - + + def read(self): """ Read one character from buffer. - - Returns - ------- - char : str - current character or None if end of buffer is reached. + + :Returns: + + Current character or None if end of buffer is reached """ - + if self._current >= len(self._data): return None self._current += 1 return self._data[self._current - 1] - + def set_data(self, data): """ Set a new data buffer. - - Parameters - ---------- - data : str - new data buffer. + + :Parameters: + + `data` : a basestring object + New data buffer. """ - - if not isinstance(data, str): - raise ValueError("data must an instance of str") + + if not isinstance(data, basestring): + raise ValueError("data must an instance of basestring") self._data = data self._current = 0 + class ADSFactory: """ - .. deprecated:: 2.0.0 - `ADSFactory` will be removed in auditok 2.0.1, use instances of - :class:`AudioReader` instead. - - Factory class that makes it easy to create an - :class:`AudioDataSource` object that implements - :class:`DataSource` and can therefore be passed to - :func:`auditok.core.StreamTokenizer.tokenize`. - - Whether you read audio data from a file, the microphone or a memory buffer, - this factory instantiates and returns the right - :class:`AudioDataSource` object. - - There are many other features you want a :class:`AudioDataSource` object to - have, such as: memorize all read audio data so that you can rewind and reuse - it (especially useful when reading data from the microphone), read a fixed - amount of data (also useful when reading from the microphone), read - overlapping audio frames (often needed when dosing a spectral analysis of - data). 
- - :func:`ADSFactory.ads` automatically creates and return object with the - desired behavior according to the supplied keyword arguments. + Factory class that makes it easy to create an :class:`ADSFactory.AudioDataSource` object that implements + :class:`DataSource` and can therefore be passed to :func:`auditok.core.StreamTokenizer.tokenize`. + + Whether you read audio data from a file, the microphone or a memory buffer, this factory + instantiates and returns the right :class:`ADSFactory.AudioDataSource` object. + + There are many other features you want your :class:`ADSFactory.AudioDataSource` object to have, such as: + memorize all read audio data so that you can rewind and reuse it (especially useful when + reading data from the microphone), read a fixed amount of data (also useful when reading + from the microphone), read overlapping audio frames (often needed when dosing a spectral + analysis of data). + + :func:`ADSFactory.ads` automatically creates and return object with the desired behavior according + to the supplied keyword arguments. + """ - - @staticmethod # noqa: C901 + + @staticmethod def _check_normalize_args(kwargs): - + for k in kwargs: - if k not in [ - "block_dur", - "hop_dur", - "block_size", - "hop_size", - "max_time", - "record", - "audio_source", - "filename", - "data_buffer", - "frames_per_buffer", - "sampling_rate", - "sample_width", - "channels", - "sr", - "sw", - "ch", - "asrc", - "fn", - "fpb", - "db", - "mt", - "rec", - "bd", - "hd", - "bs", - "hs", - ]: + if not k in ["block_dur", "hop_dur", "block_size", "hop_size", "max_time", "record", + "audio_source", "filename", "data_buffer", "frames_per_buffer", "sampling_rate", + "sample_width", "channels", "sr", "sw", "ch", "asrc", "fn", "fpb", "db", "mt", + "rec", "bd", "hd", "bs", "hs"]: raise ValueError("Invalid argument: {0}".format(k)) - + if "block_dur" in kwargs and "bd" in kwargs: - raise DuplicateArgument( - "Either 'block_dur' or 'bd' must be specified, not both" - ) - + raise DuplicateArgument("Either 'block_dur' or 'bd' must be specified, not both") + if "hop_dur" in kwargs and "hd" in kwargs: - raise DuplicateArgument( - "Either 'hop_dur' or 'hd' must be specified, not both" - ) - + raise DuplicateArgument("Either 'hop_dur' or 'hd' must be specified, not both") + if "block_size" in kwargs and "bs" in kwargs: - raise DuplicateArgument( - "Either 'block_size' or 'bs' must be specified, not both" - ) - + raise DuplicateArgument("Either 'block_size' or 'bs' must be specified, not both") + if "hop_size" in kwargs and "hs" in kwargs: - raise DuplicateArgument( - "Either 'hop_size' or 'hs' must be specified, not both" - ) - + raise DuplicateArgument("Either 'hop_size' or 'hs' must be specified, not both") + if "max_time" in kwargs and "mt" in kwargs: - raise DuplicateArgument( - "Either 'max_time' or 'mt' must be specified, not both" - ) - + raise DuplicateArgument("Either 'max_time' or 'mt' must be specified, not both") + if "audio_source" in kwargs and "asrc" in kwargs: - raise DuplicateArgument( - "Either 'audio_source' or 'asrc' must be specified, not both" - ) - + raise DuplicateArgument("Either 'audio_source' or 'asrc' must be specified, not both") + if "filename" in kwargs and "fn" in kwargs: - raise DuplicateArgument( - "Either 'filename' or 'fn' must be specified, not both" - ) - + raise DuplicateArgument("Either 'filename' or 'fn' must be specified, not both") + if "data_buffer" in kwargs and "db" in kwargs: - raise DuplicateArgument( - "Either 'filename' or 'db' must be specified, not both" - ) - 
+ raise DuplicateArgument("Either 'filename' or 'db' must be specified, not both") + if "frames_per_buffer" in kwargs and "fbb" in kwargs: - raise DuplicateArgument( - "Either 'frames_per_buffer' or 'fpb' must be specified, not " - "both" - ) - + raise DuplicateArgument("Either 'frames_per_buffer' or 'fpb' must be specified, not both") + if "sampling_rate" in kwargs and "sr" in kwargs: - raise DuplicateArgument( - "Either 'sampling_rate' or 'sr' must be specified, not both" - ) - + raise DuplicateArgument("Either 'sampling_rate' or 'sr' must be specified, not both") + if "sample_width" in kwargs and "sw" in kwargs: - raise DuplicateArgument( - "Either 'sample_width' or 'sw' must be specified, not both" - ) - + raise DuplicateArgument("Either 'sample_width' or 'sw' must be specified, not both") + if "channels" in kwargs and "ch" in kwargs: - raise DuplicateArgument( - "Either 'channels' or 'ch' must be specified, not both" - ) - + raise DuplicateArgument("Either 'channels' or 'ch' must be specified, not both") + if "record" in kwargs and "rec" in kwargs: - raise DuplicateArgument( - "Either 'record' or 'rec' must be specified, not both" - ) - + raise DuplicateArgument("Either 'record' or 'rec' must be specified, not both") + + kwargs["bd"] = kwargs.pop("block_dur", None) or kwargs.pop("bd", None) kwargs["hd"] = kwargs.pop("hop_dur", None) or kwargs.pop("hd", None) kwargs["bs"] = kwargs.pop("block_size", None) or kwargs.pop("bs", None) kwargs["hs"] = kwargs.pop("hop_size", None) or kwargs.pop("hs", None) kwargs["mt"] = kwargs.pop("max_time", None) or kwargs.pop("mt", None) - kwargs["asrc"] = kwargs.pop("audio_source", None) or kwargs.pop( - "asrc", None - ) + kwargs["asrc"] = kwargs.pop("audio_source", None) or kwargs.pop("asrc", None) kwargs["fn"] = kwargs.pop("filename", None) or kwargs.pop("fn", None) kwargs["db"] = kwargs.pop("data_buffer", None) or kwargs.pop("db", None) - + record = kwargs.pop("record", False) if not record: record = kwargs.pop("rec", False) if not isinstance(record, bool): raise TypeError("'record' must be a boolean") - + kwargs["rec"] = record - - # keep long names for arguments meant for BufferAudioSource - # and PyAudioSource + + # keep long names for arguments meant for BufferAudioSource and PyAudioSource if "frames_per_buffer" in kwargs or "fpb" in kwargs: - kwargs["frames_per_buffer"] = kwargs.pop( - "frames_per_buffer", None - ) or kwargs.pop("fpb", None) - + kwargs["frames_per_buffer"] = kwargs.pop("frames_per_buffer", None) or kwargs.pop("fpb", None) + if "sampling_rate" in kwargs or "sr" in kwargs: - kwargs["sampling_rate"] = kwargs.pop( - "sampling_rate", None - ) or kwargs.pop("sr", None) - - if "sample_width" in kwargs or "sw" in kwargs: - kwargs["sample_width"] = kwargs.pop( - "sample_width", None - ) or kwargs.pop("sw", None) - + kwargs["sampling_rate"] = kwargs.pop("sampling_rate", None) or kwargs.pop("sr", None) + + if "sample_width" in kwargs or "sw" in kwargs: + kwargs["sample_width"] = kwargs.pop("sample_width", None) or kwargs.pop("sw", None) + if "channels" in kwargs or "ch" in kwargs: - kwargs["channels"] = kwargs.pop("channels", None) or kwargs.pop( - "ch", None - ) - + kwargs["channels"] = kwargs.pop("channels", None) or kwargs.pop("ch", None) + + + + + + + @staticmethod def ads(**kwargs): + """ - Create an return an :class:`AudioDataSource`. The type and - behavior of the object is the result - of the supplied parameters. 
Called without any parameters, the class
-        will read audio data from the available built-in microphone with the
-        default parameters.
-
-        Parameters
-        ----------
-        sampling_rate, sr : int, default: 16000
-            number of audio samples per second of input audio stream.
-        sample_width, sw : int, default: 2
-            number of bytes per sample, must be one of 1, 2 or 4
-        channels, ch : int, default: 1
-            number of audio channels, only a value of 1 is currently accepted.
-        frames_per_buffer, fpb : int, default: 1024
-            number of samples of PyAudio buffer.
-        audio_source, asrc : `AudioSource`
-            `AudioSource` to read data from
-        filename, fn : str
-            create an `AudioSource` object using this file
-        data_buffer, db : str
-            build an `io.BufferAudioSource` using data in `data_buffer`.
-            If this keyword is used,
-            `sampling_rate`, `sample_width` and `channels` are passed to
-            `io.BufferAudioSource` constructor and used instead of default
-            values.
-        max_time, mt : float
-            maximum time (in seconds) to read. Default behavior: read until
-            there is no more data
-            available.
-        record, rec : bool, default = False
-            save all read data in cache. Provide a navigable object which has a
-            `rewind` method.
-        block_dur, bd : float
-            processing block duration in seconds. This represents the quantity
-            of audio data to return each time the :func:`read` method is
-            invoked. If `block_dur` is 0.025 (i.e. 25 ms) and the sampling rate
-            is 8000 and the sample width is 2 bytes, :func:`read` returns a
-            buffer of 0.025 * 8000 * 2 = 400 bytes at most. This parameter will
-            be looked for (and used if available) before `block_size`. If
-            neither parameter is given, `block_dur` will be set to 0.01 second
-            (i.e. 10 ms)
-        hop_dur, hd : float
-            quantity of data to skip from current processing window. if
-            `hop_dur` is supplied then there will be an overlap of `block_dur`
-            - `hop_dur` between two adjacent blocks. This parameter will be
-            looked for (and used if available) before `hop_size`.
-            If neither parameter is given, `hop_dur` will be set to `block_dur`
-            which means that there will be no overlap between two consecutively
-            read blocks.
-        block_size, bs : int
-            number of samples to read each time the `read` method is called.
-            Default: a block size that represents a window of 10ms, so for a
-            sampling rate of 16000, the default `block_size` is 160 samples,
-            for a rate of 44100, `block_size` = 441 samples, etc.
-        hop_size, hs : int
-            determines the number of overlapping samples between two adjacent
-            read windows. For a `hop_size` of value *N*, the overlap is
-            `block_size` - *N*. Default : `hop_size` = `block_size`, means that
-            there is no overlap.
-
-        Returns
-        -------
-        audio_data_source : AudioDataSource
-            an `AudioDataSource` object built with input parameters.
+        Create and return an :class:`ADSFactory.AudioDataSource`. The type and behavior of the object is the result
+        of the supplied parameters.
+
+        :Parameters:
+
+        *No parameters* :
+           read audio data from the available built-in microphone with the default parameters.
+           The returned :class:`ADSFactory.AudioDataSource` encapsulates an :class:`io.PyAudioSource` object, and
+           hence the next four parameters can be passed to it to use instead of their default values.
+
+        `sampling_rate`, `sr` : *(int)*
+            number of samples per second. Default = 16000.
+
+        `sample_width`, `sw` : *(int)*
+            number of bytes per sample (must be in (1, 2, 4)). Default = 2
+
+        `channels`, `ch` : *(int)*
+            number of audio channels. Default = 1 (only this value is currently accepted)
+
+        `frames_per_buffer`, `fpb` : *(int)*
+            number of samples of PyAudio buffer. Default = 1024.
+
+        `audio_source`, `asrc` : an `AudioSource` object
+            read data from this audio source
+
+        `filename`, `fn` : *(string)*
+            build an `io.AudioSource` object using this file (currently only wave format is supported)
+
+        `data_buffer`, `db` : *(string)*
+            build an `io.BufferAudioSource` using data in `data_buffer`. If this keyword is used,
+            `sampling_rate`, `sample_width` and `channels` are passed to `io.BufferAudioSource`
+            constructor and used instead of default values.
+
+        `max_time`, `mt` : *(float)*
+            maximum time (in seconds) to read. Default behavior: read until there is no more data
+            available.
+
+        `record`, `rec` : *(bool)*
+            save all read data in cache. Provide a navigable object which has a `rewind` method.
+            Default = False.
+
+        `block_dur`, `bd` : *(float)*
+            processing block duration in seconds. This represents the quantity of audio data to return
+            each time the :func:`read` method is invoked. If `block_dur` is 0.025 (i.e. 25 ms) and the sampling
+            rate is 8000 and the sample width is 2 bytes, :func:`read` returns a buffer of 0.025 * 8000 * 2 = 400
+            bytes at most. This parameter will be looked for (and used if available) before `block_size`.
+            If neither parameter is given, `block_dur` will be set to 0.01 second (i.e. 10 ms)
+
+
+        `hop_dur`, `hd` : *(float)*
+            quantity of data to skip from current processing window. If `hop_dur` is supplied then there
+            will be an overlap of `block_dur` - `hop_dur` between two adjacent blocks. This
+            parameter will be looked for (and used if available) before `hop_size`. If neither parameter
+            is given, `hop_dur` will be set to `block_dur` which means that there will be no overlap
+            between two consecutively read blocks.
+
+        `block_size`, `bs` : *(int)*
+            number of samples to read each time the `read` method is called. Default: a block size
+            that represents a window of 10ms, so for a sampling rate of 16000, the default `block_size`
+            is 160 samples, for a rate of 44100, `block_size` = 441 samples, etc.
+
+        `hop_size`, `hs` : *(int)*
+            determines the number of overlapping samples between two adjacent read windows. For a
+            `hop_size` of value *N*, the overlap is `block_size` - *N*. Default : `hop_size` = `block_size`,
+            means that there is no overlap.
+
+        :Returns:
+
+        An AudioDataSource object that has the desired features.
+
+        :Examples:
+
+        1. **Create an AudioDataSource that reads data from the microphone (requires PyAudio) with default audio parameters:**
+
+        .. code:: python
+
+            from auditok import ADSFactory
+            ads = ADSFactory.ads()
+            ads.get_sampling_rate()
+            16000
+            ads.get_sample_width()
+            2
+            ads.get_channels()
+            1
+
+
+        2. **Create an AudioDataSource that reads data from the microphone with a sampling rate of 48KHz:**
+
+        .. code:: python
+
+            from auditok import ADSFactory
+            ads = ADSFactory.ads(sr=48000)
+            ads.get_sampling_rate()
+            48000
+
+        3. **Create an AudioDataSource that reads data from a wave file:**
+
+        .. code:: python
+
+            import auditok
+            from auditok import ADSFactory
+            ads = ADSFactory.ads(fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
+            ads.get_sampling_rate()
+            44100
+            ads.get_sample_width()
+            2
+            ads.get_channels()
+            1
+
+        4. **Define size of read blocks as 20 ms**
+
+        .. code:: python
+
+            import auditok
+            from auditok import ADSFactory
+            '''
+            we know the sampling rate for the previous file is 44100 samples/second
+            so 10 ms are equivalent to 441 samples and 20 ms to 882
+            '''
+            block_size = 882
+            ads = ADSFactory.ads(bs = 882, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
+            ads.open()
+            # read one block
+            data = ads.read()
+            ads.close()
+            len(data)
+            1764
+            assert len(data) == ads.get_sample_width() * block_size
+
+        5. **Define block size as a duration (use block_dur or bd):**
+
+        .. code:: python
+
+            import auditok
+            from auditok import ADSFactory
+            dur = 0.25 # second
+            ads = ADSFactory.ads(bd = dur, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
+            '''
+            we know the sampling rate for the previous file is 44100 samples/second
+            for a block duration of 250 ms, block size should be 0.25 * 44100 = 11025
+            '''
+            ads.get_block_size()
+            11025
+            assert ads.get_block_size() == int(0.25 * 44100)
+            ads.open()
+            # read one block
+            data = ads.read()
+            ads.close()
+            len(data)
+            22050
+            assert len(data) == ads.get_sample_width() * ads.get_block_size()
+
+        6. **Read overlapping blocks (one of hop_size, hs, hop_dur or hd > 0):**
+
+        For better readability we use a :class:`auditok.io.BufferAudioSource` with a string buffer:
+
+        .. code:: python
+
+            import auditok
+            from auditok import ADSFactory
+            '''
+            we supply a data buffer instead of a file (keyword 'data_buffer' or 'db')
+            sr : sampling rate = 16 samples/sec
+            sw : sample width = 1 byte
+            ch : channels = 1
+            '''
+            buffer = "abcdefghijklmnop" # 16 bytes = 1 second of data
+            bd = 0.250 # block duration = 250 ms = 4 bytes
+            hd = 0.125 # hop duration = 125 ms = 2 bytes
+            ads = ADSFactory.ads(db = "abcdefghijklmnop", bd = bd, hd = hd, sr = 16, sw = 1, ch = 1)
+            ads.open()
+            ads.read()
+            'abcd'
+            ads.read()
+            'cdef'
+            ads.read()
+            'efgh'
+            ads.read()
+            'ghij'
+            data = ads.read()
+            assert data == 'ijkl'
+
+        7. **Limit amount of read data (use max_time or mt):**
+
+        .. code:: python
+
+            '''
+            We know audio file is larger than 2.25 seconds
+            We want to read up to 2.25 seconds of audio data
+            '''
+            ads = ADSFactory.ads(mt = 2.25, fn=auditok.dataset.was_der_mensch_saet_mono_44100_lead_trail_silence)
+            ads.open()
+            data = []
+            while True:
+                d = ads.read()
+                if d is None:
+                    break
+                data.append(d)
+
+            ads.close()
+            data = b''.join(data)
+            assert len(data) == int(ads.get_sampling_rate() * 2.25 * ads.get_sample_width() * ads.get_channels())
         """
-        warnings.warn(
-            "'ADSFactory' is deprecated and will be removed in a future "
-            "release. Please use AudioReader class instead.",
-            DeprecationWarning,
-        )
-
+
+        # copy user's dictionary (shallow copy)
+        kwargs = kwargs.copy()
+
         # check and normalize keyword arguments
         ADSFactory._check_normalize_args(kwargs)
+
         block_dur = kwargs.pop("bd")
         hop_dur = kwargs.pop("hd")
         block_size = kwargs.pop("bs")
@@ -628,483 +452,431 @@ class ADSFactory:
         filename = kwargs.pop("fn")
         data_buffer = kwargs.pop("db")
         record = kwargs.pop("rec")
-
+
         # Case 1: an audio source is supplied
         if audio_source is not None:
            if (filename, data_buffer) != (None, None):
-                raise Warning(
-                    "You should provide one of 'audio_source', 'filename' or \
-                    'data_buffer' keyword parameters. 'audio_source' will be \
-                    used"
-                )
-
+                raise Warning("You should provide one of 'audio_source', 'filename' or 'data_buffer'\
+                               keyword parameters.
'audio_source' will be used") + # Case 2: a file name is supplied elif filename is not None: if data_buffer is not None: - raise Warning( - "You should provide one of 'filename' or 'data_buffer'\ - keyword parameters. 'filename' will be used" - ) + raise Warning("You should provide one of 'filename' or 'data_buffer'\ + keyword parameters. 'filename' will be used") audio_source = from_file(filename) - - # Case 3: a data_buffer is supplied + + # Case 3: a data_buffer is supplied elif data_buffer is not None: - audio_source = BufferAudioSource(data=data_buffer, **kwargs) - + audio_source = BufferAudioSource(data_buffer = data_buffer, **kwargs) + # Case 4: try to access native audio input else: audio_source = PyAudioSource(**kwargs) - + + if block_dur is not None: if block_size is not None: - raise DuplicateArgument( - "Either 'block_dur' or 'block_size' can be specified, not \ - both" - ) - elif block_size is not None: - block_dur = block_size / audio_source.sr - else: - block_dur = 0.01 # 10 ms - + raise DuplicateArgument("Either 'block_dur' or 'block_size' can be specified, not both") + else: + block_size = int(audio_source.get_sampling_rate() * block_dur) + elif block_size is None: + # Set default block_size to 10 ms + block_size = int(audio_source.get_sampling_rate() / 100) + + # Instantiate base AudioDataSource + ads = ADSFactory.AudioDataSource(audio_source=audio_source, block_size=block_size) + + # Limit data to be read + if max_time is not None: + ads = ADSFactory.LimiterADS(ads=ads, max_time=max_time) + + # Record, rewind and reuse data + if record: + ads = ADSFactory.RecorderADS(ads=ads) + # Read overlapping blocks of data if hop_dur is not None: if hop_size is not None: - raise DuplicateArgument( - "Either 'hop_dur' or 'hop_size' can be specified, not both" - ) - elif hop_size is not None: - hop_dur = hop_size / audio_source.sr - - ads = AudioDataSource( - audio_source, - block_dur=block_dur, - hop_dur=hop_dur, - record=record, - max_read=max_time, - ) + raise DuplicateArgument("Either 'hop_dur' or 'hop_size' can be specified, not both") + else: + hop_size = int(audio_source.get_sampling_rate() * hop_dur) + + if hop_size is not None: + if hop_size <= 0 or hop_size > block_size: + raise ValueError("hop_size must be > 0 and <= block_size") + if hop_size < block_size: + ads = ADSFactory.OverlapADS(ads=ads, hop_size=hop_size) + return ads + + + class AudioDataSource(DataSource): + """ + Base class for AudioDataSource objects. + It inherits from DataSource and encapsulates an AudioSource object. 
+ """ + + def __init__(self, audio_source, block_size): + + self.audio_source = audio_source + self.block_size = block_size + + def get_block_size(self): + return self.block_size + + def set_block_size(self, size): + self.block_size = size + + def get_audio_source(self): + return self.audio_source + + def set_audio_source(self, audio_source): + self.audio_source = audio_source + + def open(self): + self.audio_source.open() + + def close(self): + self.audio_source.close() + + def is_open(self): + return self.audio_source.is_open() + + def get_sampling_rate(self): + return self.audio_source.get_sampling_rate() + + def get_sample_width(self): + return self.audio_source.get_sample_width() + + def get_channels(self): + return self.audio_source.get_channels() + + + def rewind(self): + if isinstance(self.audio_source, Rewindable): + self.audio_source.rewind() + else: + raise Exception("Audio source is not rewindable") + + + + def is_rewindable(self): + return isinstance(self.audio_source, Rewindable) + + + def read(self): + return self.audio_source.read(self.block_size) + + + class ADSDecorator(AudioDataSource): + """ + Base decorator class for AudioDataSource objects. + """ + __metaclass__ = ABCMeta + + def __init__(self, ads): + self.ads = ads + + self.get_block_size = self.ads.get_block_size + self.set_block_size = self.ads.set_block_size + self.get_audio_source = self.ads.get_audio_source + self.open = self.ads.open + self.close = self.ads.close + self.is_open = self.ads.is_open + self.get_sampling_rate = self.ads.get_sampling_rate + self.get_sample_width = self.ads.get_sample_width + self.get_channels = self.ads.get_channels + + def is_rewindable(self): + return self.ads.is_rewindable + + def rewind(self): + self.ads.rewind() + self._reinit() + + def set_audio_source(self, audio_source): + self.ads.set_audio_source(audio_source) + self._reinit() + + def open(self): + if not self.ads.is_open(): + self.ads.open() + self._reinit() + + @abstractmethod + def _reinit(self): + pass + + + class OverlapADS(ADSDecorator): + """ + A class for AudioDataSource objects that can read and return overlapping audio frames + """ + + def __init__(self, ads, hop_size): + ADSFactory.ADSDecorator.__init__(self, ads) + + if hop_size <= 0 or hop_size > self.get_block_size(): + raise ValueError("hop_size must be either 'None' or \ + between 1 and block_size (both inclusive)") + self.hop_size = hop_size + self._actual_block_size = self.get_block_size() + self._reinit() + + + def _get_block_size(): + return self._actual_block_size + + + def _read_first_block(self): + # For the first call, we need an entire block of size 'block_size' + block = self.ads.read() + if block is None: + return None + + # Keep a slice of data in cache and append it in the next call + if len(block) > self._hop_size_bytes: + self._cache = block[self._hop_size_bytes:] + + # Up from the next call, we will use '_read_next_blocks' + # and we only read 'hop_size' + self.ads.set_block_size(self.hop_size) + self.read = self._read_next_blocks + + return block + + def _read_next_blocks(self): + block = self.ads.read() + if block is None: + return None + + # Append block to cache data to ensure overlap + block = self._cache + block + # Keep a slice of data in cache only if we have a full length block + # if we don't that means that this is the last block + if len(block) == self._block_size_bytes: + self._cache = block[self._hop_size_bytes:] + else: + self._cache = None + + return block - -class _AudioReadingProxy: - def __init__(self, audio_source): - - 
self._audio_source = audio_source - - def rewind(self): - if self.rewindable: - self._audio_source.rewind() - else: - raise AudioIOError("Audio stream is not rewindable") - - def rewindable(self): - try: - return self._audio_source.rewindable - except AttributeError: - return False - - def is_open(self): - return self._audio_source.is_open() - - def open(self): - self._audio_source.open() - - def close(self): - self._audio_source.close() - - def read(self, size): - return self._audio_source.read(size) - - @property - def data(self): - err_msg = "This AudioReader is not a recorder, no recorded data can " - err_msg += "be retrieved" - raise AttributeError(err_msg) - - def __getattr__(self, name): - return getattr(self._audio_source, name) - - -class _Recorder(_AudioReadingProxy): - """ - Class for `AudioReader` objects that can record all data they read. Useful - when reading data from microphone. - """ - - def __init__(self, audio_source): - super(_Recorder, self).__init__(audio_source) - self._cache = [] - self._read_block = self._read_and_cache - self._read_from_cache = False - self._data = None - - def read(self, size): - return self._read_block(size) - - @property - def data(self): - if self._data is None: - err_msg = "Unrewinded recorder. `rewind` should be called before " - err_msg += "accessing recorded data" - raise RuntimeError(err_msg) - return self._data - - def rewindable(self): - return True - - def rewind(self): - if self._read_from_cache: - self._audio_source.rewind() - else: - self._data = b"".join(self._cache) + def read(self): + pass + + def _reinit(self): self._cache = None - self._audio_source = BufferAudioSource( - self._data, self.sr, self.sw, self.ch - ) - self._read_block = self._audio_source.read - self.open() - self._read_from_cache = True - - def _read_and_cache(self, size): - # Read and save read data - block = self._audio_source.read(size) - if block is not None: - self._cache.append(block) - return block - - -class _Limiter(_AudioReadingProxy): - """ - Class for `AudioReader` objects that can read a fixed amount of data. - This can be useful when reading data from the microphone or from large - audio files. - """ - - def __init__(self, audio_source, max_read): - super(_Limiter, self).__init__(audio_source) - self._max_read = max_read - self._max_samples = round(max_read * self.sr) - self._bytes_per_sample = self.sw * self.ch - self._read_samples = 0 - - @property - def data(self): - data = self._audio_source.data - max_read_bytes = self._max_samples * self._bytes_per_sample - return data[:max_read_bytes] - - @property - def max_read(self): - return self._max_read - - def read(self, size): - size = min(self._max_samples - self._read_samples, size) - if size <= 0: - return None - block = self._audio_source.read(size) - if block is None: - return None - self._read_samples += len(block) // self._bytes_per_sample - return block - - def rewind(self): - super(_Limiter, self).rewind() - self._read_samples = 0 - - -class _FixedSizeAudioReader(_AudioReadingProxy): - """ - Class to read fixed-size audio windows from source. - """ - - def __init__(self, audio_source, block_dur): - super(_FixedSizeAudioReader, self).__init__(audio_source) - - if block_dur <= 0: - raise ValueError( - "block_dur must be > 0, given: {}".format(block_dur) - ) - - self._block_size = int(block_dur * self.sr) - if self._block_size == 0: - err_msg = "Too small block_dur ({0:f}) for sampling rate ({1}). " - err_msg += "block_dur should cover at least one sample " - err_msg += "(i.e. 
1/{1})" - raise TooSamllBlockDuration( - err_msg.format(block_dur, self.sr), block_dur, self.sr - ) - - def read(self): - return self._audio_source.read(self._block_size) - - @property - def block_size(self): - return self._block_size - - @property - def block_dur(self): - return self._block_size / self.sr - - def __getattr__(self, name): - return getattr(self._audio_source, name) - - -class _OverlapAudioReader(_FixedSizeAudioReader): - """ - Class for `AudioReader` objects that can read and return overlapping audio - windows. - """ + self.ads.set_block_size(self._actual_block_size) + self._hop_size_bytes = self.hop_size * \ + self.get_sample_width() * \ + self.get_channels() + self._block_size_bytes = self.get_block_size() * \ + self.get_sample_width() * \ + self.get_channels() + self.read = self._read_first_block - def __init__(self, audio_source, block_dur, hop_dur): - if hop_dur >= block_dur: - raise ValueError('"hop_dur" should be < "block_dur"') - super(_OverlapAudioReader, self).__init__(audio_source, block_dur) - - self._hop_size = int(hop_dur * self.sr) - self._blocks = self._iter_blocks_with_overlap() - - def _iter_blocks_with_overlap(self): - while not self.is_open(): - yield AudioIOError - block = self._audio_source.read(self._block_size) - if block is None: - yield None - - _hop_size_bytes = ( - self._hop_size * self._audio_source.sw * self._audio_source.ch - ) - cache = block[_hop_size_bytes:] - yield block - - while True: - block = self._audio_source.read(self._hop_size) - if block: - block = cache + block - cache = block[_hop_size_bytes:] - yield block - continue - yield None - - def read(self): - try: - block = next(self._blocks) - if block == AudioIOError: - raise AudioIOError("Audio Stream is not open.") + class LimiterADS(ADSDecorator): + """ + A class for AudioDataSource objects that can read a fixed amount of data. + This can be useful when reading data from the microphone or from large audio files. + """ + + def __init__(self, ads, max_time): + ADSFactory.ADSDecorator.__init__(self, ads) + + self.max_time = max_time + self._reinit() + + def read(self): + if self._total_read_bytes >= self._max_read_bytes: + return None + block = self.ads.read() + if block is None: + return None + self._total_read_bytes += len(block) + + if self._total_read_bytes >= self._max_read_bytes: + self.close() + return block - except StopIteration: - return None - - def rewind(self): - super(_OverlapAudioReader, self).rewind() - self._blocks = self._iter_blocks_with_overlap() - - @property - def hop_size(self): - return self._hop_size - - @property - def hop_dur(self): - return self._hop_size / self.sr - - def __getattr__(self, name): - return getattr(self._audio_source, name) - + + + def _reinit(self): + self._max_read_bytes = int(self.max_time * self.get_sampling_rate()) * \ + self.get_sample_width() * \ + self.get_channels() + self._total_read_bytes = 0 -class AudioReader(DataSource): - """ - Class to read fixed-size chunks of audio data from a source. A source can - be a file on disk, standard input (with `input` = "-") or microphone. This - is normally used by tokenization algorithms that expect source objects with - a `read` function that returns a windows of data of the same size at each - call expect when remaining data does not make up a full window. - - Objects of this class can be set up to return audio windows with a given - overlap and to record the whole stream for later access (useful when - reading data from the microphone). 
They can also have - a limit for the maximum amount of data to read. - - Parameters - ---------- - input : str, bytes, AudioSource, AudioReader, AudioRegion or None - input audio data. If the type of the passed argument is `str`, it should - be a path to an existing audio file. "-" is interpreted as standardinput. - If the type is `bytes`, input is considered as a buffer of raw audio - data. If None, read audio from microphone. Every object that is not an - :class:`AudioReader` will be transformed, when possible, into an - :class:`AudioSource` before processing. If it is an `str` that refers to - a raw audio file, `bytes` or None, audio parameters should be provided - using kwargs (i.e., `samplig_rate`, `sample_width` and `channels` or - their alias). - block_dur: float, default: 0.01 - length in seconds of audio windows to return at each `read` call. - hop_dur: float, default: None - length in seconds of data amount to skip from previous window. If - defined, it is used to compute the temporal overlap between previous and - current window (nameply `overlap = block_dur - hop_dur`). Default, None, - means that consecutive windows do not overlap. - record: bool, default: False - whether to record read audio data for later access. If True, audio data - can be retrieved by first calling `rewind()`, then using the `data` - property. Note that once `rewind()` is called, no new data will be read - from source (subsequent `read()` call will read data from cache) and - that there's no need to call `rewind()` again to access `data` property. - max_read: float, default: None - maximum amount of audio data to read in seconds. Default is None meaning - that data will be read until end of stream is reached or, when reading - from microphone a Ctrl-C is sent. - - When `input` is None, of type bytes or a raw audio files some of the - follwing kwargs are mandatory. + - Other Parameters - ---------------- - audio_format, fmt : str - type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be - used if `input` is a string path to an audio file. If not given, audio - type will be guessed from file name extension or from file header. - sampling_rate, sr : int - sampling rate of audio data. Required if `input` is a raw audio file, is - a bytes object or None (i.e., read from microphone). - sample_width, sw : int - number of bytes used to encode one audio sample, typically 1, 2 or 4. - Required for raw data, see `sampling_rate`. - channels, ch : int - number of channels of audio data. Required for raw data, see - `sampling_rate`. - use_channel, uc : {None, "any", "mix", "avg", "average"} or int - which channel to use for split if `input` has multiple audio channels. - Regardless of which channel is used for splitting, returned audio events - contain data from *all* the channels of `input`. The following values - are accepted: - - - None (alias "any"): accept audio activity from any channel, even if - other channels are silent. This is the default behavior. - - - "mix" (alias "avg" or "average"): mix down all channels (i.e., compute - average channel) and split the resulting channel. + class RecorderADS(ADSDecorator): + """ + A class for AudioDataSource objects that can record all audio data they read, + with a rewind facility. 
+ """ + + def __init__(self, ads): + ADSFactory.ADSDecorator.__init__(self, ads) + + self._reinit() + + def read(self): + pass + + def _read_and_rec(self): + # Read and save read data + block = self.ads.read() + if block is not None: + self._cache.append(block) + + return block + + + def _read_simple(self): + # Read without recording + return self.ads.read() + + def rewind(self): + if self._record: + # If has been recording, create a new BufferAudioSource + # from recorded data + dbuffer = self._concatenate(self._cache) + asource = BufferAudioSource(dbuffer, self.get_sampling_rate(), + self.get_sample_width(), + self.get_channels()) + + + self.set_audio_source(asource) + self.open() + self._cache = [] + self._record = False + self.read = self._read_simple + + else: + self.ads.rewind() + if not self.is_open(): + self.open() + + + def is_rewindable(self): + return True + + def _reinit(self): + # when audio_source is replaced, start recording again + self._record = True + self._cache = [] + self.read = self._read_and_rec + + def _concatenate(self, data): + try: + # should always work for python 2 + # work for python 3 ONLY if data is a list (or an iterator) + # whose each element is a 'bytes' objects + return b''.join(data) + except TypeError: + # work for 'str' in python 2 and python 3 + return ''.join(data) - - int (>= 0 , < `channels`): use one channel, specified by its integer - id, for split. - large_file : bool, default: False - If True, AND if `input` is a path to a *wav* of a *raw* audio file - (and only these two formats) then audio data is lazily loaded to memory - (i.e., one analysis window a time). Otherwise the whole file is loaded - to memory before split. Set to True if the size of the file is larger - than available memory. +class AudioEnergyValidator(DataValidator): """ - - def __init__( - self, - input, - block_dur=0.01, - hop_dur=None, - record=False, - max_read=None, - **kwargs - ): - if not isinstance(input, AudioSource): - input = get_audio_source(input, **kwargs) - self._record = record - if record: - input = _Recorder(input) - if max_read is not None: - input = _Limiter(input, max_read) - self._max_read = max_read - if hop_dur is not None: - input = _OverlapAudioReader(input, block_dur, hop_dur) - else: - input = _FixedSizeAudioReader(input, block_dur) - self._audio_source = input - - def __repr__(self): - block_dur, hop_dur, max_read = None, None, None - if self.block_dur is not None: - block_dur = "{:.3f}".format(self.block_dur) - if self.hop_dur is not None: - hop_dur = "{:.3f}".format(self.hop_dur) - if self.max_read is not None: - max_read = "{:.3f}".format(self.max_read) - return ( - "{cls}(block_dur={block_dur}, " - "hop_dur={hop_dur}, record={rewindable}, " - "max_read={max_read})" - ).format( - cls=self.__class__.__name__, - block_dur=block_dur, - hop_dur=hop_dur, - rewindable=self._record, - max_read=max_read, - ) - - @property - def rewindable(self): - return self._record - - @property - def block_dur(self): - return self._audio_source.block_size / self._audio_source.sr - - @property - def hop_dur(self): - if hasattr(self._audio_source, "hop_dur"): - return self._audio_source.hop_size / self._audio_source.sr - return self.block_dur - - @property - def hop_size(self): - if hasattr(self._audio_source, "hop_size"): - return self._audio_source.hop_size - return self.block_size - - @property - def max_read(self): - try: - return self._audio_source.max_read - except AttributeError: - return None - - def read(self): - return self._audio_source.read() - - def 
__getattr__(self, name):
-        if name in ("data", "rewind") and not self.rewindable:
-            raise AttributeError(
-                "'AudioReader' has no attribute '{}'".format(name)
-            )
-        try:
-            return getattr(self._audio_source, name)
-        except AttributeError:
-            raise AttributeError(
-                "'AudioReader' has no attribute '{}'".format(name)
-            )
-
-
-# Keep AudioDataSource for compatibility
-# Remove in a future version when ADSFactory is removed
-AudioDataSource = AudioReader
-
-
-class Recorder(AudioReader):
-    """Class to read fixed-size chunks of audio data from a source and keeps
-    data in a cache. Using this class is equivalent to initializing
-    :class:`AudioReader` with `record=True`. For more information about the
-    other parameters see :class:`AudioReader`.
-
-    Once the desired amount of data is read, you can call the :func:`rewind`
-    method then get the recorded data via the :attr:`data` attribute. You can also
-    re-read cached data one window a time by calling :func:`read`.
+    The most basic auditok audio frame validator.
+    This validator computes the log energy of an input audio frame
+    and returns True if the result is >= a given threshold, False
+    otherwise.
+
+    :Parameters:
+
+    `sample_width` : *(int)*
+        Number of bytes of one audio sample. This is used to convert data from a `basestring` or `Bytes` buffer to
+        an array of floats.
+
+    `energy_threshold` : *(float)*
+        A threshold used to check whether an input data buffer is valid.
    """
+
+
+    if _WITH_NUMPY:
+
+        _formats = {1: numpy.int8 , 2: numpy.int16, 4: numpy.int32}
+
+        @staticmethod
+        def _convert(signal, sample_width):
+            return numpy.array(numpy.frombuffer(signal, dtype=AudioEnergyValidator._formats[sample_width]), dtype=numpy.float64)
+
+        @staticmethod
+        def _signal_energy(signal):
+            return float(numpy.dot(signal, signal)) / len(signal)
+
+        @staticmethod
+        def _signal_log_energy(signal):
+            energy = AudioEnergyValidator._signal_energy(signal)
+            if energy <= 0:
+                return -200
+            return 10. * numpy.log10(energy)
+
+    else:
+
+
+        _formats = {1: 'b' , 2: 'h', 4: 'i'}
+
+        @staticmethod
+        def _convert(signal, sample_width):
+            return array("d", array(AudioEnergyValidator._formats[sample_width], signal))
+
+        @staticmethod
+        def _signal_energy(signal):
+            energy = 0.
+            for a in signal:
+                energy += a * a
+            return energy / len(signal)
+
+        @staticmethod
+        def _signal_log_energy(signal):
+            energy = AudioEnergyValidator._signal_energy(signal)
+            if energy <= 0:
+                return -200
+            return 10. * math.log10(energy)
+
+
+    def __init__(self, sample_width, energy_threshold=45):
+        self.sample_width = sample_width
+        self._energy_threshold = energy_threshold
+
+
+    def is_valid(self, data):
+        """
+        Check whether `data` is valid. Audio data is converted into an array of
+        signed values whose log energy is computed as follows:
+
+        .. code:: python
+
+            arr = AudioEnergyValidator._convert(signal, sample_width)
+            energy = float(numpy.dot(arr, arr)) / len(arr)
+            log_energy = 10. * numpy.log10(energy)
+
+
+        :Parameters:
+
+        `data` : either a *string* or a *Bytes* buffer
+            `data` is converted into a numerical array using the `sample_width`
+            given in the constructor.
+
+        :Returns:
+
+        True if `log_energy` >= `energy_threshold`, False otherwise. 
+ """ + + signal = AudioEnergyValidator._convert(data, self.sample_width) + return AudioEnergyValidator._signal_log_energy(signal) >= self._energy_threshold + + def get_energy_threshold(self): + return self._energy_threshold + + def set_energy_threshold(self, threshold): + self._energy_threshold = threshold - def __init__( - self, input, block_dur=0.01, hop_dur=None, max_read=None, **kwargs - ): - super().__init__( - input, - block_dur=block_dur, - hop_dur=hop_dur, - record=True, - max_read=max_read, - **kwargs - ) diff --git a/libs/auditok/workers.py b/libs/auditok/workers.py deleted file mode 100755 index bb6d54a98..000000000 --- a/libs/auditok/workers.py +++ /dev/null @@ -1,427 +0,0 @@ -import os -import sys -from tempfile import NamedTemporaryFile -from abc import ABCMeta, abstractmethod -from threading import Thread -from datetime import datetime, timedelta -from collections import namedtuple -import wave -import subprocess -from queue import Queue, Empty -from .io import _guess_audio_format -from .util import AudioDataSource, make_duration_formatter -from .core import split -from .exceptions import ( - EndOfProcessing, - AudioEncodingError, - AudioEncodingWarning, -) - - -_STOP_PROCESSING = "STOP_PROCESSING" -_Detection = namedtuple("_Detection", "id start end duration") - - -def _run_subprocess(command): - try: - with subprocess.Popen( - command, - stdin=open(os.devnull, "rb"), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) as proc: - stdout, stderr = proc.communicate() - return proc.returncode, stdout, stderr - except Exception: - err_msg = "Couldn't export audio using command: '{}'".format(command) - raise AudioEncodingError(err_msg) - - -class Worker(Thread, metaclass=ABCMeta): - def __init__(self, timeout=0.5, logger=None): - self._timeout = timeout - self._logger = logger - self._inbox = Queue() - Thread.__init__(self) - - def run(self): - while True: - message = self._get_message() - if message == _STOP_PROCESSING: - break - if message is not None: - self._process_message(message) - self._post_process() - - @abstractmethod - def _process_message(self, message): - """Process incoming messages""" - - def _post_process(self): - pass - - def _log(self, message): - self._logger.info(message) - - def _stop_requested(self): - try: - message = self._inbox.get_nowait() - if message == _STOP_PROCESSING: - return True - except Empty: - return False - - def stop(self): - self.send(_STOP_PROCESSING) - self.join() - - def send(self, message): - self._inbox.put(message) - - def _get_message(self): - try: - message = self._inbox.get(timeout=self._timeout) - return message - except Empty: - return None - - -class TokenizerWorker(Worker, AudioDataSource): - def __init__(self, reader, observers=None, logger=None, **kwargs): - self._observers = observers if observers is not None else [] - self._reader = reader - self._audio_region_gen = split(self, **kwargs) - self._detections = [] - self._log_format = "[DET]: Detection {0.id} (start: {0.start:.3f}, " - self._log_format += "end: {0.end:.3f}, duration: {0.duration:.3f})" - Worker.__init__(self, timeout=0.2, logger=logger) - - def _process_message(self): - pass - - @property - def detections(self): - return self._detections - - def _notify_observers(self, message): - for observer in self._observers: - observer.send(message) - - def run(self): - self._reader.open() - start_processing_timestamp = datetime.now() - for _id, audio_region in enumerate(self._audio_region_gen, start=1): - timestamp = start_processing_timestamp + timedelta( - 
seconds=audio_region.meta.start - ) - audio_region.meta.timestamp = timestamp - detection = _Detection( - _id, - audio_region.meta.start, - audio_region.meta.end, - audio_region.duration, - ) - self._detections.append(detection) - if self._logger is not None: - message = self._log_format.format(detection) - self._log(message) - self._notify_observers((_id, audio_region)) - self._notify_observers(_STOP_PROCESSING) - self._reader.close() - - def start_all(self): - for observer in self._observers: - observer.start() - self.start() - - def stop_all(self): - self.stop() - for observer in self._observers: - observer.stop() - self._reader.close() - - def read(self): - if self._stop_requested(): - return None - else: - return self._reader.read() - - def __getattr__(self, name): - return getattr(self._reader, name) - - -class StreamSaverWorker(Worker): - def __init__( - self, - audio_reader, - filename, - export_format=None, - cache_size_sec=0.5, - timeout=0.2, - ): - self._reader = audio_reader - sample_size_bytes = self._reader.sw * self._reader.ch - self._cache_size = cache_size_sec * self._reader.sr * sample_size_bytes - self._output_filename = filename - self._export_format = _guess_audio_format(export_format, filename) - if self._export_format is None: - self._export_format = "wav" - self._init_output_stream() - self._exported = False - self._cache = [] - self._total_cached = 0 - Worker.__init__(self, timeout=timeout) - - def _get_non_existent_filename(self): - filename = self._output_filename + ".wav" - i = 0 - while os.path.exists(filename): - i += 1 - filename = self._output_filename + "({}).wav".format(i) - return filename - - def _init_output_stream(self): - if self._export_format != "wav": - self._tmp_output_filename = self._get_non_existent_filename() - else: - self._tmp_output_filename = self._output_filename - self._wfp = wave.open(self._tmp_output_filename, "wb") - self._wfp.setframerate(self._reader.sr) - self._wfp.setsampwidth(self._reader.sw) - self._wfp.setnchannels(self._reader.ch) - - @property - def sr(self): - return self._reader.sampling_rate - - @property - def sw(self): - return self._reader.sample_width - - @property - def ch(self): - return self._reader.channels - - def __del__(self): - self._post_process() - - if ( - (self._tmp_output_filename != self._output_filename) - and self._exported - and os.path.exists(self._tmp_output_filename) - ): - os.remove(self._tmp_output_filename) - - def _process_message(self, data): - self._cache.append(data) - self._total_cached += len(data) - if self._total_cached >= self._cache_size: - self._write_cached_data() - - def _post_process(self): - while True: - try: - data = self._inbox.get_nowait() - if data != _STOP_PROCESSING: - self._cache.append(data) - self._total_cached += len(data) - except Empty: - break - self._write_cached_data() - self._wfp.close() - - def _write_cached_data(self): - if self._cache: - data = b"".join(self._cache) - self._wfp.writeframes(data) - self._cache = [] - self._total_cached = 0 - - def open(self): - self._reader.open() - - def close(self): - self._reader.close() - self.stop() - - def rewind(self): - # ensure compatibility with AudioDataSource with record=True - pass - - @property - def data(self): - with wave.open(self._tmp_output_filename, "rb") as wfp: - return wfp.readframes(-1) - - def save_stream(self): - if self._exported: - return self._output_filename - - if self._export_format in ("raw", "wav"): - if self._export_format == "raw": - self._export_raw() - self._exported = True - return 
self._output_filename - try: - self._export_with_ffmpeg_or_avconv() - except AudioEncodingError: - try: - self._export_with_sox() - except AudioEncodingError: - warn_msg = "Couldn't save audio data in the desired format " - warn_msg += "'{}'. Either none of 'ffmpeg', 'avconv' or 'sox' " - warn_msg += "is installed or this format is not recognized.\n" - warn_msg += "Audio file was saved as '{}'" - raise AudioEncodingWarning( - warn_msg.format( - self._export_format, self._tmp_output_filename - ) - ) - finally: - self._exported = True - return self._output_filename - - def _export_raw(self): - with open(self._output_filename, "wb") as wfp: - wfp.write(self.data) - - def _export_with_ffmpeg_or_avconv(self): - command = [ - "-y", - "-f", - "wav", - "-i", - self._tmp_output_filename, - "-f", - self._export_format, - self._output_filename, - ] - returncode, stdout, stderr = _run_subprocess(["ffmpeg"] + command) - if returncode != 0: - returncode, stdout, stderr = _run_subprocess(["avconv"] + command) - if returncode != 0: - raise AudioEncodingError(stderr) - return stdout, stderr - - def _export_with_sox(self): - command = [ - "sox", - "-t", - "wav", - self._tmp_output_filename, - self._output_filename, - ] - returncode, stdout, stderr = _run_subprocess(command) - if returncode != 0: - raise AudioEncodingError(stderr) - return stdout, stderr - - def close_output(self): - self._wfp.close() - - def read(self): - data = self._reader.read() - if data is not None: - self.send(data) - else: - self.send(_STOP_PROCESSING) - return data - - def __getattr__(self, name): - if name == "data": - return self.data - return getattr(self._reader, name) - - -class PlayerWorker(Worker): - def __init__(self, player, progress_bar=False, timeout=0.2, logger=None): - self._player = player - self._progress_bar = progress_bar - self._log_format = "[PLAY]: Detection {id} played" - Worker.__init__(self, timeout=timeout, logger=logger) - - def _process_message(self, message): - _id, audio_region = message - if self._logger is not None: - message = self._log_format.format(id=_id) - self._log(message) - audio_region.play( - player=self._player, progress_bar=self._progress_bar, leave=False - ) - - -class RegionSaverWorker(Worker): - def __init__( - self, - filename_format, - audio_format=None, - timeout=0.2, - logger=None, - **audio_parameters - ): - self._filename_format = filename_format - self._audio_format = audio_format - self._audio_parameters = audio_parameters - self._debug_format = "[SAVE]: Detection {id} saved as '{filename}'" - Worker.__init__(self, timeout=timeout, logger=logger) - - def _process_message(self, message): - _id, audio_region = message - filename = self._filename_format.format( - id=_id, - start=audio_region.meta.start, - end=audio_region.meta.end, - duration=audio_region.duration, - ) - filename = audio_region.save( - filename, self._audio_format, **self._audio_parameters - ) - if self._logger: - message = self._debug_format.format(id=_id, filename=filename) - self._log(message) - - -class CommandLineWorker(Worker): - def __init__(self, command, timeout=0.2, logger=None): - self._command = command - Worker.__init__(self, timeout=timeout, logger=logger) - self._debug_format = "[COMMAND]: Detection {id} command: '{command}'" - - def _process_message(self, message): - _id, audio_region = message - with NamedTemporaryFile(delete=False) as file: - filename = audio_region.save(file.name, audio_format="wav") - command = self._command.format(file=filename) - os.system(command) - if self._logger is not None: 
-            message = self._debug_format.format(id=_id, command=command)
-            self._log(message)
-
-
-class PrintWorker(Worker):
-    def __init__(
-        self,
-        print_format="{start} {end}",
-        time_format="%S",
-        timestamp_format="%Y/%m/%d %H:%M:%S.%f",
-        timeout=0.2,
-    ):
-
-        self._print_format = print_format
-        self._format_time = make_duration_formatter(time_format)
-        self._timestamp_format = timestamp_format
-        self.detections = []
-        Worker.__init__(self, timeout=timeout)
-
-    def _process_message(self, message):
-        _id, audio_region = message
-        timestamp = audio_region.meta.timestamp
-        timestamp = timestamp.strftime(self._timestamp_format)
-        text = self._print_format.format(
-            id=_id,
-            start=self._format_time(audio_region.meta.start),
-            end=self._format_time(audio_region.meta.end),
-            duration=self._format_time(audio_region.duration),
-            timestamp=timestamp,
-        )
-        print(text)
diff --git a/libs/version.txt b/libs/version.txt
index d31ee88bd..d82f63262 100644
--- a/libs/version.txt
+++ b/libs/version.txt
@@ -52,7 +52,7 @@ tzlocal==4.1
 soupsieve==2.3.1
 
 # Required-by: ffsubsync
-auditok==0.2.0
+auditok==0.1.5 # do not upgrade unless ffsubsync requirements.txt changes
 ffmpeg-python==0.2.0
 future==0.18.2
 rich==11.0.0
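
Reviewer note (illustration only, not part of the patch): a minimal sketch of the 0.1.5-style API this revert restores, following the `ADSFactory.ads` and `AudioEnergyValidator` docstrings above. The byte buffer, block/hop durations and the energy threshold are assumed values chosen for demonstration.

.. code:: python

    # Minimal sketch of the reverted auditok 0.1.5 API (assumed values).
    from auditok.util import ADSFactory, AudioEnergyValidator

    # 16 bytes at sr=16, sw=1, ch=1 -> one second of dummy audio data;
    # bd/hd give 250 ms blocks with a 125 ms hop (see example 6 above).
    ads = ADSFactory.ads(db=b"abcdefghijklmnop", bd=0.250, hd=0.125,
                         sr=16, sw=1, ch=1)
    validator = AudioEnergyValidator(sample_width=ads.get_sample_width(),
                                     energy_threshold=45)

    ads.open()
    while True:
        frame = ads.read()
        if frame is None:
            break
        # is_valid() returns True when the frame's log energy >= the threshold
        print(len(frame), validator.is_valid(frame))
    ads.close()

Under this 0.1.5 interface, stream tokenization is done by pairing such a validator with auditok's `StreamTokenizer`, rather than with the 0.2.0 `split` API whose workers are removed above.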