"""
|
|
|
|
.. autosummary::
|
|
|
|
:toctree: generated/
|
|
|
|
|
|
|
|
load
|
|
|
|
split
|
|
|
|
AudioRegion
|
|
|
|
StreamTokenizer
|
|
|
|
"""
|
|
|
|
import os
|
|
|
|
import math
|
|
|
|
from .util import AudioReader, DataValidator, AudioEnergyValidator
|
|
|
|
from .io import check_audio_data, to_file, player_for, get_audio_source
|
|
|
|
from .exceptions import TooSamllBlockDuration
|
|
|
|
|
|
|
|
try:
|
|
|
|
from . import signal_numpy as signal
|
|
|
|
except ImportError:
|
|
|
|
from . import signal
|
|
|
|
|
|
|
|
__all__ = ["load", "split", "AudioRegion", "StreamTokenizer"]
|
|
|
|
|
|
|
|
|
|
|
|
DEFAULT_ANALYSIS_WINDOW = 0.05
|
|
|
|
DEFAULT_ENERGY_THRESHOLD = 50
|
|
|
|
_EPSILON = 1e-10
|
|
|
|
|
|
|
|
|
|
|
|
def load(input, skip=0, max_read=None, **kwargs):
|
|
|
|
"""Load audio data from a source and return it as an :class:`AudioRegion`.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
input : None, str, bytes, AudioSource
|
|
|
|
source to read audio data from. If `str`, it should be a path to a
|
|
|
|
valid audio file. If `bytes`, it is used as raw audio data. If it is
|
|
|
|
"-", raw data will be read from stdin. If None, read audio data from
|
|
|
|
the microphone using PyAudio. If of type `bytes` or is a path to a
|
|
|
|
raw audio file then `sampling_rate`, `sample_width` and `channels`
|
|
|
|
parameters (or their alias) are required. If it's an
|
|
|
|
:class:`AudioSource` object it's used directly to read data.
|
|
|
|
skip : float, default: 0
|
|
|
|
amount, in seconds, of audio data to skip from source. If read from
|
|
|
|
a microphone, `skip` must be 0, otherwise a `ValueError` is raised.
|
|
|
|
max_read : float, default: None
|
|
|
|
amount, in seconds, of audio data to read from source. If read from
|
|
|
|
microphone, `max_read` should not be None, otherwise a `ValueError` is
|
|
|
|
raised.
|
|
|
|
audio_format, fmt : str
|
|
|
|
type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only
|
|
|
|
be used if `input` is a string path to an audio file. If not given,
|
|
|
|
audio type will be guessed from file name extension or from file
|
|
|
|
header.
|
|
|
|
sampling_rate, sr : int
|
|
|
|
sampling rate of audio data. Required if `input` is a raw audio file,
|
|
|
|
a `bytes` object or None (i.e., read from microphone).
|
|
|
|
sample_width, sw : int
|
|
|
|
number of bytes used to encode one audio sample, typically 1, 2 or 4.
|
|
|
|
Required for raw data, see `sampling_rate`.
|
|
|
|
channels, ch : int
|
|
|
|
number of channels of audio data. Required for raw data, see
|
|
|
|
`sampling_rate`.
|
|
|
|
large_file : bool, default: False
|
|
|
|
        If True, AND if `input` is a path to a *wav* or a *raw* audio file
|
|
|
|
(and **only** these two formats) then audio file is not fully loaded to
|
|
|
|
memory in order to create the region (but the portion of data needed to
|
|
|
|
create the region is of course loaded to memory). Set to True if
|
|
|
|
        `max_read` is significantly smaller than the size of a large audio file
|
|
|
|
that shouldn't be entirely loaded to memory.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
region: AudioRegion
|
|
|
|
|
|
|
|
Raises
|
|
|
|
------
|
|
|
|
ValueError
|
|
|
|
raised if `input` is None (i.e., read data from microphone) and `skip`
|
|
|
|
        != 0, or if `input` is None and `max_read` is None (meaning that when reading
|
|
|
|
from the microphone, no data should be skipped, and maximum amount of
|
|
|
|
data to read should be explicitly provided).
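
    Examples
    --------
    A minimal usage sketch (assumes an existing file ``audio.wav``; the raw
    bytes call below uses made-up audio parameters for illustration)::

        import auditok

        region = auditok.load("audio.wav")  # load a whole audio file
        # read at most 5 seconds, skipping the first second
        region = auditok.load("audio.wav", skip=1, max_read=5)
        # raw data requires explicit audio parameters
        region = auditok.load(b"\\0" * 32000, sr=16000, sw=2, ch=1)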
|
|
|
|
"""
|
|
|
|
return AudioRegion.load(input, skip, max_read, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
def split(
|
|
|
|
input,
|
|
|
|
min_dur=0.2,
|
|
|
|
max_dur=5,
|
|
|
|
max_silence=0.3,
|
|
|
|
drop_trailing_silence=False,
|
|
|
|
strict_min_dur=False,
|
|
|
|
**kwargs
|
|
|
|
):
|
|
|
|
"""
|
|
|
|
Split audio data and return a generator of AudioRegions
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
input : str, bytes, AudioSource, AudioReader, AudioRegion or None
|
|
|
|
input audio data. If str, it should be a path to an existing audio file.
|
|
|
|
"-" is interpreted as standard input. If bytes, input is considered as
|
|
|
|
raw audio data. If None, read audio from microphone.
|
|
|
|
Every object that is not an `AudioReader` will be transformed into an
|
|
|
|
`AudioReader` before processing. If it is an `str` that refers to a raw
|
|
|
|
audio file, `bytes` or None, audio parameters should be provided using
|
|
|
|
        kwargs (i.e., `sampling_rate`, `sample_width` and `channels` or their
|
|
|
|
alias).
|
|
|
|
If `input` is str then audio format will be guessed from file extension.
|
|
|
|
`audio_format` (alias `fmt`) kwarg can also be given to specify audio
|
|
|
|
format explicitly. If none of these options is available, rely on
|
|
|
|
backend (currently only pydub is supported) to load data.
|
|
|
|
min_dur : float, default: 0.2
|
|
|
|
        minimum duration in seconds of a detected audio event. By using large
|
|
|
|
values for `min_dur`, very short audio events (e.g., very short 1-word
|
|
|
|
        utterances like 'yes' or 'no') may be missed. Using very short
|
|
|
|
        values might result in a high number of short, useless audio events.
|
|
|
|
max_dur : float, default: 5
|
|
|
|
maximum duration in seconds of a detected audio event. If an audio event
|
|
|
|
lasts more than `max_dur` it will be truncated. If the continuation of a
|
|
|
|
truncated audio event is shorter than `min_dur` then this continuation
|
|
|
|
is accepted as a valid audio event if `strict_min_dur` is False.
|
|
|
|
Otherwise it is rejected.
|
|
|
|
max_silence : float, default: 0.3
|
|
|
|
maximum duration of continuous silence within an audio event. There
|
|
|
|
might be many silent gaps of this duration within one audio event. If
|
|
|
|
        the continuous silence happens at the end of the event then it's kept as
|
|
|
|
part of the event if `drop_trailing_silence` is False (default).
|
|
|
|
drop_trailing_silence : bool, default: False
|
|
|
|
Whether to remove trailing silence from detected events. To avoid abrupt
|
|
|
|
cuts in speech, trailing silence should be kept, therefore this
|
|
|
|
parameter should be False.
|
|
|
|
strict_min_dur : bool, default: False
|
|
|
|
strict minimum duration. Do not accept an audio event if it is shorter
|
|
|
|
than `min_dur` even if it is contiguous to the latest valid event. This
|
|
|
|
        happens if the latest detected event had reached `max_dur`.
|
|
|
|
|
|
|
|
Other Parameters
|
|
|
|
----------------
|
|
|
|
analysis_window, aw : float, default: 0.05 (50 ms)
|
|
|
|
duration of analysis window in seconds. A value between 0.01 (10 ms) and
|
|
|
|
0.1 (100 ms) should be good for most use-cases.
|
|
|
|
audio_format, fmt : str
|
|
|
|
type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be
|
|
|
|
used if `input` is a string path to an audio file. If not given, audio
|
|
|
|
type will be guessed from file name extension or from file header.
|
|
|
|
sampling_rate, sr : int
|
|
|
|
sampling rate of audio data. Required if `input` is a raw audio file, is
|
|
|
|
a bytes object or None (i.e., read from microphone).
|
|
|
|
sample_width, sw : int
|
|
|
|
number of bytes used to encode one audio sample, typically 1, 2 or 4.
|
|
|
|
Required for raw data, see `sampling_rate`.
|
|
|
|
channels, ch : int
|
|
|
|
number of channels of audio data. Required for raw data, see
|
|
|
|
`sampling_rate`.
|
|
|
|
use_channel, uc : {None, "mix"} or int
|
|
|
|
which channel to use for split if `input` has multiple audio channels.
|
|
|
|
Regardless of which channel is used for splitting, returned audio events
|
|
|
|
contain data from *all* channels, just as `input`.
|
|
|
|
The following values are accepted:
|
|
|
|
|
|
|
|
- None (alias "any"): accept audio activity from any channel, even if
|
|
|
|
other channels are silent. This is the default behavior.
|
|
|
|
|
|
|
|
- "mix" ("avg" or "average"): mix down all channels (i.e. compute
|
|
|
|
average channel) and split the resulting channel.
|
|
|
|
|
|
|
|
        - int (>= 0, < `channels`): use one channel, specified by integer id,
|
|
|
|
for split.
|
|
|
|
|
|
|
|
large_file : bool, default: False
|
|
|
|
        If True, AND if `input` is a path to a *wav* or a *raw* audio file
|
|
|
|
(and only these two formats) then audio data is lazily loaded to memory
|
|
|
|
        (i.e., one analysis window at a time). Otherwise the whole file is loaded
|
|
|
|
to memory before split. Set to True if the size of the file is larger
|
|
|
|
than available memory.
|
|
|
|
max_read, mr : float, default: None, read until end of stream
|
|
|
|
maximum data to read from source in seconds.
|
|
|
|
validator, val : callable, DataValidator
|
|
|
|
        custom data validator. If `None` (default), an `AudioEnergyValidator` is
|
|
|
|
used with the given energy threshold. Can be a callable or an instance
|
|
|
|
of `DataValidator` that implements `is_valid`. In either case, it'll be
|
|
|
|
        called with a window of audio data as the first parameter.
|
|
|
|
energy_threshold, eth : float, default: 50
|
|
|
|
energy threshold for audio activity detection. Audio regions that have
|
|
|
|
        enough windows with a signal energy equal to or above this threshold
|
|
|
|
are considered valid audio events. Here we are referring to this amount
|
|
|
|
as the energy of the signal but to be more accurate, it is the log
|
|
|
|
        energy, computed as `20 * log10(sqrt(dot(x, x) / len(x)))` (see
|
|
|
|
:class:`AudioEnergyValidator` and
|
|
|
|
:func:`calculate_energy_single_channel`). If `validator` is given, this
|
|
|
|
argument is ignored.
|
|
|
|
|
|
|
|
Yields
|
|
|
|
------
|
|
|
|
AudioRegion
|
|
|
|
a generator of detected :class:`AudioRegion` s.
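
    Examples
    --------
    A minimal sketch (assumes an existing file ``audio.wav``)::

        import auditok

        for region in auditok.split("audio.wav", min_dur=0.5, max_dur=10):
            print("event at [{:.3f}, {:.3f}]".format(region.meta.start,
                                                     region.meta.end))

        # split using the average of all channels for multichannel input
        regions = auditok.split("audio.wav", uc="mix")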
|
|
|
|
"""
|
|
|
|
if min_dur <= 0:
|
|
|
|
raise ValueError("'min_dur' ({}) must be > 0".format(min_dur))
|
|
|
|
if max_dur <= 0:
|
|
|
|
raise ValueError("'max_dur' ({}) must be > 0".format(max_dur))
|
|
|
|
if max_silence < 0:
|
|
|
|
raise ValueError("'max_silence' ({}) must be >= 0".format(max_silence))
|
|
|
|
|
|
|
|
if isinstance(input, AudioReader):
|
|
|
|
source = input
|
|
|
|
analysis_window = source.block_dur
|
|
|
|
else:
|
|
|
|
analysis_window = kwargs.get(
|
|
|
|
"analysis_window", kwargs.get("aw", DEFAULT_ANALYSIS_WINDOW)
|
|
|
|
)
|
|
|
|
if analysis_window <= 0:
|
|
|
|
raise ValueError(
|
|
|
|
"'analysis_window' ({}) must be > 0".format(analysis_window)
|
|
|
|
)
|
|
|
|
|
|
|
|
params = kwargs.copy()
|
|
|
|
params["max_read"] = params.get("max_read", params.get("mr"))
|
|
|
|
params["audio_format"] = params.get("audio_format", params.get("fmt"))
|
|
|
|
if isinstance(input, AudioRegion):
|
|
|
|
params["sampling_rate"] = input.sr
|
|
|
|
params["sample_width"] = input.sw
|
|
|
|
params["channels"] = input.ch
|
|
|
|
input = bytes(input)
|
|
|
|
try:
|
|
|
|
source = AudioReader(input, block_dur=analysis_window, **params)
|
|
|
|
except TooSamllBlockDuration as exc:
|
|
|
|
err_msg = "Too small 'analysis_windows' ({0}) for sampling rate "
|
|
|
|
err_msg += "({1}). Analysis windows should at least be 1/{1} to "
|
|
|
|
err_msg += "cover one single data sample"
|
|
|
|
raise ValueError(err_msg.format(exc.block_dur, exc.sampling_rate))
|
|
|
|
|
|
|
|
validator = kwargs.get("validator", kwargs.get("val"))
|
|
|
|
if validator is None:
|
|
|
|
energy_threshold = kwargs.get(
|
|
|
|
"energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
|
|
|
|
)
|
|
|
|
use_channel = kwargs.get("use_channel", kwargs.get("uc"))
|
|
|
|
validator = AudioEnergyValidator(
|
|
|
|
energy_threshold, source.sw, source.ch, use_channel=use_channel
|
|
|
|
)
|
|
|
|
mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0
|
|
|
|
if strict_min_dur:
|
|
|
|
mode |= StreamTokenizer.STRICT_MIN_LENGTH
|
|
|
|
min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil)
|
|
|
|
max_length = _duration_to_nb_windows(
|
|
|
|
max_dur, analysis_window, math.floor, _EPSILON
|
|
|
|
)
|
|
|
|
max_continuous_silence = _duration_to_nb_windows(
|
|
|
|
max_silence, analysis_window, math.floor, _EPSILON
|
|
|
|
)
|
|
|
|
|
|
|
|
err_msg = "({0} sec.) results in {1} analysis window(s) "
|
|
|
|
err_msg += "({1} == {6}({0} / {2})) which is {5} the number "
|
|
|
|
err_msg += "of analysis window(s) for 'max_dur' ({3} == floor({4} / {2}))"
|
|
|
|
if min_length > max_length:
|
|
|
|
err_msg = "'min_dur' " + err_msg
|
|
|
|
raise ValueError(
|
|
|
|
err_msg.format(
|
|
|
|
min_dur,
|
|
|
|
min_length,
|
|
|
|
analysis_window,
|
|
|
|
max_length,
|
|
|
|
max_dur,
|
|
|
|
"higher than",
|
|
|
|
"ceil",
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
if max_continuous_silence >= max_length:
|
|
|
|
err_msg = "'max_silence' " + err_msg
|
|
|
|
raise ValueError(
|
|
|
|
err_msg.format(
|
|
|
|
max_silence,
|
|
|
|
max_continuous_silence,
|
|
|
|
analysis_window,
|
|
|
|
max_length,
|
|
|
|
max_dur,
|
|
|
|
"higher or equal to",
|
|
|
|
"floor",
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
tokenizer = StreamTokenizer(
|
|
|
|
validator, min_length, max_length, max_continuous_silence, mode=mode
|
|
|
|
)
|
|
|
|
source.open()
|
|
|
|
token_gen = tokenizer.tokenize(source, generator=True)
|
|
|
|
region_gen = (
|
|
|
|
_make_audio_region(
|
|
|
|
token[0],
|
|
|
|
token[1],
|
|
|
|
source.block_dur,
|
|
|
|
source.sr,
|
|
|
|
source.sw,
|
|
|
|
source.ch,
|
|
|
|
)
|
|
|
|
for token in token_gen
|
|
|
|
)
|
|
|
|
return region_gen
|
|
|
|
|
|
|
|
|
|
|
|
def _duration_to_nb_windows(
|
|
|
|
duration, analysis_window, round_fn=round, epsilon=0
|
|
|
|
):
|
|
|
|
"""
|
|
|
|
    Converts a given duration into a positive integer of analysis windows,
    using `round_fn` to round the ratio `duration / analysis_window`.
    If `duration == 0`, returns `0`.
|
|
|
|
`duration` and `analysis_window` can be in seconds or milliseconds but
|
|
|
|
must be in the same unit.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
duration : float
|
|
|
|
a given duration in seconds or ms.
|
|
|
|
analysis_window: float
|
|
|
|
size of analysis window, in the same unit as `duration`.
|
|
|
|
round_fn : callable
|
|
|
|
function called to round the result. Default: `round`.
|
|
|
|
epsilon : float
|
|
|
|
small value to add to the division result before rounding.
|
|
|
|
E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with
|
|
|
|
`round_fn=math.floor` returns `2` instead of `3`. Adding a small value
|
|
|
|
to `0.3 / 0.1` avoids this error.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
nb_windows : int
|
|
|
|
        minimum number of `analysis_window`'s to cover `duration`. That means
|
|
|
|
that `analysis_window * nb_windows >= duration`.
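
    Examples
    --------
    Illustrative values (based on the rounding behavior described above)::

        >>> import math
        >>> _duration_to_nb_windows(0.25, 0.1, math.ceil)
        3
        >>> _duration_to_nb_windows(0.3, 0.1, math.floor, epsilon=1e-10)
        3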
|
|
|
|
"""
|
|
|
|
if duration < 0 or analysis_window <= 0:
|
|
|
|
err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0"
|
|
|
|
raise ValueError(err_msg.format(duration, analysis_window))
|
|
|
|
if duration == 0:
|
|
|
|
return 0
|
|
|
|
return int(round_fn(duration / analysis_window + epsilon))
|
|
|
|
|
|
|
|
|
|
|
|
def _make_audio_region(
|
|
|
|
data_frames,
|
|
|
|
start_frame,
|
|
|
|
frame_duration,
|
|
|
|
sampling_rate,
|
|
|
|
sample_width,
|
|
|
|
channels,
|
|
|
|
):
|
|
|
|
"""
|
|
|
|
Helper function to create an `AudioRegion` from parameters returned by
|
|
|
|
tokenization object. It takes care of setting up region `start` and `end`
|
|
|
|
in metadata.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
    data_frames : list of bytes
        the analysis windows (data frames) that make up the region
    start_frame : int
        index of the first analysis window
    frame_duration : float
        duration of one analysis window in seconds
    sampling_rate : int
        sampling rate of audio data
|
|
|
|
sample_width : int
|
|
|
|
number of bytes of one audio sample
|
|
|
|
channels : int
|
|
|
|
number of channels of audio data
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
audio_region : AudioRegion
|
|
|
|
        AudioRegion whose start time (in seconds) is calculated as:
        `start_frame * frame_duration`
|
|
|
|
"""
|
|
|
|
start = start_frame * frame_duration
|
|
|
|
data = b"".join(data_frames)
|
|
|
|
duration = len(data) / (sampling_rate * sample_width * channels)
|
|
|
|
meta = {"start": start, "end": start + duration}
|
|
|
|
return AudioRegion(data, sampling_rate, sample_width, channels, meta)
|
|
|
|
|
|
|
|
|
|
|
|
def _read_chunks_online(max_read, **kwargs):
|
|
|
|
"""
|
|
|
|
Helper function to read audio data from an online blocking source
|
|
|
|
(i.e., microphone). Used to build an `AudioRegion` and can intercept
|
|
|
|
KeyboardInterrupt so that reading stops as soon as this exception is
|
|
|
|
    raised. Makes building `AudioRegion`s in [i]python sessions and Jupyter
    notebooks more user-friendly.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
max_read : float
|
|
|
|
maximum amount of data to read in seconds.
|
|
|
|
kwargs :
|
|
|
|
audio parameters (sampling_rate, sample_width and channels).
|
|
|
|
|
|
|
|
See also
|
|
|
|
--------
|
|
|
|
`AudioRegion.build`
|
|
|
|
"""
|
|
|
|
reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs)
|
|
|
|
reader.open()
|
|
|
|
data = []
|
|
|
|
try:
|
|
|
|
while True:
|
|
|
|
frame = reader.read()
|
|
|
|
if frame is None:
|
|
|
|
break
|
|
|
|
data.append(frame)
|
|
|
|
except KeyboardInterrupt:
|
|
|
|
# Stop data acquisition from microphone when pressing
|
|
|
|
# Ctrl+C on a [i]python session or a notebook
|
|
|
|
pass
|
|
|
|
reader.close()
|
|
|
|
return (
|
|
|
|
b"".join(data),
|
|
|
|
reader.sampling_rate,
|
|
|
|
reader.sample_width,
|
|
|
|
reader.channels,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def _read_offline(input, skip=0, max_read=None, **kwargs):
|
|
|
|
"""
|
|
|
|
    Helper function to read audio data from an offline source (i.e., a file
    or a bytes object). Used to build `AudioRegion`s.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
input : str, bytes
|
|
|
|
path to audio file (if str), or a bytes object representing raw audio
|
|
|
|
data.
|
|
|
|
skip : float, default 0
|
|
|
|
        amount of data to skip from the beginning of audio source.
|
|
|
|
max_read : float, default: None
|
|
|
|
maximum amount of audio data to read. Default: None, means read until
|
|
|
|
end of stream.
|
|
|
|
kwargs :
|
|
|
|
audio parameters (sampling_rate, sample_width and channels).
|
|
|
|
|
|
|
|
See also
|
|
|
|
--------
|
|
|
|
`AudioRegion.build`
|
|
|
|
|
|
|
|
"""
|
|
|
|
audio_source = get_audio_source(input, **kwargs)
|
|
|
|
audio_source.open()
|
|
|
|
if skip is not None and skip > 0:
|
|
|
|
skip_samples = round(skip * audio_source.sampling_rate)
|
|
|
|
audio_source.read(skip_samples)
|
|
|
|
if max_read is not None:
|
|
|
|
if max_read < 0:
|
|
|
|
max_read = None
|
|
|
|
else:
|
|
|
|
max_read = round(max_read * audio_source.sampling_rate)
|
|
|
|
data = audio_source.read(max_read)
|
|
|
|
audio_source.close()
|
|
|
|
return (
|
|
|
|
data,
|
|
|
|
audio_source.sampling_rate,
|
|
|
|
audio_source.sample_width,
|
|
|
|
audio_source.channels,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def _check_convert_index(index, types, err_msg):
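    """Validate a slice `index` and return its (start, stop) pair.

    Raise `TypeError` with `err_msg` if `index` is not a step-less slice,
    or if its bounds are not instances of `types`. A missing start defaults
    to 0; a missing stop is returned as None.
    """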
|
|
|
|
if not isinstance(index, slice) or index.step is not None:
|
|
|
|
raise TypeError(err_msg)
|
|
|
|
start = index.start if index.start is not None else 0
|
|
|
|
stop = index.stop
|
|
|
|
for index in (start, stop):
|
|
|
|
if index is not None and not isinstance(index, types):
|
|
|
|
raise TypeError(err_msg)
|
|
|
|
return start, stop
|
|
|
|
|
|
|
|
|
|
|
|
class _SecondsView:
|
|
|
|
"""A class to create a view of `AudioRegion` that can be sliced using
|
|
|
|
indices in seconds.
|
|
|
|
"""
|
|
|
|
|
|
|
|
def __init__(self, region):
|
|
|
|
self._region = region
|
|
|
|
|
|
|
|
def __getitem__(self, index):
|
|
|
|
err_msg = "Slicing AudioRegion by seconds requires indices of type "
|
|
|
|
err_msg += "'int' or 'float' without a step (e.g. region.sec[7.5:10])"
|
|
|
|
start_s, stop_s = _check_convert_index(index, (int, float), err_msg)
|
|
|
|
sr = self._region.sampling_rate
|
|
|
|
start_sample = int(start_s * sr)
|
|
|
|
stop_sample = None if stop_s is None else round(stop_s * sr)
|
|
|
|
return self._region[start_sample:stop_sample]
|
|
|
|
|
|
|
|
@property
|
|
|
|
def len(self):
|
|
|
|
"""
|
|
|
|
Return region duration in seconds.
|
|
|
|
"""
|
|
|
|
return self._region.duration
|
|
|
|
|
|
|
|
|
|
|
|
class _MillisView(_SecondsView):
|
|
|
|
"""A class to create a view of `AudioRegion` that can be sliced using
|
|
|
|
indices in milliseconds.
|
|
|
|
"""
|
|
|
|
|
|
|
|
def __getitem__(self, index):
|
|
|
|
err_msg = (
|
|
|
|
"Slicing AudioRegion by milliseconds requires indices of type "
|
|
|
|
)
|
|
|
|
err_msg += "'int' without a step (e.g. region.sec[500:1500])"
|
|
|
|
start_ms, stop_ms = _check_convert_index(index, (int), err_msg)
|
|
|
|
start_sec = start_ms / 1000
|
|
|
|
stop_sec = None if stop_ms is None else stop_ms / 1000
|
|
|
|
index = slice(start_sec, stop_sec)
|
|
|
|
return super(_MillisView, self).__getitem__(index)
|
|
|
|
|
|
|
|
def __len__(self):
|
|
|
|
"""
|
|
|
|
Return region duration in milliseconds.
|
|
|
|
"""
|
|
|
|
return round(self._region.duration * 1000)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def len(self):
|
|
|
|
"""
|
|
|
|
Return region duration in milliseconds.
|
|
|
|
"""
|
|
|
|
return len(self)
|
|
|
|
|
|
|
|
|
|
|
|
class _AudioRegionMetadata(dict):
|
|
|
|
"""A class to store `AudioRegion`'s metadata."""
|
|
|
|
|
|
|
|
def __getattr__(self, name):
|
|
|
|
if name in self:
|
|
|
|
return self[name]
|
|
|
|
else:
|
|
|
|
err_msg = "AudioRegion metadata has no entry '{}'"
|
|
|
|
raise AttributeError(err_msg.format(name))
|
|
|
|
|
|
|
|
def __setattr__(self, name, value):
|
|
|
|
self[name] = value
|
|
|
|
|
|
|
|
def __str__(self):
|
|
|
|
return "\n".join("{}: {}".format(k, v) for k, v in self.items())
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
return str(self)
|
|
|
|
|
|
|
|
|
|
|
|
class AudioRegion(object):
|
|
|
|
"""
|
|
|
|
AudioRegion encapsulates raw audio data and provides an interface to
|
|
|
|
perform simple operations on it. Use `AudioRegion.load` to build an
|
|
|
|
`AudioRegion` from different types of objects.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
data : bytes
|
|
|
|
raw audio data as a bytes object
|
|
|
|
sampling_rate : int
|
|
|
|
sampling rate of audio data
|
|
|
|
sample_width : int
|
|
|
|
number of bytes of one audio sample
|
|
|
|
channels : int
|
|
|
|
number of channels of audio data
|
|
|
|
meta : dict, default: None
|
|
|
|
any collection of <key:value> elements used to build metadata for
|
|
|
|
this `AudioRegion`. Meta data can be accessed via `region.meta.key`
|
|
|
|
if `key` is a valid python attribute name, or via `region.meta[key]`
|
|
|
|
if not. Note that the :func:`split` function (or the
|
|
|
|
        :meth:`AudioRegion.split` method) returns `AudioRegions` with ``start``
        and ``end`` meta values that indicate the location in seconds of the
|
|
|
|
region in original audio data.
|
|
|
|
|
|
|
|
See also
|
|
|
|
--------
|
|
|
|
AudioRegion.load
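
    Examples
    --------
    A sketch with made-up audio parameters (1.5 seconds of silence at
    16 kHz, 16 bit, mono)::

        >>> region = AudioRegion(b'\\0' * 2 * 24000, 16000, 2, 1)
        >>> region.duration
        1.5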
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
def __init__(self, data, sampling_rate, sample_width, channels, meta=None):
|
|
|
|
check_audio_data(data, sample_width, channels)
|
|
|
|
self._data = data
|
|
|
|
self._sampling_rate = sampling_rate
|
|
|
|
self._sample_width = sample_width
|
|
|
|
self._channels = channels
|
|
|
|
self._samples = None
|
|
|
|
self.splitp = self.split_and_plot
|
|
|
|
|
|
|
|
if meta is not None:
|
|
|
|
self._meta = _AudioRegionMetadata(meta)
|
|
|
|
else:
|
|
|
|
self._meta = None
|
|
|
|
|
|
|
|
self._seconds_view = _SecondsView(self)
|
|
|
|
self.sec = self.seconds
|
|
|
|
self.s = self.seconds
|
|
|
|
|
|
|
|
self._millis_view = _MillisView(self)
|
|
|
|
self.ms = self.millis
|
|
|
|
|
|
|
|
@property
|
|
|
|
    def meta(self):
        """Meta data of audio region."""
        return self._meta
|
|
|
|
|
|
|
|
@meta.setter
|
|
|
|
def meta(self, new_meta):
|
|
|
|
"""Meta data of audio region."""
|
|
|
|
self._meta = _AudioRegionMetadata(new_meta)
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def load(cls, input, skip=0, max_read=None, **kwargs):
|
|
|
|
"""
|
|
|
|
Create an `AudioRegion` by loading data from `input`. See :func:`load`
|
|
|
|
        for parameters description.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
region: AudioRegion
|
|
|
|
|
|
|
|
Raises
|
|
|
|
------
|
|
|
|
ValueError
|
|
|
|
            raised if `input` is None and `skip` != 0, or if `max_read` is None.
|
|
|
|
"""
|
|
|
|
if input is None:
|
|
|
|
if skip > 0:
|
|
|
|
raise ValueError(
|
|
|
|
"'skip' should be 0 when reading from microphone"
|
|
|
|
)
|
|
|
|
if max_read is None or max_read < 0:
|
|
|
|
raise ValueError(
|
|
|
|
"'max_read' should not be None when reading from "
|
|
|
|
"microphone"
|
|
|
|
)
|
|
|
|
data, sampling_rate, sample_width, channels = _read_chunks_online(
|
|
|
|
max_read, **kwargs
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
data, sampling_rate, sample_width, channels = _read_offline(
|
|
|
|
input, skip=skip, max_read=max_read, **kwargs
|
|
|
|
)
|
|
|
|
|
|
|
|
return cls(data, sampling_rate, sample_width, channels)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def seconds(self):
|
|
|
|
"""
|
|
|
|
A view to slice audio region by seconds (using ``region.seconds[start:end]``).
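
        For example, ``region.seconds[1:2.5]`` returns the part of the region
        between 1 s and 2.5 s as a new ``AudioRegion``.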
|
|
|
|
"""
|
|
|
|
return self._seconds_view
|
|
|
|
|
|
|
|
@property
|
|
|
|
def millis(self):
|
|
|
|
"""A view to slice audio region by milliseconds (using ``region.millis[start:end]``)."""
|
|
|
|
return self._millis_view
|
|
|
|
|
|
|
|
@property
|
|
|
|
def duration(self):
|
|
|
|
"""
|
|
|
|
Returns region duration in seconds.
|
|
|
|
"""
|
|
|
|
return len(self._data) / (
|
|
|
|
self.sampling_rate * self.sample_width * self.channels
|
|
|
|
)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def sampling_rate(self):
|
|
|
|
"""Samling rate of audio data."""
|
|
|
|
return self._sampling_rate
|
|
|
|
|
|
|
|
@property
|
|
|
|
def sr(self):
|
|
|
|
"""Samling rate of audio data, alias for `sampling_rate`."""
|
|
|
|
return self._sampling_rate
|
|
|
|
|
|
|
|
@property
|
|
|
|
def sample_width(self):
|
|
|
|
"""Number of bytes per sample, one channel considered."""
|
|
|
|
return self._sample_width
|
|
|
|
|
|
|
|
@property
|
|
|
|
def sw(self):
|
|
|
|
"""Number of bytes per sample, alias for `sampling_rate`."""
|
|
|
|
return self._sample_width
|
|
|
|
|
|
|
|
@property
|
|
|
|
def channels(self):
|
|
|
|
"""Number of channels of audio data."""
|
|
|
|
return self._channels
|
|
|
|
|
|
|
|
@property
|
|
|
|
def ch(self):
|
|
|
|
"""Number of channels of audio data, alias for `channels`."""
|
|
|
|
return self._channels
|
|
|
|
|
|
|
|
def play(self, progress_bar=False, player=None, **progress_bar_kwargs):
|
|
|
|
"""
|
|
|
|
Play audio region.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
progress_bar : bool, default: False
|
|
|
|
whether to use a progress bar while playing audio. Default: False.
|
|
|
|
            `progress_bar` requires `tqdm`; if it is not installed, no progress bar
|
|
|
|
will be shown.
|
|
|
|
        player : AudioPlayer, default: None
|
|
|
|
audio player to use. if None (default), use `player_for()`
|
|
|
|
to get a new audio player.
|
|
|
|
progress_bar_kwargs : kwargs
|
|
|
|
keyword arguments to pass to `tqdm` progress_bar builder (e.g.,
|
|
|
|
use `leave=False` to clean up the screen when play finishes).
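
        Examples
        --------
        A sketch (assumes `region` is an existing `AudioRegion` and a
        working audio output device is available)::

            >>> region.play(progress_bar=True, leave=False)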
|
|
|
|
"""
|
|
|
|
if player is None:
|
|
|
|
player = player_for(self)
|
|
|
|
player.play(
|
|
|
|
self._data, progress_bar=progress_bar, **progress_bar_kwargs
|
|
|
|
)
|
|
|
|
|
|
|
|
def save(self, file, audio_format=None, exists_ok=True, **audio_parameters):
|
|
|
|
"""
|
|
|
|
Save audio region to file.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
file : str
|
|
|
|
path to output audio file. May contain `{duration}` placeholder
|
|
|
|
            as well as any placeholder that this region's metadata might
|
|
|
|
contain (e.g., regions returned by `split` contain metadata with
|
|
|
|
`start` and `end` attributes that can be used to build output file
|
|
|
|
            name as `{meta.start}` and `{meta.end}`). See examples using
|
|
|
|
placeholders with formatting.
|
|
|
|
|
|
|
|
audio_format : str, default: None
|
|
|
|
format used to save audio data. If None (default), format is guessed
|
|
|
|
from file name's extension. If file name has no extension, audio
|
|
|
|
data is saved as a raw (headerless) audio file.
|
|
|
|
exists_ok : bool, default: True
|
|
|
|
If True, overwrite `file` if a file with the same name exists.
|
|
|
|
If False, raise an `IOError` if `file` exists.
|
|
|
|
audio_parameters: dict
|
|
|
|
any keyword arguments to be passed to audio saving backend.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
file: str
|
|
|
|
            name of output file with placeholders replaced.

        Raises
        ------
        IOError
            if `file` exists and `exists_ok` is False.
|
|
|
|
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
|
|
|
        >>> region = AudioRegion(b'\\0' * 2 * 24000,
        ...                      sampling_rate=16000,
        ...                      sample_width=2,
        ...                      channels=1)
        >>> region.meta = {"start": 2.25}
        >>> region.meta.end = 2.25 + region.duration
        >>> region.save('audio_{meta.start}-{meta.end}.wav')
        'audio_2.25-3.75.wav'
        >>> region.save('region_{meta.start:.3f}_{duration:.3f}.wav')
        'region_2.250_1.500.wav'
|
|
|
|
"""
|
|
|
|
if isinstance(file, str):
|
|
|
|
file = file.format(duration=self.duration, meta=self.meta)
|
|
|
|
if not exists_ok and os.path.exists(file):
|
|
|
|
raise FileExistsError("file '{file}' exists".format(file=file))
|
|
|
|
to_file(
|
|
|
|
self._data,
|
|
|
|
file,
|
|
|
|
audio_format,
|
|
|
|
sr=self.sr,
|
|
|
|
sw=self.sw,
|
|
|
|
ch=self.ch,
|
|
|
|
audio_parameters=audio_parameters,
|
|
|
|
)
|
|
|
|
return file
|
|
|
|
|
|
|
|
def split(
|
|
|
|
self,
|
|
|
|
min_dur=0.2,
|
|
|
|
max_dur=5,
|
|
|
|
max_silence=0.3,
|
|
|
|
drop_trailing_silence=False,
|
|
|
|
strict_min_dur=False,
|
|
|
|
**kwargs
|
|
|
|
):
|
|
|
|
"""Split audio region. See :func:`auditok.split()` for a comprehensive
|
|
|
|
description of split parameters.
|
|
|
|
        See Also :meth:`AudioRegion.split_and_plot`.
|
|
|
|
"""
|
|
|
|
if kwargs.get("max_read", kwargs.get("mr")) is not None:
|
|
|
|
warn_msg = "'max_read' (or 'mr') should not be used with "
|
|
|
|
warn_msg += "AudioRegion.split_and_plot(). You should rather "
|
|
|
|
warn_msg += "slice audio region before calling this method"
|
|
|
|
raise RuntimeWarning(warn_msg)
|
|
|
|
return split(
|
|
|
|
self,
|
|
|
|
min_dur=min_dur,
|
|
|
|
max_dur=max_dur,
|
|
|
|
max_silence=max_silence,
|
|
|
|
drop_trailing_silence=drop_trailing_silence,
|
|
|
|
strict_min_dur=strict_min_dur,
|
|
|
|
**kwargs
|
|
|
|
)
|
|
|
|
|
|
|
|
def plot(
|
|
|
|
self,
|
|
|
|
scale_signal=True,
|
|
|
|
show=True,
|
|
|
|
figsize=None,
|
|
|
|
save_as=None,
|
|
|
|
dpi=120,
|
|
|
|
theme="auditok",
|
|
|
|
):
|
|
|
|
"""Plot audio region, one sub-plot for each channel.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
scale_signal : bool, default: True
|
|
|
|
if true, scale signal by subtracting its mean and dividing by its
|
|
|
|
standard deviation before plotting.
|
|
|
|
show : bool
|
|
|
|
whether to show plotted signal right after the call.
|
|
|
|
figsize : tuple, default: None
|
|
|
|
width and height of the figure to pass to `matplotlib`.
|
|
|
|
save_as : str, default None.
|
|
|
|
if provided, also save plot to file.
|
|
|
|
dpi : int, default: 120
|
|
|
|
plot dpi to pass to `matplotlib`.
|
|
|
|
theme : str or dict, default: "auditok"
|
|
|
|
            plot theme to use. Currently only the "auditok" theme is implemented.
            To provide your own theme, see
            :attr:`auditok.plotting.AUDITOK_PLOT_THEME`.
|
|
|
|
"""
|
|
|
|
try:
|
|
|
|
from auditok.plotting import plot
|
|
|
|
|
|
|
|
plot(
|
|
|
|
self,
|
|
|
|
scale_signal=scale_signal,
|
|
|
|
show=show,
|
|
|
|
figsize=figsize,
|
|
|
|
save_as=save_as,
|
|
|
|
dpi=dpi,
|
|
|
|
theme=theme,
|
|
|
|
)
|
|
|
|
except ImportError:
|
|
|
|
raise RuntimeWarning("Plotting requires matplotlib")
|
|
|
|
|
|
|
|
def split_and_plot(
|
|
|
|
self,
|
|
|
|
min_dur=0.2,
|
|
|
|
max_dur=5,
|
|
|
|
max_silence=0.3,
|
|
|
|
drop_trailing_silence=False,
|
|
|
|
strict_min_dur=False,
|
|
|
|
scale_signal=True,
|
|
|
|
show=True,
|
|
|
|
figsize=None,
|
|
|
|
save_as=None,
|
|
|
|
dpi=120,
|
|
|
|
theme="auditok",
|
|
|
|
**kwargs
|
|
|
|
):
|
|
|
|
"""Split region and plot signal and detections. Alias: :meth:`splitp`.
|
|
|
|
See :func:`auditok.split()` for a comprehensive description of split
|
|
|
|
parameters. Also see :meth:`plot` for plot parameters.
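
        Example (a sketch; requires matplotlib, and ``detections.png`` is
        just an illustrative output path)::

            >>> regions = region.splitp(max_dur=10, save_as="detections.png")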
|
|
|
|
"""
|
|
|
|
try:
|
|
|
|
from auditok.plotting import plot
|
|
|
|
|
|
|
|
regions = self.split(
|
|
|
|
min_dur=min_dur,
|
|
|
|
max_dur=max_dur,
|
|
|
|
max_silence=max_silence,
|
|
|
|
drop_trailing_silence=drop_trailing_silence,
|
|
|
|
strict_min_dur=strict_min_dur,
|
|
|
|
**kwargs
|
|
|
|
)
|
|
|
|
regions = list(regions)
|
|
|
|
detections = ((reg.meta.start, reg.meta.end) for reg in regions)
|
|
|
|
eth = kwargs.get(
|
|
|
|
"energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
|
|
|
|
)
|
|
|
|
plot(
|
|
|
|
self,
|
|
|
|
scale_signal=scale_signal,
|
|
|
|
detections=detections,
|
|
|
|
energy_threshold=eth,
|
|
|
|
show=show,
|
|
|
|
figsize=figsize,
|
|
|
|
save_as=save_as,
|
|
|
|
dpi=dpi,
|
|
|
|
theme=theme,
|
|
|
|
)
|
|
|
|
return regions
|
|
|
|
except ImportError:
|
|
|
|
raise RuntimeWarning("Plotting requires matplotlib")
|
|
|
|
|
|
|
|
def __array__(self):
|
|
|
|
return self.samples
|
|
|
|
|
|
|
|
@property
|
|
|
|
def samples(self):
|
|
|
|
"""Audio region as arrays of samples, one array per channel."""
|
|
|
|
if self._samples is None:
|
|
|
|
self._samples = signal.to_array(
|
|
|
|
self._data, self.sample_width, self.channels
|
|
|
|
)
|
|
|
|
return self._samples
|
|
|
|
|
|
|
|
def __len__(self):
|
|
|
|
"""
|
|
|
|
Return region length in number of samples.
|
|
|
|
"""
|
|
|
|
return len(self._data) // (self.sample_width * self.channels)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def len(self):
|
|
|
|
"""
|
|
|
|
Return region length in number of samples.
|
|
|
|
"""
|
|
|
|
return len(self)
|
|
|
|
|
|
|
|
def __bytes__(self):
|
|
|
|
return self._data
|
|
|
|
|
|
|
|
def __str__(self):
|
|
|
|
return (
|
|
|
|
"AudioRegion(duration={:.3f}, "
|
|
|
|
"sampling_rate={}, sample_width={}, channels={})".format(
|
|
|
|
self.duration, self.sr, self.sw, self.ch
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
def __repr__(self):
|
|
|
|
return str(self)
|
|
|
|
|
|
|
|
def __add__(self, other):
|
|
|
|
"""
|
|
|
|
Concatenates this region and `other` and return a new region.
|
|
|
|
Both regions must have the same sampling rate, sample width
|
|
|
|
and number of channels. If not, raises a `ValueError`.
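
        For example, ``region_1 + region_2`` returns a new, longer region;
        ``sum(list_of_regions)`` also works because `__radd__` accepts `0`.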
|
|
|
|
"""
|
|
|
|
if not isinstance(other, AudioRegion):
|
|
|
|
raise TypeError(
|
|
|
|
"Can only concatenate AudioRegion, "
|
|
|
|
'not "{}"'.format(type(other))
|
|
|
|
)
|
|
|
|
if other.sr != self.sr:
|
|
|
|
raise ValueError(
|
|
|
|
"Can only concatenate AudioRegions of the same "
|
|
|
|
"sampling rate ({} != {})".format(self.sr, other.sr)
|
|
|
|
)
|
|
|
|
if other.sw != self.sw:
|
|
|
|
raise ValueError(
|
|
|
|
"Can only concatenate AudioRegions of the same "
|
|
|
|
"sample width ({} != {})".format(self.sw, other.sw)
|
|
|
|
)
|
|
|
|
if other.ch != self.ch:
|
|
|
|
raise ValueError(
|
|
|
|
"Can only concatenate AudioRegions of the same "
|
|
|
|
"number of channels ({} != {})".format(self.ch, other.ch)
|
|
|
|
)
|
|
|
|
data = self._data + other._data
|
|
|
|
return AudioRegion(data, self.sr, self.sw, self.ch)
|
|
|
|
|
|
|
|
def __radd__(self, other):
|
|
|
|
"""
|
|
|
|
Concatenates `other` and this region. `other` should be an
|
|
|
|
`AudioRegion` with the same audio parameters as this region
|
|
|
|
but can exceptionally be `0` to make it possible to concatenate
|
|
|
|
many regions with `sum`.
|
|
|
|
"""
|
|
|
|
if other == 0:
|
|
|
|
return self
|
|
|
|
        return NotImplemented
|
|
|
|
|
|
|
|
def __mul__(self, n):
|
|
|
|
if not isinstance(n, int):
|
|
|
|
err_msg = "Can't multiply AudioRegion by a non-int of type '{}'"
|
|
|
|
raise TypeError(err_msg.format(type(n)))
|
|
|
|
data = self._data * n
|
|
|
|
return AudioRegion(data, self.sr, self.sw, self.ch)
|
|
|
|
|
|
|
|
def __rmul__(self, n):
|
|
|
|
return self * n
|
|
|
|
|
|
|
|
def __truediv__(self, n):
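        """Divide region into `n` sub-regions of (roughly) equal length,
        e.g. ``region / 4`` returns a list of 4 `AudioRegion`s. When
        `len(self)` is not a multiple of `n`, the leading sub-regions are
        one sample longer.
        """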
|
|
|
|
if not isinstance(n, int) or n <= 0:
|
|
|
|
raise TypeError("AudioRegion can only be divided by a positive int")
|
|
|
|
samples_per_sub_region, rest = divmod(len(self), n)
|
|
|
|
onset = 0
|
|
|
|
sub_regions = []
|
|
|
|
while onset < len(self):
|
|
|
|
offset = 0
|
|
|
|
if rest > 0:
|
|
|
|
offset = 1
|
|
|
|
rest -= 1
|
|
|
|
offset += onset + samples_per_sub_region
|
|
|
|
sub_regions.append(self[onset:offset])
|
|
|
|
onset = offset
|
|
|
|
return sub_regions
|
|
|
|
|
|
|
|
def __eq__(self, other):
|
|
|
|
if other is self:
|
|
|
|
return True
|
|
|
|
if not isinstance(other, AudioRegion):
|
|
|
|
return False
|
|
|
|
return (
|
|
|
|
(self._data == other._data)
|
|
|
|
and (self.sr == other.sr)
|
|
|
|
and (self.sw == other.sw)
|
|
|
|
and (self.ch == other.ch)
|
|
|
|
)
|
|
|
|
|
|
|
|
def __getitem__(self, index):
|
|
|
|
err_msg = "Slicing AudioRegion by samples requires indices of type "
|
|
|
|
err_msg += "'int' without a step (e.g. region.sec[1600:3200])"
|
|
|
|
start_sample, stop_sample = _check_convert_index(index, (int), err_msg)
|
|
|
|
|
|
|
|
bytes_per_sample = self.sample_width * self.channels
|
|
|
|
len_samples = len(self._data) // bytes_per_sample
|
|
|
|
|
|
|
|
if start_sample < 0:
|
|
|
|
start_sample = max(start_sample + len_samples, 0)
|
|
|
|
onset = start_sample * bytes_per_sample
|
|
|
|
|
|
|
|
if stop_sample is not None:
|
|
|
|
if stop_sample < 0:
|
|
|
|
stop_sample = max(stop_sample + len_samples, 0)
|
|
|
|
            offset = stop_sample * bytes_per_sample
|
|
|
|
else:
|
|
|
|
offset = None
|
|
|
|
|
|
|
|
data = self._data[onset:offset]
|
|
|
|
return AudioRegion(data, self.sr, self.sw, self.ch)
|
|
|
|
|
|
|
|
|
|
|
|
class StreamTokenizer:
|
|
|
|
"""
|
|
|
|
Class for stream tokenizers. It implements a 4-state automaton scheme
|
|
|
|
to extract sub-sequences of interest on the fly.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
validator : callable, DataValidator (must implement `is_valid`)
|
|
|
|
called with each data frame read from source. Should take one positional
|
|
|
|
argument and return True or False for valid and invalid frames
|
|
|
|
respectively.
|
|
|
|
|
|
|
|
min_length : int
|
|
|
|
Minimum number of frames of a valid token. This includes all
|
|
|
|
tolerated non valid frames within the token.
|
|
|
|
|
|
|
|
max_length : int
|
|
|
|
Maximum number of frames of a valid token. This includes all
|
|
|
|
tolerated non valid frames within the token.
|
|
|
|
|
|
|
|
max_continuous_silence : int
|
|
|
|
Maximum number of consecutive non-valid frames within a token.
|
|
|
|
Note that, within a valid token, there may be many tolerated
|
|
|
|
        *silent* regions, each containing a number of non-valid frames up
        to `max_continuous_silence`.
|
|
|
|
|
|
|
|
init_min : int
|
|
|
|
Minimum number of consecutive valid frames that must be
|
|
|
|
**initially** gathered before any sequence of non valid frames can
|
|
|
|
be tolerated. This option is not always needed, it can be used to
|
|
|
|
drop non-valid tokens as early as possible. **Default = 0** means
|
|
|
|
that the option is by default ineffective.
|
|
|
|
|
|
|
|
init_max_silence : int
|
|
|
|
Maximum number of tolerated consecutive non-valid frames if the
|
|
|
|
        number of already gathered valid frames has not yet reached
        `init_min`. This argument is normally used along with `init_min`.
|
|
|
|
**Default = 0**, by default this argument is not taken into
|
|
|
|
consideration.
|
|
|
|
|
|
|
|
mode : int
|
|
|
|
mode can be one of the following:
|
|
|
|
|
|
|
|
        1. `StreamTokenizer.NORMAL`: do not drop trailing silence, and
|
|
|
|
accept a token shorter than `min_length` if it is the continuation
|
|
|
|
of the latest delivered token.
|
|
|
|
|
|
|
|
        2. `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered
|
|
|
|
because `max_length` is reached, and token `i+1` is immediately
|
|
|
|
adjacent to token `i` (i.e. token `i` ends at frame `k` and token
|
|
|
|
        `i+1` starts at frame `k+1`) then accept token `i+1` only if it has
|
|
|
|
a size of at least `min_length`. The default behavior is to accept
|
|
|
|
        token `i+1` even if it is shorter than `min_length` (provided that
|
|
|
|
the above conditions are fulfilled of course).
|
|
|
|
|
|
|
|
        3. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing
|
|
|
|
non-valid frames from a token to be delivered if and only if it
|
|
|
|
is not **truncated**. This can be a bit tricky. A token is actually
|
|
|
|
delivered if:
|
|
|
|
|
|
|
|
- `max_continuous_silence` is reached.
|
|
|
|
|
|
|
|
- Its length reaches `max_length`. This is referred to as a
|
|
|
|
**truncated** token.
|
|
|
|
|
|
|
|
In the current implementation, a `StreamTokenizer`'s decision is only
|
|
|
|
based on already seen data and on incoming data. Thus, if a token is
|
|
|
|
truncated at a non-valid but tolerated frame (`max_length` is reached
|
|
|
|
        but `max_continuous_silence` not yet) any trailing silence will be kept
|
|
|
|
        because it can potentially be part of a valid token (if `max_length` was
|
|
|
|
bigger). But if `max_continuous_silence` is reached before
|
|
|
|
`max_length`, the delivered token will not be considered as truncated
|
|
|
|
but a result of *normal* end of detection (i.e. no more valid data).
|
|
|
|
In that case the trailing silence can be removed if you use the
|
|
|
|
`StreamTokenizer.DROP_TRAILING_SILENCE` mode.
|
|
|
|
|
|
|
|
        4. `(StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE)`:
        use both options. That means: first remove trailing silence, then
|
|
|
|
check if the token still has a length of at least `min_length`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
|
|
|
|
|
|
|
In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
|
|
|
|
accepted although it is shorter than `min_length` (3), because it
|
|
|
|
immediately follows the latest delivered token:
|
|
|
|
|
|
|
|
>>> from auditok.core import StreamTokenizer
|
|
|
|
    >>> from auditok.util import StringDataSource, DataValidator
|
|
|
|
|
|
|
|
    >>> class UpperCaseChecker(DataValidator):
    ...     def is_valid(self, frame):
    ...         return frame.isupper()
|
|
|
|
>>> dsource = StringDataSource("aaaAAAABBbbb")
|
|
|
|
    >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
    ...                             min_length=3,
    ...                             max_length=4,
    ...                             max_continuous_silence=0)
|
|
|
|
>>> tokenizer.tokenize(dsource)
|
|
|
|
[(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
|
|
|
|
|
|
|
|
|
|
|
|
The following tokenizer will however reject the 'BB' token:
|
|
|
|
|
|
|
|
>>> dsource = StringDataSource("aaaAAAABBbbb")
|
|
|
|
    >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
    ...                             min_length=3, max_length=4,
    ...                             max_continuous_silence=0,
    ...                             mode=StreamTokenizer.STRICT_MIN_LENGTH)
|
|
|
|
>>> tokenizer.tokenize(dsource)
|
|
|
|
[(['A', 'A', 'A', 'A'], 3, 6)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    >>> tokenizer = StreamTokenizer(
    ...     validator=UpperCaseChecker(),
    ...     min_length=3,
    ...     max_length=6,
    ...     max_continuous_silence=3,
    ...     mode=StreamTokenizer.DROP_TRAILING_SILENCE
    ... )
|
|
|
|
>>> dsource = StringDataSource("aaaAAAaaaBBbbbb")
|
|
|
|
>>> tokenizer.tokenize(dsource)
|
|
|
|
[(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
|
|
|
|
|
|
|
|
    The first token is delivered with its trailing silence because it is
    truncated while the second one has its trailing frames removed.
|
|
|
|
|
|
|
|
Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:
|
|
|
|
|
|
|
|
.. code:: python
|
|
|
|
|
|
|
|
[
|
|
|
|
(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8),
|
|
|
|
(['B', 'B', 'b', 'b', 'b'], 9, 13)
|
|
|
|
]
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
SILENCE = 0
|
|
|
|
POSSIBLE_SILENCE = 1
|
|
|
|
POSSIBLE_NOISE = 2
|
|
|
|
NOISE = 3
|
|
|
|
NORMAL = 0
|
|
|
|
STRICT_MIN_LENGTH = 2
|
|
|
|
DROP_TRAILING_SILENCE = 4
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
self,
|
|
|
|
validator,
|
|
|
|
min_length,
|
|
|
|
max_length,
|
|
|
|
max_continuous_silence,
|
|
|
|
init_min=0,
|
|
|
|
init_max_silence=0,
|
|
|
|
mode=0,
|
|
|
|
):
|
|
|
|
if callable(validator):
|
|
|
|
self._is_valid = validator
|
|
|
|
elif isinstance(validator, DataValidator):
|
|
|
|
self._is_valid = validator.is_valid
|
|
|
|
else:
|
|
|
|
raise TypeError(
|
|
|
|
"'validator' must be a callable or an instance of "
|
|
|
|
"DataValidator"
|
|
|
|
)
|
|
|
|
|
|
|
|
if max_length <= 0:
|
|
|
|
raise ValueError(
|
|
|
|
"'max_length' must be > 0 (value={0})".format(max_length)
|
|
|
|
)
|
|
|
|
|
|
|
|
if min_length <= 0 or min_length > max_length:
|
|
|
|
err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})"
|
|
|
|
raise ValueError(err_msg.format(min_length))
|
|
|
|
|
|
|
|
if max_continuous_silence >= max_length:
|
|
|
|
err_msg = "'max_continuous_silence' must be < 'max_length' "
|
|
|
|
err_msg += "(value={0})"
|
|
|
|
raise ValueError(err_msg.format(max_continuous_silence))
|
|
|
|
|
|
|
|
if init_min >= max_length:
|
|
|
|
raise ValueError(
|
|
|
|
"'init_min' must be < 'max_length' (value={0})".format(
|
|
|
|
                    init_min
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
self.validator = validator
|
|
|
|
self.min_length = min_length
|
|
|
|
self.max_length = max_length
|
|
|
|
self.max_continuous_silence = max_continuous_silence
|
|
|
|
self.init_min = init_min
|
|
|
|
self.init_max_silent = init_max_silence
|
|
|
|
self._set_mode(mode)
|
|
|
|
self._deliver = None
|
|
|
|
self._tokens = None
|
|
|
|
self._state = None
|
|
|
|
self._data = None
|
|
|
|
self._contiguous_token = False
|
|
|
|
self._init_count = 0
|
|
|
|
self._silence_length = 0
|
|
|
|
self._start_frame = 0
|
|
|
|
self._current_frame = 0
|
|
|
|
|
|
|
|
def _set_mode(self, mode):
|
|
|
|
strict_min_and_drop_trailing = StreamTokenizer.STRICT_MIN_LENGTH
|
|
|
|
strict_min_and_drop_trailing |= StreamTokenizer.DROP_TRAILING_SILENCE
|
|
|
|
if mode not in [
|
|
|
|
StreamTokenizer.NORMAL,
|
|
|
|
StreamTokenizer.STRICT_MIN_LENGTH,
|
|
|
|
StreamTokenizer.DROP_TRAILING_SILENCE,
|
|
|
|
strict_min_and_drop_trailing,
|
|
|
|
]:
|
|
|
|
raise ValueError("Wrong value for mode")
|
|
|
|
self._mode = mode
|
|
|
|
self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
|
|
|
|
self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
|
|
|
|
|
|
|
|
def _reinitialize(self):
|
|
|
|
self._contiguous_token = False
|
|
|
|
self._data = []
|
|
|
|
self._tokens = []
|
|
|
|
self._state = self.SILENCE
|
|
|
|
self._current_frame = -1
|
|
|
|
self._deliver = self._append_token
|
|
|
|
|
|
|
|
def tokenize(self, data_source, callback=None, generator=False):
|
|
|
|
"""
|
|
|
|
Read data from `data_source`, one frame a time, and process the read
|
|
|
|
frames in order to detect sequences of frames that make up valid
|
|
|
|
tokens.
|
|
|
|
|
|
|
|
        Parameters
        ----------
        data_source : DataSource
            instance of the :class:`DataSource` class that implements a
            `read` method. `read` should return a slice of signal, i.e. a
            frame (of whatever type as long as it can be processed by
            validator) and None if there is no more signal.
        callback : callable, optional
            an optional 3-argument function. If a `callback` function is
            given, it will be called each time a valid token is found.
        generator : bool, default: False
            if True, return a generator of tokens instead of a list.

        Returns
        -------
        tokens : list or generator
            a list of tokens if `callback` is None and `generator` is False,
            a generator of tokens if `generator` is True, and None if
            `callback` is given. Each token is a tuple with the following
            elements:

            .. code:: python

                (data, start, end)

            where `data` is a list of read frames, `start` is the index of
            the first frame in the original data and `end` is the index of
            the last frame.
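
        Examples
        --------
        A generator sketch (reusing the `UpperCaseChecker` validator and
        `dsource` from the class docstring)::

            >>> for data, start, end in tokenizer.tokenize(dsource, generator=True):
            ...     print(start, end)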
|
|
|
|
"""
|
|
|
|
token_gen = self._iter_tokens(data_source)
|
|
|
|
if callback:
|
|
|
|
for token in token_gen:
|
|
|
|
callback(*token)
|
|
|
|
return
|
|
|
|
if generator:
|
|
|
|
return token_gen
|
|
|
|
return list(token_gen)
|
|
|
|
|
|
|
|
def _iter_tokens(self, data_source):
|
|
|
|
self._reinitialize()
|
|
|
|
while True:
|
|
|
|
frame = data_source.read()
|
|
|
|
self._current_frame += 1
|
|
|
|
if frame is None:
|
|
|
|
token = self._post_process()
|
|
|
|
if token is not None:
|
|
|
|
yield token
|
|
|
|
break
|
|
|
|
token = self._process(frame)
|
|
|
|
if token is not None:
|
|
|
|
yield token
|
|
|
|
|
|
|
|
def _process(self, frame): # noqa: C901
|
|
|
|
|
|
|
|
frame_is_valid = self._is_valid(frame)
|
|
|
|
|
|
|
|
if self._state == self.SILENCE:
|
|
|
|
|
|
|
|
if frame_is_valid:
|
|
|
|
# seems we got a valid frame after a silence
|
|
|
|
self._init_count = 1
|
|
|
|
self._silence_length = 0
|
|
|
|
self._start_frame = self._current_frame
|
|
|
|
self._data.append(frame)
|
|
|
|
|
|
|
|
if self._init_count >= self.init_min:
|
|
|
|
self._state = self.NOISE
|
|
|
|
if len(self._data) >= self.max_length:
|
|
|
|
return self._process_end_of_detection(True)
|
|
|
|
else:
|
|
|
|
self._state = self.POSSIBLE_NOISE
|
|
|
|
|
|
|
|
elif self._state == self.POSSIBLE_NOISE:
|
|
|
|
|
|
|
|
if frame_is_valid:
|
|
|
|
self._silence_length = 0
|
|
|
|
self._init_count += 1
|
|
|
|
self._data.append(frame)
|
|
|
|
if self._init_count >= self.init_min:
|
|
|
|
self._state = self.NOISE
|
|
|
|
if len(self._data) >= self.max_length:
|
|
|
|
return self._process_end_of_detection(True)
|
|
|
|
|
|
|
|
else:
|
|
|
|
self._silence_length += 1
|
|
|
|
if (
|
|
|
|
self._silence_length > self.init_max_silent
|
|
|
|
or len(self._data) + 1 >= self.max_length
|
|
|
|
):
|
|
|
|
# either init_max_silent or max_length is reached
|
|
|
|
# before _init_count, back to silence
|
|
|
|
self._data = []
|
|
|
|
self._state = self.SILENCE
|
|
|
|
else:
|
|
|
|
self._data.append(frame)
|
|
|
|
|
|
|
|
elif self._state == self.NOISE:
|
|
|
|
|
|
|
|
if frame_is_valid:
|
|
|
|
self._data.append(frame)
|
|
|
|
if len(self._data) >= self.max_length:
|
|
|
|
return self._process_end_of_detection(True)
|
|
|
|
|
|
|
|
elif self.max_continuous_silence <= 0:
|
|
|
|
                # no silence tolerated: end detection at this frame;
                # _deliver will accept a short token if _contiguous_token
                # and not _strict_min_length
|
|
|
|
self._state = self.SILENCE
|
|
|
|
return self._process_end_of_detection()
|
|
|
|
else:
|
|
|
|
# this is the first silent frame following a valid one
|
|
|
|
# and it is tolerated
|
|
|
|
self._silence_length = 1
|
|
|
|
self._data.append(frame)
|
|
|
|
self._state = self.POSSIBLE_SILENCE
|
|
|
|
if len(self._data) == self.max_length:
|
|
|
|
return self._process_end_of_detection(True)
|
|
|
|
# don't reset _silence_length because we still
|
|
|
|
# need to know the total number of silent frames
|
|
|
|
|
|
|
|
elif self._state == self.POSSIBLE_SILENCE:
|
|
|
|
|
|
|
|
if frame_is_valid:
|
|
|
|
self._data.append(frame)
|
|
|
|
self._silence_length = 0
|
|
|
|
self._state = self.NOISE
|
|
|
|
if len(self._data) >= self.max_length:
|
|
|
|
return self._process_end_of_detection(True)
|
|
|
|
|
|
|
|
else:
|
|
|
|
if self._silence_length >= self.max_continuous_silence:
|
|
|
|
self._state = self.SILENCE
|
|
|
|
if self._silence_length < len(self._data):
|
|
|
|
                        # deliver only if the gathered frames aren't all silent
|
|
|
|
return self._process_end_of_detection()
|
|
|
|
self._data = []
|
|
|
|
self._silence_length = 0
|
|
|
|
else:
|
|
|
|
self._data.append(frame)
|
|
|
|
self._silence_length += 1
|
|
|
|
if len(self._data) >= self.max_length:
|
|
|
|
return self._process_end_of_detection(True)
|
|
|
|
# don't reset _silence_length because we still
|
|
|
|
# need to know the total number of silent frames
|
|
|
|
|
|
|
|
def _post_process(self):
|
|
|
|
if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
|
|
|
|
if len(self._data) > 0 and len(self._data) > self._silence_length:
|
|
|
|
return self._process_end_of_detection()
|
|
|
|
|
|
|
|
def _process_end_of_detection(self, truncated=False):
|
|
|
|
|
|
|
|
if (
|
|
|
|
not truncated
|
|
|
|
and self._drop_trailing_silence
|
|
|
|
and self._silence_length > 0
|
|
|
|
):
|
|
|
|
# happens if max_continuous_silence is reached
|
|
|
|
# or max_length is reached at a silent frame
|
|
|
|
self._data = self._data[0 : -self._silence_length]
|
|
|
|
|
|
|
|
if (len(self._data) >= self.min_length) or (
|
|
|
|
len(self._data) > 0
|
|
|
|
and not self._strict_min_length
|
|
|
|
and self._contiguous_token
|
|
|
|
):
|
|
|
|
|
|
|
|
start_frame = self._start_frame
|
|
|
|
end_frame = self._start_frame + len(self._data) - 1
|
|
|
|
data = self._data
|
|
|
|
self._data = []
|
|
|
|
token = (data, start_frame, end_frame)
|
|
|
|
|
|
|
|
if truncated:
|
|
|
|
# next token (if any) will start at _current_frame + 1
|
|
|
|
self._start_frame = self._current_frame + 1
|
|
|
|
# remember that it is contiguous with the just delivered one
|
|
|
|
self._contiguous_token = True
|
|
|
|
else:
|
|
|
|
self._contiguous_token = False
|
|
|
|
return token
|
|
|
|
else:
|
|
|
|
self._contiguous_token = False
|
|
|
|
|
|
|
|
self._data = []
|
|
|
|
|
|
|
|
def _append_token(self, data, start, end):
|
|
|
|
self._tokens.append((data, start, end))