"""
This module gathers processing ( i . e . tokenization ) classes .
Class summary
== == == == == == =
. . autosummary : :
StreamTokenizer
"""
from auditok.util import DataValidator

__all__ = ["StreamTokenizer"]

class StreamTokenizer():
"""
Class for stream tokenizers . It implements a 4 - state automaton scheme
to extract sub - sequences of interest on the fly .
: Parameters :
` validator ` :
instance of ` DataValidator ` that implements ` is_valid ` method .
` min_length ` : * ( int ) *
Minimum number of frames of a valid token . This includes all \
tolerated non valid frames within the token .
` max_length ` : * ( int ) *
Maximum number of frames of a valid token . This includes all \
tolerated non valid frames within the token .
` max_continuous_silence ` : * ( int ) *
Maximum number of consecutive non - valid frames within a token .
Note that , within a valid token , there may be many tolerated \
* silent * regions that contain each a number of non valid frames up to \
` max_continuous_silence `
        `init_min` : *(int, default=0)*
            Minimum number of consecutive valid frames that must be **initially** \
            gathered before any sequence of non-valid frames can be tolerated. This
            option is not always needed; it can be used to drop non-valid tokens as
            early as possible. **Default = 0** means that the option is by default
            ineffective.

        `init_max_silence` : *(int, default=0)*
            Maximum number of tolerated consecutive non-valid frames if the \
            number of already gathered valid frames has not yet reached `init_min`.
            This argument is normally used if `init_min` is used. **Default = 0**;
            by default this argument is not taken into consideration.
        `mode` : *(int, default=0)*
            `mode` can be:

            1. `StreamTokenizer.STRICT_MIN_LENGTH`:
            if token *i* is delivered because `max_length`
            is reached, and token *i+1* is immediately adjacent to
            token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts
            at frame *k+1*) then accept token *i+1* only if it has a size of at
            least `min_length`. The default behavior is to accept token *i+1*
            even if it is shorter than `min_length` (given that the above conditions
            are fulfilled of course).
    :Examples:

    In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
    accepted although it is shorter than `min_length` (3), because it immediately
    follows the latest delivered token:

    .. code:: python

        from auditok import StreamTokenizer, StringDataSource, DataValidator

        class UpperCaseChecker(DataValidator):
            def is_valid(self, frame):
                return frame.isupper()

        dsource = StringDataSource("aaaAAAABBbbb")
        tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                    min_length=3,
                                    max_length=4,
                                    max_continuous_silence=0)

        tokenizer.tokenize(dsource)

    :output:

    .. code:: python

        [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
    The following tokenizer will however reject the 'BB' token:

    .. code:: python

        dsource = StringDataSource("aaaAAAABBbbb")
        tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                    min_length=3, max_length=4,
                                    max_continuous_silence=0,
                                    mode=StreamTokenizer.STRICT_MIN_LENGTH)
        tokenizer.tokenize(dsource)

    :output:

    .. code:: python

        [(['A', 'A', 'A', 'A'], 3, 6)]
            2. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing non-valid frames
            from a token to be delivered if and only if it is not **truncated**.
            This can be a bit tricky. A token is actually delivered if:

            - a. `max_continuous_silence` is reached

            :or:

            - b. Its length reaches `max_length`. This is called a **truncated** token

    In the current implementation, a `StreamTokenizer`'s decision is only based on already seen
    data and on incoming data. Thus, if a token is truncated at a non-valid but tolerated
    frame (`max_length` is reached but `max_continuous_silence` not yet), any trailing
    silence will be kept because it could potentially be part of a valid token (if `max_length`
    were bigger). But if `max_continuous_silence` is reached before `max_length`, the delivered
    token will not be considered as truncated but a result of *normal* end of detection
    (i.e. no more valid data). In that case the trailing silence can be removed if you use
    the `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
    :Example:

    .. code:: python

        tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3,
                                    max_length=6, max_continuous_silence=3,
                                    mode=StreamTokenizer.DROP_TRAILING_SILENCE)

        dsource = StringDataSource("aaaAAAaaaBBbbbb")
        tokenizer.tokenize(dsource)

    :output:

    .. code:: python

        [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
    The first token is delivered with its trailing silence because it is truncated,
    while the second one has its trailing frames removed.

    Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:

    .. code:: python

        [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)]
            3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`:
            use both options. That means: first remove trailing silence, then check if the
            token still has a length of at least `min_length`, as sketched below.
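
    For illustration, here is a sketch combining both modes with the same data as in
    the previous example (assuming `UpperCaseChecker` and the imports defined above).
    After its trailing silence is dropped, the 'BB' token is shorter than `min_length`
    and, with `STRICT_MIN_LENGTH` set, is rejected despite being contiguous with the
    previous truncated token:

    .. code:: python

        tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3,
                                    max_length=6, max_continuous_silence=3,
                                    mode=StreamTokenizer.STRICT_MIN_LENGTH |
                                         StreamTokenizer.DROP_TRAILING_SILENCE)

        dsource = StringDataSource("aaaAAAaaaBBbbbb")
        tokenizer.tokenize(dsource)
        # expected output: [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8)]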
"""

    # states of the 4-state automaton
    SILENCE = 0           # outside of any token
    POSSIBLE_SILENCE = 1  # within a token, in a tolerated run of non-valid frames
    POSSIBLE_NOISE = 2    # valid frames gathered, but fewer than `init_min`
    NOISE = 3             # within a token, last frame was valid

    # mode flags
    STRICT_MIN_LENGTH = 2
    DROP_TRAILING_SILENCE = 4
    # alias
    DROP_TAILING_SILENCE = 4

    def __init__(self, validator,
                 min_length, max_length, max_continuous_silence,
                 init_min=0, init_max_silence=0,
                 mode=0):

        if not isinstance(validator, DataValidator):
            raise TypeError("'validator' must be an instance of 'DataValidator'")

        if max_length <= 0:
            raise ValueError("'max_length' must be > 0 (value={0})".format(max_length))

        if min_length <= 0 or min_length > max_length:
            raise ValueError("'min_length' must be > 0 and <= 'max_length' (value={0})".format(min_length))

        if max_continuous_silence >= max_length:
            raise ValueError("'max_continuous_silence' must be < 'max_length' (value={0})".format(max_continuous_silence))

        if init_min >= max_length:
            raise ValueError("'init_min' must be < 'max_length' (value={0})".format(init_min))

        self.validator = validator
        self.min_length = min_length
        self.max_length = max_length
        self.max_continuous_silence = max_continuous_silence
        self.init_min = init_min
        self.init_max_silent = init_max_silence

        self._mode = None
        # also sets self._strict_min_length and self._drop_tailing_silence
        self.set_mode(mode)

        self._deliver = None
        self._tokens = None
        self._state = None
        self._data = None
        self._contiguous_token = False

        self._init_count = 0
        self._silence_length = 0
        self._start_frame = 0
        self._current_frame = 0

    def set_mode(self, mode):
        """
        :Parameters:

            `mode` : *(int)*
                New mode, must be one of:

                - `StreamTokenizer.STRICT_MIN_LENGTH`

                - `StreamTokenizer.DROP_TRAILING_SILENCE`

                - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`

                - `0`

        See `StreamTokenizer.__init__` for more information about the mode.
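
        :Example:

        For instance, to activate both options on an existing tokenizer:

        .. code:: python

            tokenizer.set_mode(StreamTokenizer.STRICT_MIN_LENGTH |
                               StreamTokenizer.DROP_TRAILING_SILENCE)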
"""
if not mode in [ self . STRICT_MIN_LENGTH , self . DROP_TRAILING_SILENCE ,
self . STRICT_MIN_LENGTH | self . DROP_TRAILING_SILENCE , 0 ] :
raise ValueError ( " Wrong value for mode " )
self . _mode = mode
self . _strict_min_length = ( mode & self . STRICT_MIN_LENGTH ) != 0
self . _drop_tailing_silence = ( mode & self . DROP_TRAILING_SILENCE ) != 0

    def get_mode(self):
        """
        Return the current mode. To check whether a specific mode is activated use
        the bitwise `and` operator `&`. Example:

        .. code:: python

            if mode & self.STRICT_MIN_LENGTH != 0:
                do_something()
        """
        return self._mode

    def _reinitialize(self):
        self._contiguous_token = False
        self._data = []
        self._tokens = []
        self._state = self.SILENCE
        self._current_frame = -1
        self._deliver = self._append_token

    def tokenize(self, data_source, callback=None):
        """
        Read data from `data_source`, one frame at a time, and process the read frames in
        order to detect sequences of frames that make up valid tokens.

        :Parameters:

           `data_source` : instance of the :class:`DataSource` class that implements a `read` method.
               `read` should return a slice of signal, i.e. a frame (of whatever \
               type as long as it can be processed by the validator) and None if \
               there is no more signal.

           `callback` : an optional 3-argument function.
               If a `callback` function is given, it will be called each time a valid token
               is found.

        :Returns:
           A list of tokens if `callback` is None. Each token is a tuple with the following elements:

           .. code:: python

               (data, start, end)

           where `data` is a list of read frames, `start` is the index of the first frame in the
           original data and `end` is the index of the last frame.
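
        :Example:

        A minimal sketch of using a `callback` instead of collecting the returned
        list (`print_token` is an illustrative name; its three arguments are those
        of the token tuples described above):

        .. code:: python

            def print_token(data, start, end):
                print("token data: {0}, starts at {1}, ends at {2}".format(data, start, end))

            tokenizer.tokenize(dsource, callback=print_token)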
"""

        self._reinitialize()

        if callback is not None:
            self._deliver = callback

        while True:
            frame = data_source.read()
            if frame is None:
                break
            self._current_frame += 1
            self._process(frame)

        self._post_process()

        if callback is None:
            _ret = self._tokens
            self._tokens = None
            return _ret

    def _process(self, frame):

        frame_is_valid = self.validator.is_valid(frame)

        if self._state == self.SILENCE:

            if frame_is_valid:
                # seems we got a valid frame after a silence
                self._init_count = 1
                self._silence_length = 0
                self._start_frame = self._current_frame
                self._data.append(frame)

                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        self._process_end_of_detection(True)
                else:
                    self._state = self.POSSIBLE_NOISE

        elif self._state == self.POSSIBLE_NOISE:

            if frame_is_valid:
                self._silence_length = 0
                self._init_count += 1
                self._data.append(frame)
                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        self._process_end_of_detection(True)

            else:
                self._silence_length += 1
                if self._silence_length > self.init_max_silent or \
                        len(self._data) + 1 >= self.max_length:
                    # either init_max_silent or max_length is reached
                    # before _init_count, back to silence
                    self._data = []
                    self._state = self.SILENCE
                else:
                    self._data.append(frame)
        elif self._state == self.NOISE:

            if frame_is_valid:
                self._data.append(frame)
                if len(self._data) >= self.max_length:
                    self._process_end_of_detection(True)

            elif self.max_continuous_silence <= 0:
                # no silence is tolerated: end the token at this frame; it is
                # delivered if long enough, or if it is contiguous with the
                # previous token and _strict_min_length is not set
                self._process_end_of_detection()
                self._state = self.SILENCE

            else:
                # this is the first silent frame following a valid one
                # and it is tolerated
                self._silence_length = 1
                self._data.append(frame)
                self._state = self.POSSIBLE_SILENCE
                if len(self._data) == self.max_length:
                    self._process_end_of_detection(True)
                    # don't reset _silence_length because we still
                    # need to know the total number of silent frames
        elif self._state == self.POSSIBLE_SILENCE:

            if frame_is_valid:
                self._data.append(frame)
                self._silence_length = 0
                self._state = self.NOISE
                if len(self._data) >= self.max_length:
                    self._process_end_of_detection(True)

            else:
                if self._silence_length >= self.max_continuous_silence:
                    if self._silence_length < len(self._data):
                        # deliver only if the gathered frames aren't all silent
                        self._process_end_of_detection()
                    else:
                        self._data = []
                    self._state = self.SILENCE
                    self._silence_length = 0
                else:
                    self._data.append(frame)
                    self._silence_length += 1
                    if len(self._data) >= self.max_length:
                        self._process_end_of_detection(True)
                        # don't reset _silence_length because we still
                        # need to know the total number of silent frames

    def _post_process(self):
        if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
            if len(self._data) > 0 and len(self._data) > self._silence_length:
                self._process_end_of_detection()

    def _process_end_of_detection(self, truncated=False):

        if not truncated and self._drop_tailing_silence and self._silence_length > 0:
            # happens if max_continuous_silence is reached
            # or max_length is reached at a silent frame
            self._data = self._data[0:-self._silence_length]

        if (len(self._data) >= self.min_length) or \
                (len(self._data) > 0 and
                 not self._strict_min_length and self._contiguous_token):

            _end_frame = self._start_frame + len(self._data) - 1
            self._deliver(self._data, self._start_frame, _end_frame)

            if truncated:
                # next token (if any) will start at _current_frame + 1
                self._start_frame = self._current_frame + 1
                # remember that it is contiguous with the just delivered one
                self._contiguous_token = True
            else:
                self._contiguous_token = False
        else:
            self._contiguous_token = False

        self._data = []

    def _append_token(self, data, start, end):
        self._tokens.append((data, start, end))