bazarr/libs/auditok/core.py

"""
This module gathers processing (i.e. tokenization) classes.

Class summary
=============

.. autosummary::

        StreamTokenizer
"""

from auditok.util import DataValidator

__all__ = ["StreamTokenizer"]


class StreamTokenizer:
    """
    Class for stream tokenizers. It implements a 4-state automaton scheme
    to extract sub-sequences of interest on the fly.

    :Parameters:

        `validator` :
            instance of `DataValidator` that implements `is_valid` method.

        `min_length` : *(int)*
            Minimum number of frames of a valid token. This includes all
            tolerated non valid frames within the token.

        `max_length` : *(int)*
            Maximum number of frames of a valid token. This includes all
            tolerated non valid frames within the token.

        `max_continuous_silence` : *(int)*
            Maximum number of consecutive non-valid frames within a token.
            Note that, within a valid token, there may be many tolerated
            *silent* regions that contain each a number of non valid frames
            up to `max_continuous_silence`

        `init_min` : *(int, default=0)*
            Minimum number of consecutive valid frames that must be
            **initially** gathered before any sequence of non valid frames
            can be tolerated. This option is not always needed, it can be
            used to drop non-valid tokens as early as possible.
            **Default = 0** means that the option is by default ineffective.

        `init_max_silence` : *(int, default=0)*
            Maximum number of tolerated consecutive non-valid frames if the
            number already gathered valid frames has not yet reached
            'init_min'. This argument is normally used if `init_min` is used.
            **Default = 0**, by default this argument is not taken into
            consideration.

        `mode` : *(int, default=0)*
            `mode` can be:

        1. `StreamTokenizer.STRICT_MIN_LENGTH`:
        if token *i* is delivered because `max_length`
        is reached, and token *i+1* is immediately adjacent to
        token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts
        at frame *k+1*) then accept token *i+1* only if it has a size of at
        least `min_length`. The default behavior is to accept token *i+1*
        even if it is shorter than `min_length` (given that the above
        conditions are fulfilled of course).

        :Examples:

        In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
        accepted although it is shorter than `min_length` (3), because it
        immediately follows the latest delivered token:

        .. code:: python

            from auditok import StreamTokenizer, StringDataSource, DataValidator

            class UpperCaseChecker(DataValidator):
                def is_valid(self, frame):
                    return frame.isupper()


            dsource = StringDataSource("aaaAAAABBbbb")
            tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                        min_length=3,
                                        max_length=4,
                                        max_continuous_silence=0)

            tokenizer.tokenize(dsource)


        :output:

         .. code:: python

            [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]


        The following tokenizer will however reject the 'BB' token:

        .. code:: python

            dsource = StringDataSource("aaaAAAABBbbb")
            tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                        min_length=3, max_length=4,
                                        max_continuous_silence=0,
                                        mode=StreamTokenizer.STRICT_MIN_LENGTH)
            tokenizer.tokenize(dsource)

        :output:

        .. code:: python

            [(['A', 'A', 'A', 'A'], 3, 6)]


        2. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all tailing non-valid
        frames from a token to be delivered if and only if it is not
        **truncated**. This can be a bit tricky. A token is actually
        delivered if:

        - a. `max_continuous_silence` is reached

        :or:

        - b. Its length reaches `max_length`. This is called a **truncated**
          token

        In the current implementation, a `StreamTokenizer`'s decision is only
        based on already seen data and on incoming data. Thus, if a token is
        truncated at a non-valid but tolerated frame (`max_length` is reached
        but `max_continuous_silence` not yet) any tailing silence will be kept
        because it can potentially be part of valid token (if `max_length`
        was bigger). But if `max_continuous_silence` is reached before
        `max_length`, the delivered token will not be considered as truncated
        but a result of *normal* end of detection (i.e. no more valid data).
        In that case the tailing silence can be removed if you use the
        `StreamTokenizer.DROP_TRAILING_SILENCE` mode.

        :Example:

        .. code:: python

             tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3,
                                         max_length=6, max_continuous_silence=3,
                                         mode=StreamTokenizer.DROP_TRAILING_SILENCE)

             dsource = StringDataSource("aaaAAAaaaBBbbbb")
             tokenizer.tokenize(dsource)

        :output:

        .. code:: python

            [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]

        The first token is delivered with its tailing silence because it is
        truncated while the second one has its tailing frames removed.

        Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:

        .. code:: python

            [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)]


        3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`:
        use both options. That means: first remove tailing silence, then check
        if the token still has at least a length of `min_length`.
    """

    # Automaton states.
    # SILENCE: no token is being gathered.
    # POSSIBLE_NOISE: valid frames were seen but fewer than `init_min` so far.
    # NOISE: a token is being gathered and the last frame was valid.
    # POSSIBLE_SILENCE: inside a token, in a run of tolerated non-valid frames.
    SILENCE = 0
    POSSIBLE_SILENCE = 1
    POSSIBLE_NOISE = 2
    NOISE = 3

    # Mode flags; they can be combined with the bitwise `|` operator.
    STRICT_MIN_LENGTH = 2
    DROP_TRAILING_SILENCE = 4
    # alias
    DROP_TAILING_SILENCE = 4

    def __init__(self, validator,
                 min_length, max_length, max_continuous_silence,
                 init_min=0, init_max_silence=0,
                 mode=0):
        """
        Validate the arguments and set up an idle tokenizer.

        See the class documentation for the meaning of each parameter.

        :Raises:
            `TypeError` if `validator` is not a `DataValidator` instance;
            `ValueError` if a length parameter or `mode` is out of range.
        """
        if not isinstance(validator, DataValidator):
            raise TypeError("'validator' must be an instance of 'DataValidator'")

        self._check_lengths(min_length, max_length,
                            max_continuous_silence, init_min)

        self.validator = validator
        self.min_length = min_length
        self.max_length = max_length
        self.max_continuous_silence = max_continuous_silence
        self.init_min = init_min
        self.init_max_silent = init_max_silence

        # set_mode validates `mode` and derives the two boolean flags
        # (_strict_min_length, _drop_tailing_silence) from it.
        self._mode = None
        self.set_mode(mode)

        # Per-run working state; (re)initialized by `tokenize`.
        self._deliver = None
        self._tokens = None
        self._state = None
        self._data = None
        self._contiguous_token = False

        self._init_count = 0
        self._silence_length = 0
        self._start_frame = 0
        self._current_frame = 0

    @staticmethod
    def _check_lengths(min_length, max_length, max_continuous_silence, init_min):
        """
        Raise `ValueError` if the length-related parameters are inconsistent.

        The checks are performed in this order: `max_length`, `min_length`,
        `max_continuous_silence`, `init_min`. The message of the raised error
        reports the value of the offending parameter.
        """
        if max_length <= 0:
            raise ValueError(
                "'max_length' must be > 0 (value={0})".format(max_length))

        if min_length <= 0 or min_length > max_length:
            raise ValueError(
                "'min_length' must be > 0 and <= 'max_length' (value={0})".format(min_length))

        if max_continuous_silence >= max_length:
            raise ValueError(
                "'max_continuous_silence' must be < 'max_length' (value={0})".format(max_continuous_silence))

        if init_min >= max_length:
            # report the offending `init_min` value itself
            raise ValueError(
                "'init_min' must be < 'max_length' (value={0})".format(init_min))

    def set_mode(self, mode):
        """
        :Parameters:

            `mode` : *(int)*
                New mode, must be one of:


            - `StreamTokenizer.STRICT_MIN_LENGTH`

            - `StreamTokenizer.DROP_TRAILING_SILENCE`

            - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`

            - `0`

        :Raises:
            `ValueError` if `mode` is none of the values above.

        See `StreamTokenizer.__init__` for more information about the mode.
        """
        allowed_modes = (
            self.STRICT_MIN_LENGTH,
            self.DROP_TRAILING_SILENCE,
            self.STRICT_MIN_LENGTH | self.DROP_TRAILING_SILENCE,
            0,
        )
        if mode not in allowed_modes:
            raise ValueError("Wrong value for mode")

        self._mode = mode
        self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
        self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0

    def get_mode(self):
        """
        Return the current mode. To check whether a specific mode is activated
        use the bitwise 'and' operator `&`. Example:

        .. code:: python

            if tokenizer.get_mode() & StreamTokenizer.STRICT_MIN_LENGTH != 0:
               do_something()
        """
        return self._mode

    def _reinitialize(self):
        """Reset the per-run state so that a new `tokenize` call starts clean."""
        self._contiguous_token = False
        self._data = []
        self._tokens = []
        self._state = self.SILENCE
        # incremented before each frame is processed, so the first frame is 0
        self._current_frame = -1
        self._deliver = self._append_token

    def tokenize(self, data_source, callback=None):
        """
        Read data from `data_source`, one frame at a time, and process the read
        frames in order to detect sequences of frames that make up valid
        tokens.

        :Parameters:
           `data_source` : instance of the :class:`DataSource` class that
               implements a `read` method. 'read' should return a slice of
               signal, i.e. frame (of whatever type as long as it can be
               processed by validator) and None if there is no more signal.

           `callback` : an optional 3-argument function.
               If a `callback` function is given, it will be called each time
               a valid token is found, with `(data, start, end)` as arguments.


        :Returns:
           A list of tokens if `callback` is None, otherwise None. Each token
           is a tuple with the following elements:

            .. code:: python

                (data, start, end)

           where `data` is a list of read frames, `start`: index of the first
           frame in the original data and `end` : index of the last frame.

        """
        self._reinitialize()

        if callback is not None:
            self._deliver = callback

        while True:
            frame = data_source.read()
            if frame is None:
                break
            self._current_frame += 1
            self._process(frame)

        # flush a token that was still being gathered when the data ended
        self._post_process()

        if callback is None:
            _ret = self._tokens
            # drop the internal reference to the returned list
            self._tokens = None
            return _ret

    def _process(self, frame):
        """Feed one frame to the automaton and update its state and buffer."""
        frame_is_valid = self.validator.is_valid(frame)

        if self._state == self.SILENCE:

            if frame_is_valid:
                # seems we got a valid frame after a silence
                self._init_count = 1
                self._silence_length = 0
                self._start_frame = self._current_frame
                self._data.append(frame)

                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        self._process_end_of_detection(True)
                else:
                    self._state = self.POSSIBLE_NOISE

        elif self._state == self.POSSIBLE_NOISE:

            if frame_is_valid:
                self._silence_length = 0
                self._init_count += 1
                self._data.append(frame)
                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        self._process_end_of_detection(True)

            else:
                self._silence_length += 1
                if self._silence_length > self.init_max_silent or \
                        len(self._data) + 1 >= self.max_length:
                    # either init_max_silent or max_length is reached
                    # before _init_count, back to silence
                    self._data = []
                    self._state = self.SILENCE
                else:
                    self._data.append(frame)

        elif self._state == self.NOISE:

            if frame_is_valid:
                self._data.append(frame)
                if len(self._data) >= self.max_length:
                    self._process_end_of_detection(True)

            elif self.max_continuous_silence <= 0:
                # no silence is tolerated: the token ends at the previous
                # frame. A short token is still delivered if it is contiguous
                # with the previous one and STRICT_MIN_LENGTH is not set.
                self._process_end_of_detection()
                self._state = self.SILENCE

            else:
                # this is the first silent frame following a valid one
                # and it is tolerated
                self._silence_length = 1
                self._data.append(frame)
                self._state = self.POSSIBLE_SILENCE
                if len(self._data) >= self.max_length:
                    self._process_end_of_detection(True)
                    # don't reset _silence_length because we still
                    # need to know the total number of silent frames

        elif self._state == self.POSSIBLE_SILENCE:

            if frame_is_valid:
                self._data.append(frame)
                self._silence_length = 0
                self._state = self.NOISE
                if len(self._data) >= self.max_length:
                    self._process_end_of_detection(True)

            else:
                if self._silence_length >= self.max_continuous_silence:
                    if self._silence_length < len(self._data):
                        # deliver only if gathered frames aren't all silent
                        self._process_end_of_detection()
                    else:
                        self._data = []
                    self._state = self.SILENCE
                    self._silence_length = 0
                else:
                    self._data.append(frame)
                    self._silence_length += 1
                    if len(self._data) >= self.max_length:
                        self._process_end_of_detection(True)
                        # don't reset _silence_length because we still
                        # need to know the total number of silent frames

    def _post_process(self):
        """Deliver the token in progress, if any, once the data is exhausted."""
        if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
            # skip buffers that are empty or hold nothing but silent frames
            if len(self._data) > 0 and len(self._data) > self._silence_length:
                self._process_end_of_detection()

    def _process_end_of_detection(self, truncated=False):
        """
        Close the token in progress and deliver it if it is long enough.

        `truncated` is True when the token ends because `max_length` was
        reached; such a token keeps its tailing silence and the next token is
        marked as contiguous with it. The frame buffer is emptied in all cases.
        """
        if not truncated and self._drop_tailing_silence and self._silence_length > 0:
            # normal end of detection (not a truncation): strip the tailing
            # non-valid frames before checking the length
            self._data = self._data[0: - self._silence_length]

        if (len(self._data) >= self.min_length) or \
           (len(self._data) > 0 and
                not self._strict_min_length and self._contiguous_token):

            _end_frame = self._start_frame + len(self._data) - 1
            self._deliver(self._data, self._start_frame, _end_frame)

            if truncated:
                # next token (if any) will start at _current_frame + 1
                self._start_frame = self._current_frame + 1
                # remember that it is contiguous with the just delivered one
                self._contiguous_token = True
            else:
                self._contiguous_token = False
        else:
            self._contiguous_token = False

        self._data = []

    def _append_token(self, data, start, end):
        """Default delivery function: store the token in the result list."""
        self._tokens.append((data, start, end))