#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Classes and functions related to matches
"""
import copy
import itertools
from collections import OrderedDict
from collections import defaultdict
from collections . abc import MutableSequence
from . debug import defined_at
from . loose import ensure_list , filter_index
from . utils import is_iterable
class MatchesDict(OrderedDict):
    """
    Ordered dict returned by ``to_dict`` that carries two side tables.

    ``matches`` and ``values_list`` are both ``defaultdict(list)`` instances, so any key
    can be appended to without being initialised first.
    """

    def __init__(self):
        super().__init__()
        # Two independent default-list mappings, created together.
        self.matches, self.values_list = defaultdict(list), defaultdict(list)
class _BaseMatches(MutableSequence):
    """
    A custom list[Match] that automatically maintains name, tag, start, end and index lookup structures.

    The matches themselves live in ``self._delegate``. Each lookup structure is a ``defaultdict(list)``
    built lazily from the delegate the first time it is read, then kept in sync by ``_add_match`` and
    ``_remove_match``. A lookup that has not been built yet (still ``None``) is simply skipped on
    add/remove, because its first read will rebuild it from the delegate anyway.
    """

    # Kept for backward compatibility with code referring to these aliases.
    _base = list
    _base_add = _base.append
    _base_remove = _base.remove
    _base_extend = _base.extend

    def __init__(self, matches=None, input_string=None):  # pylint: disable=super-init-not-called
        """
        :param matches: optional iterable of matches used to populate the container
        :param input_string: the string the matches refer to, if any
        """
        self.input_string = input_string
        self._max_end = 0
        self._delegate = []
        self.__name_dict = None
        self.__tag_dict = None
        self.__start_dict = None
        self.__end_dict = None
        self.__index_dict = None
        if matches:
            self.extend(matches)

    @property
    def _name_dict(self):
        """Lazily built mapping of name -> matches (matches with a falsy name are not indexed)."""
        if self.__name_dict is None:
            lookup = defaultdict(list)
            for match in self._delegate:
                if match.name:
                    lookup[match.name].append(match)
            self.__name_dict = lookup
        return self.__name_dict

    @property
    def _start_dict(self):
        """Lazily built mapping of start position -> matches."""
        if self.__start_dict is None:
            lookup = defaultdict(list)
            for match in self._delegate:
                lookup[match.start].append(match)
            self.__start_dict = lookup
        return self.__start_dict

    @property
    def _end_dict(self):
        """Lazily built mapping of end position -> matches."""
        if self.__end_dict is None:
            lookup = defaultdict(list)
            for match in self._delegate:
                lookup[match.end].append(match)
            self.__end_dict = lookup
        return self.__end_dict

    @property
    def _tag_dict(self):
        """Lazily built mapping of tag -> matches."""
        if self.__tag_dict is None:
            lookup = defaultdict(list)
            for match in self._delegate:
                for tag in match.tags:
                    lookup[tag].append(match)
            self.__tag_dict = lookup
        return self.__tag_dict

    @property
    def _index_dict(self):
        """Lazily built mapping of every position covered by a match span -> matches."""
        if self.__index_dict is None:
            lookup = defaultdict(list)
            for match in self._delegate:
                for position in range(*match.span):
                    lookup[position].append(match)
            self.__index_dict = lookup
        return self.__index_dict

    def _add_match(self, match):
        """
        Register a match in every lookup structure that is already built and update ``_max_end``.

        :param match: the match that was just added to the underlying list
        :type match: Match
        """
        if self.__name_dict is not None and match.name:
            self.__name_dict[match.name].append(match)
        if self.__tag_dict is not None:
            for tag in match.tags:
                self.__tag_dict[tag].append(match)
        if self.__start_dict is not None:
            self.__start_dict[match.start].append(match)
        if self.__end_dict is not None:
            self.__end_dict[match.end].append(match)
        if self.__index_dict is not None:
            for position in range(*match.span):
                self.__index_dict[position].append(match)
        if match.end > self._max_end:
            self._max_end = match.end

    @staticmethod
    def _discard(lookup, key, match):
        """
        Remove ``match`` from ``lookup[key]`` and drop the bucket once it is empty.

        Dropping empty buckets keeps ``names``/``tags`` from reporting keys that no longer have matches.

        :raise ValueError: if the match is not registered under ``key``
        """
        bucket = lookup.get(key, [])
        bucket.remove(match)
        if not bucket:
            del lookup[key]

    def _remove_match(self, match):
        """
        Unregister a match from every lookup structure that is already built and update ``_max_end``.

        The match is expected to have been removed from the underlying list before this is called,
        as ``__delitem__`` and ``__setitem__`` do.

        :param match: the match that was just removed from the underlying list
        :type match: Match
        """
        if self.__name_dict is not None and match.name:
            self._discard(self.__name_dict, match.name, match)
        if self.__tag_dict is not None:
            for tag in match.tags:
                self._discard(self.__tag_dict, tag, match)
        if self.__start_dict is not None:
            self._discard(self.__start_dict, match.start, match)
        if self.__end_dict is not None:
            self._discard(self.__end_dict, match.end, match)
        if self.__index_dict is not None:
            for position in range(*match.span):
                self._discard(self.__index_dict, position, match)
        if match.end >= self._max_end:
            # Recompute from the remaining matches. Taking the max of the end lookup keys would also
            # count emptied buckets, so the maximum could never decrease.
            self._max_end = max((item.end for item in self._delegate), default=0)

    def previous(self, match, predicate=None, index=None):
        """
        Retrieves the nearest previous matches.

        Positions are scanned downwards from ``match.start`` to 0; the matches ending at the first
        position that has any are returned through ``filter_index``.

        :param match: reference match
        :type match: Match
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :type index: int
        :return: result of ``filter_index`` (applied to an empty list when nothing is found)
        """
        for position in range(match.start, -1, -1):
            previous_matches = self.ending(position)
            if previous_matches:
                return filter_index(previous_matches, predicate, index)
        return filter_index([], predicate, index)

    def next(self, match, predicate=None, index=None):
        """
        Retrieves the nearest next matches.

        Positions are scanned upwards from ``match.start + 1`` to the highest known end; the matches
        starting at the first position that has any are returned through ``filter_index``.

        :param match: reference match
        :type match: Match
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :type index: int
        :return: result of ``filter_index`` (applied to an empty list when nothing is found)
        """
        for position in range(match.start + 1, self._max_end + 1):
            next_matches = self.starting(position)
            if next_matches:
                return filter_index(next_matches, predicate, index)
        return filter_index([], predicate, index)

    def named(self, name, predicate=None, index=None):
        """
        Retrieves the matches that have the given name.

        :param name: name to look up
        :type name: str
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :type index: int
        :return: result of ``filter_index`` applied to a copy of the matching list
        """
        # .get() avoids creating an empty bucket for an unknown name.
        return filter_index(list(self._name_dict.get(name, ())), predicate, index)

    def tagged(self, tag, predicate=None, index=None):
        """
        Retrieves the matches that have the given tag defined.

        :param tag: tag to look up
        :type tag: str
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :type index: int
        :return: result of ``filter_index`` applied to a copy of the matching list
        """
        return filter_index(list(self._tag_dict.get(tag, ())), predicate, index)

    def starting(self, start, predicate=None, index=None):
        """
        Retrieves the matches that start at the given position.

        :param start: the starting index
        :type start: int
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :type index: int
        :return: result of ``filter_index`` applied to a copy of the matching list
        """
        return filter_index(list(self._start_dict.get(start, ())), predicate, index)

    def ending(self, end, predicate=None, index=None):
        """
        Retrieves the matches that end at the given position.

        :param end: the ending index
        :type end: int
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :type index: int
        :return: result of ``filter_index`` applied to a copy of the matching list
        """
        return filter_index(list(self._end_dict.get(end, ())), predicate, index)

    def range(self, start=0, end=None, predicate=None, index=None):
        """
        Retrieves the matches that overlap the given range, sorted from start to end.

        :param start: the starting index
        :type start: int
        :param end: the ending index; defaults to, and is capped by, ``max_end``
        :type end: int
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :type index: int
        :return: result of ``filter_index`` applied to the overlapping matches
        """
        end = self.max_end if end is None else min(self.max_end, end)
        overlapping = [match for match in sorted(self) if match.start < end and match.end > start]
        return filter_index(overlapping, predicate, index)

    def _chain(self, positions, seps, predicate, index):
        """
        Collect matches found while walking ``positions``, stopping at the first position that holds
        neither an accepted match nor a character from ``seps``.
        """
        chain = []
        for position in positions:
            accepted = [found for found in self.at_index(position) if not predicate or predicate(found)]
            if accepted:
                for found in accepted:
                    if found not in chain:
                        chain.append(found)
            elif self.input_string[position] not in seps:
                break
        return filter_index(chain, predicate, index)

    def chain_before(self, position, seps, start=0, predicate=None, index=None):
        """
        Retrieves a list of chained matches, before position, matching predicate and separated by
        characters from seps only.

        :param position: an index, or an object with a ``start`` attribute
        :param seps: separator characters allowed between chained matches
        :param start: lowest index to walk back to
        :type start: int
        :param predicate: callable selecting the matches that take part in the chain
        :param index: forwarded to ``filter_index``
        :return: result of ``filter_index`` applied to the chain
        """
        if hasattr(position, 'start'):
            position = position.start
        position = min(self.max_end, position)
        return self._chain(reversed(range(start, position)), seps, predicate, index)

    def chain_after(self, position, seps, end=None, predicate=None, index=None):
        """
        Retrieves a list of chained matches, after position, matching predicate and separated by
        characters from seps only.

        :param position: an index, or an object with an ``end`` attribute
        :param seps: separator characters allowed between chained matches
        :param end: highest index (exclusive) to walk to; defaults to, and is capped by, ``max_end``
        :type end: int
        :param predicate: callable selecting the matches that take part in the chain
        :param index: forwarded to ``filter_index``
        :return: result of ``filter_index`` applied to the chain
        """
        if hasattr(position, 'end'):
            position = position.end
        end = self.max_end if end is None else min(self.max_end, end)
        return self._chain(range(position, end), seps, predicate, index)

    @property
    def max_end(self):
        """
        Retrieves the maximum index.

        :return: the larger of the input string length and the highest match end; only the highest
                 match end when there is no (or an empty) input string
        :rtype: int
        """
        return max(len(self.input_string), self._max_end) if self.input_string else self._max_end

    def _hole_start(self, position, ignore=None):
        """
        Retrieves the start of hole index from position.

        :param position: index to search backwards from (exclusive)
        :type position: int
        :param ignore: optional callable; matches for which it returns a truthy value are skipped
        :return: the closest lower index where a non-ignored match starts, or 0
        :rtype: int
        """
        for lindex in reversed(range(0, position)):
            if any(not ignore or not ignore(found) for found in self.starting(lindex)):
                return lindex
        return 0

    def _hole_end(self, position, ignore=None):
        """
        Retrieves the end of hole index from position.

        :param position: index to search forwards from (inclusive)
        :type position: int
        :param ignore: optional callable; matches for which it returns a truthy value are skipped
        :return: the closest index where a non-ignored match starts, or ``max_end``
        :rtype: int
        """
        for rindex in range(position, self.max_end):
            if any(not ignore or not ignore(found) for found in self.starting(rindex)):
                return rindex
        return self.max_end

    def holes(self, start=0, end=None, formatter=None, ignore=None, seps=None, predicate=None,
              index=None):  # pylint: disable=too-many-branches,too-many-locals
        """
        Retrieves new Match objects covering the parts of the given range where no match is defined.

        :param start: first index of the range
        :type start: int
        :param end: end index of the range; defaults to, and is capped by, ``max_end``
        :type end: int
        :param formatter: formatter given to every created hole match
        :param ignore: optional callable; matches for which it returns a truthy value are treated as absent
        :param seps: separator characters; an open hole is closed when one of them is met
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :return: result of ``filter_index`` applied to the created hole matches
        """
        assert self.input_string if seps else True, "input_string must be defined when using seps parameter"
        end = self.max_end if end is None else min(self.max_end, end)
        ret = []
        hole = False
        rindex = start  # keeps rindex defined when the loop below does not run

        loop_start = self._hole_start(start, ignore)

        for rindex in range(loop_start, end):
            current = [found for found in self.at_index(rindex) if not ignore or not ignore(found)]

            if seps and hole and self.input_string and self.input_string[rindex] in seps:
                # A separator closes the current hole match
                hole = False
                ret[-1].end = rindex
            elif not current and not hole:
                # Open a new hole match
                hole = True
                ret.append(Match(max(rindex, start), None, input_string=self.input_string, formatter=formatter))
            elif current and hole:
                # Close current hole match
                hole = False
                ret[-1].end = rindex

        if ret and hole:
            # The last hole is still open: extend it to the next starting match, within the range
            ret[-1].end = min(self._hole_end(rindex, ignore), end)
        return filter_index(ret, predicate, index)

    def conflicting(self, match, predicate=None, index=None):
        """
        Retrieves a list of ``Match`` objects that conflict with the given match.

        Every match covering at least one position of ``match.span`` is collected; the given match
        itself is left out when it is part of them.

        :param match: reference match
        :type match: Match
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :return: result of ``filter_index`` applied to the conflicting matches
        """
        ret = []
        for position in range(*match.span):
            for found in self.at_index(position):
                if found not in ret:
                    ret.append(found)
        # The reference match may be absent (not stored in this container, or empty span).
        if match in ret:
            ret.remove(match)
        return filter_index(ret, predicate, index)

    def at_match(self, match, predicate=None, index=None):
        """
        Retrieves a list of matches from given match (delegates to ``at_span`` with ``match.span``).
        """
        return self.at_span(match.span, predicate, index)

    def at_span(self, span, predicate=None, index=None):
        """
        Retrieves a list of matches from given (start, end) tuple.

        The matches covering the first position of the span come first, followed by those covering
        its last position that are not already listed.
        """
        merged = list(self._index_dict.get(span[0], ()))
        for marker in self._index_dict.get(span[1] - 1, ()):
            if marker not in merged:
                merged.append(marker)
        return filter_index(merged, predicate, index)

    def at_index(self, pos, predicate=None, index=None):
        """
        Retrieves a list of matches from given position.
        """
        # A copy is handed out so callers never receive the internal lookup bucket.
        return filter_index(list(self._index_dict.get(pos, ())), predicate, index)

    @property
    def names(self):
        """
        Retrieve all names.

        :return: keys view of the name lookup
        """
        return self._name_dict.keys()

    @property
    def tags(self):
        """
        Retrieve all tags.

        :return: keys view of the tag lookup
        """
        return self._tag_dict.keys()

    def to_dict(self, details=False, first_value=False, enforce_list=False):
        """
        Converts matches to a dict object.

        :param details: if True, values will be complete Match objects, else only the ``Match.value`` property
        :type details: bool
        :param first_value: if True, only the first value will be kept. Else, multiple values will be set as a
                            list in the dict.
        :type first_value: bool
        :param enforce_list: if True, value is wrapped in a list even when a single value is found. Else, list
                             values are available under the ``values_list`` property of the returned dict object.
        :type enforce_list: bool
        :return: mapping of match name to value(s), with ``matches`` and ``values_list`` side tables
        :rtype: MatchesDict
        """
        ret = MatchesDict()
        for match in sorted(self):
            name = match.name
            value = match if details else match.value
            ret.matches[name].append(match)
            if not enforce_list and value not in ret.values_list[name]:
                ret.values_list[name].append(value)
            if name in ret.keys():
                if not first_value:
                    if not isinstance(ret[name], list):
                        if ret[name] == value:
                            continue
                        # Second distinct value: promote the single value to a list
                        ret[name] = [ret[name]]
                    else:
                        if value in ret[name]:
                            continue
                    ret[name].append(value)
            else:
                if enforce_list and not isinstance(value, list):
                    ret[name] = [value]
                else:
                    ret[name] = value
        return ret

    def __len__(self):
        return len(self._delegate)

    def __getitem__(self, index):
        """
        Return the match at ``index``, or a new container of the same class for a slice.

        The sliced container keeps the ``input_string`` of this one.
        """
        ret = self._delegate[index]
        if isinstance(ret, list):
            sliced = type(self)(input_string=self.input_string)
            sliced.extend(ret)
            return sliced
        return ret

    def __setitem__(self, index, match):
        """
        Replace the match (or the slice of matches) at ``index`` and keep the lookups in sync.
        """
        if isinstance(index, slice):
            replaced = self._delegate[index]
            added = list(match)  # materialised once, so one-shot iterables are supported
            self._delegate[index] = added
        else:
            replaced = [self._delegate[index]]
            added = [match]
            self._delegate[index] = match
        # Un-index what was overwritten before indexing the replacement(s).
        for old_match in replaced:
            self._remove_match(old_match)
        for new_match in added:
            self._add_match(new_match)

    def __delitem__(self, index):
        match = self._delegate[index]
        del self._delegate[index]
        if isinstance(match, list):
            # if index is a slice, we have a list of matches
            for match_item in match:
                self._remove_match(match_item)
        else:
            self._remove_match(match)

    def __repr__(self):
        return self._delegate.__repr__()

    def insert(self, index, value):
        """Insert ``value`` before ``index`` and register it in the lookups."""
        self._delegate.insert(index, value)
        self._add_match(value)
class Matches(_BaseMatches):
    """
    List of regular (non-marker) matches.

    Marker matches are kept apart, in the ``markers`` attribute.
    """

    def __init__(self, matches=None, input_string=None):
        # Created before the base initialiser runs, which may already add matches.
        self.markers = Markers(input_string=input_string)
        super().__init__(matches, input_string)

    def _add_match(self, match):
        """Register ``match`` in the lookups; it must not be a marker."""
        assert not match.marker, "A marker match should not be added to <Matches> object"
        super()._add_match(match)
class Markers(_BaseMatches):
    """
    A custom list[Match] containing markers list.
    """

    def __init__(self, matches=None, input_string=None):
        """
        :param matches: optional iterable of marker matches used to populate the container
        :param input_string: the string the markers refer to, if any
        """
        # Forward the given matches; they used to be dropped because None was always passed on.
        super().__init__(matches=matches, input_string=input_string)

    def _add_match(self, match):
        """Register ``match`` in the lookups; it must be a marker."""
        assert match.marker, "A non-marker match should not be added to <Markers> object"
        super()._add_match(match)
class Match(object):
    """
    Object storing values related to a single match
    """

    def __init__(self, start, end, value=None, name=None, tags=None, marker=None, parent=None, private=None,
                 pattern=None, input_string=None, formatter=None, conflict_solver=None, **kwargs):
        # pylint: disable=unused-argument
        """
        :param start: start index of the match
        :param end: end index of the match
        :param value: hardcoded value; when None the value is computed from the raw text
        :param name: name of the match
        :param tags: tags of the match, passed through ``ensure_list``
        :param marker: marker flag
        :param parent: parent match, if any
        :param private: private flag
        :param pattern: pattern that produced the match; when given, ``defined_at`` is read from it
        :param input_string: the string the indices refer to
        :param formatter: callable applied to the raw text to compute the value
        :param conflict_solver: stored as-is on the match
        :param kwargs: accepted and ignored
        """
        self.start = start
        self.end = end
        self.name = name
        self._value = value
        self.tags = ensure_list(tags)
        self.marker = marker
        self.parent = parent
        self.input_string = input_string
        self.formatter = formatter
        self.pattern = pattern
        self.private = private
        self.conflict_solver = conflict_solver
        self._children = None
        self._raw_start = None
        self._raw_end = None
        self.defined_at = pattern.defined_at if pattern else defined_at()

    @property
    def span(self):
        """
        2-tuple with start and end indices of the match
        """
        return self.start, self.end

    @property
    def children(self):
        """
        Children matches, created empty on first access.
        """
        if self._children is None:
            self._children = Matches(None, self.input_string)
        return self._children

    @children.setter
    def children(self, value):
        self._children = value

    @property
    def value(self):
        """
        Get the value of the match, using formatter if defined.

        :return: the hardcoded value when one is set, else the formatted raw text, else the raw text
        """
        # Compare against None so that falsy hardcoded values (0, False, "") are honoured.
        if self._value is not None:
            return self._value
        if self.formatter:
            return self.formatter(self.raw)
        return self.raw

    @value.setter
    def value(self, value):
        """
        Set the value (hardcode)

        :param value: value returned by the ``value`` property from now on; None restores the computed value
        """
        self._value = value  # pylint: disable=attribute-defined-outside-init

    @property
    def names(self):
        """
        Get all names of children

        :return: the names gathered from the children, or only this match's name when it has no children
        :rtype: set
        """
        if not self.children:
            return {self.name}
        ret = set()
        for child in self.children:
            ret.update(child.names)
        return ret

    @property
    def raw_start(self):
        """
        start index of raw value

        :return: the explicitly set raw start, or ``start`` when none was set
        """
        if self._raw_start is None:
            return self.start
        return self._raw_start

    @raw_start.setter
    def raw_start(self, value):
        """
        Set start index of raw value
        """
        self._raw_start = value

    @property
    def raw_end(self):
        """
        end index of raw value

        :return: the explicitly set raw end, or ``end`` when none was set
        """
        if self._raw_end is None:
            return self.end
        return self._raw_end

    @raw_end.setter
    def raw_end(self, value):
        """
        Set end index of raw value
        """
        self._raw_end = value

    @property
    def raw(self):
        """
        Get the raw value of the match, without using hardcoded value nor formatter.

        :return: the slice of the input string between ``raw_start`` and ``raw_end``, or None when there is
                 no (or an empty) input string
        """
        if self.input_string:
            return self.input_string[self.raw_start:self.raw_end]
        return None

    @property
    def initiator(self):
        """
        Retrieve the initiator parent of a match

        :return: the top-most ancestor reached by following ``parent``, or the match itself
        :rtype: Match
        """
        match = self
        while match.parent:
            match = match.parent
        return match

    def crop(self, crops, predicate=None, index=None):
        """
        crop the match with given Match objects or spans tuples

        The match itself is left untouched: the work is done on deep copies.

        :param crops: a Match, a (start, end) tuple, or an iterable of those
        :type crops: list or object
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :return: result of ``filter_index`` applied to the remaining pieces
        :rtype: list[Match]
        """
        # A single crop (an object, or a 2-item span starting with an int) is wrapped in a list
        if not is_iterable(crops) or (len(crops) == 2 and isinstance(crops[0], int)):
            crops = [crops]
        initial = copy.deepcopy(self)
        ret = [initial]
        for crop in crops:
            if hasattr(crop, 'span'):
                start, end = crop.span
            else:
                start, end = crop
            # Iterate over a snapshot: ret is modified inside the loop
            for current in list(ret):
                if start <= current.start and end >= current.end:
                    # current is included in crop, remove current ...
                    ret.remove(current)
                elif start >= current.start and end <= current.end:
                    # crop is included in current, split current ...
                    right = copy.deepcopy(current)
                    current.end = start
                    if not current:
                        ret.remove(current)
                    right.start = end
                    if right:
                        ret.append(right)
                elif current.end >= end > current.start:
                    # crop overlaps the beginning of current
                    current.start = end
                elif current.start <= start < current.end:
                    # crop overlaps the end of current
                    current.end = start
        return filter_index(ret, predicate, index)

    def split(self, seps, predicate=None, index=None):
        """
        Split this match in multiple matches using given separators.

        :param seps: string containing separator characters
        :type seps: str
        :param predicate: forwarded to ``filter_index``
        :param index: forwarded to ``filter_index``
        :return: result of ``filter_index`` applied to the new Match objects
        :rtype: list
        """
        split_match = copy.deepcopy(self)
        current_match = split_match
        ret = []

        for i, char in enumerate(self.raw):
            if char in seps:
                if not split_match:
                    # First separator after a piece: prepare the next piece from the current one
                    split_match = copy.deepcopy(current_match)
                    current_match.end = self.start + i
            else:
                if split_match:
                    # First non-separator character of a piece: it starts here
                    split_match.start = self.start + i
                    current_match = split_match
                    ret.append(split_match)
                    split_match = None

        return filter_index(ret, predicate, index)

    def tagged(self, *tags):
        """
        Check if this match has at least one of the provided tags

        :param tags: tags to look for
        :return: True if at least one tag is defined, False otherwise.
        """
        return any(tag in self.tags for tag in tags)

    def named(self, *names):
        """
        Check if one of the children match has one of the provided name

        :param names: names to look for
        :return: True if at least one of the given names is in ``names``, False otherwise.
        """
        return any(name in self.names for name in names)

    def __len__(self):
        return self.end - self.start

    def __hash__(self):
        # Built only from fields that take part in __eq__, so equal matches hash alike.
        return hash((Match, self.start, self.end, self.value))

    def __eq__(self, other):
        if isinstance(other, Match):
            return self.span == other.span and self.value == other.value and self.name == other.name and \
                self.parent == other.parent
        return NotImplemented

    def __ne__(self, other):
        if isinstance(other, Match):
            return self.span != other.span or self.value != other.value or self.name != other.name or \
                self.parent != other.parent
        return NotImplemented

    # Ordering only looks at the span.
    def __lt__(self, other):
        if isinstance(other, Match):
            return self.span < other.span
        return NotImplemented

    def __gt__(self, other):
        if isinstance(other, Match):
            return self.span > other.span
        return NotImplemented

    def __le__(self, other):
        if isinstance(other, Match):
            return self.span <= other.span
        return NotImplemented

    def __ge__(self, other):
        if isinstance(other, Match):
            return self.span >= other.span
        return NotImplemented

    def __repr__(self):
        flags = ""
        name = ""
        tags = ""
        defined = ""
        initiator = ""
        if self.initiator.value != self.value:
            initiator = f"+initiator={self.initiator.value}"
        if self.private:
            flags += '+private'
        if self.name:
            name = f"+name={self.name}"
        if self.tags:
            tags = f"+tags={self.tags}"
        if self.defined_at:
            defined += f"@{self.defined_at}"
        return f"<{self.value}:{self.span}{flags}{name}{tags}{initiator}{defined}>"