""" A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method which takes a tree as sole argument and
returns an iterator which generates tokens.
"""
from __future__ import absolute_import , division , unicode_literals
from . . import constants
from . . _utils import default_etree
# Names exported by a star-import of this package.
__all__ = [ " getTreeWalker " , " pprint " ]
# Cache of TreeWalker classes keyed by lower-cased tree type name. It is
# filled lazily by getTreeWalker; the etree walker is deliberately never
# stored here because the etree submodule does its own caching.
treeWalkerCache = { }
def getTreeWalker(treeType, implementation=None, **kwargs):
    """Look up the TreeWalker class that handles a given kind of tree.

    :arg str treeType: name of the tree type (matched case-insensitively).
        The walkers with built-in support are:

        * dom: the xml.dom.minidom DOM implementation
        * etree: a generic walker for tree implementations exposing an
          elementtree-like interface (known to work with ElementTree,
          cElementTree and lxml.etree)
        * lxml: optimized walker for lxml.etree
        * genshi: a Genshi stream

    :arg implementation: a module implementing the tree type, e.g.
        xml.etree.ElementTree or cElementTree. Only consulted for the etree
        tree type; when omitted, ``default_etree`` is used.

    :arg kwargs: keyword arguments forwarded to the etree walker factory;
        ignored for every other walker.

    :returns: a TreeWalker class, or ``None`` when the tree type is neither
        cached nor one of the built-in names.
    """
    key = treeType.lower()

    # Anything already registered in the cache wins.
    if key in treeWalkerCache:
        return treeWalkerCache[key]

    if key == " etree ":
        from . import etree
        module = default_etree if implementation is None else implementation
        # XXX: NEVER cache here, caching is done in the etree submodule
        return etree.getETreeModule(module, **kwargs).TreeWalker

    # The remaining walkers are imported on first use and then memoised.
    if key == " dom ":
        from . import dom
        walkerClass = dom.TreeWalker
    elif key == " genshi ":
        from . import genshi
        walkerClass = genshi.TreeWalker
    elif key == " lxml ":
        from . import etree_lxml
        walkerClass = etree_lxml.TreeWalker
    else:
        # Unknown tree type: nothing to cache, nothing to return.
        return None

    treeWalkerCache[key] = walkerClass
    return walkerClass
def concatenateCharacterTokens(tokens):
    """Collapse each run of adjacent character tokens into a single token.

    Consecutive tokens whose type is the Characters or SpaceCharacters
    literal are buffered; when a token of any other type arrives (or the
    input ends) the buffered data is joined and emitted as one Characters
    token. All other tokens are passed through unchanged and in order.

    :arg tokens: an iterable of token dicts; every token is indexed by the
        type key, and character tokens additionally by the data key

    :returns: a generator of token dicts
    """
    pendingCharacters = []
    for token in tokens:
        # Named tokenType rather than ``type`` so the builtin is not shadowed.
        tokenType = token[" type "]
        if tokenType in (" Characters ", " SpaceCharacters "):
            pendingCharacters.append(token[" data "])
            continue
        # A non-character token ends the current run: flush it first so the
        # original token order is preserved.
        if pendingCharacters:
            yield {" type ": " Characters ", " data ": " ".join(pendingCharacters)}
            pendingCharacters = []
        yield token
    # Flush a run that extends to the end of the stream.
    if pendingCharacters:
        yield {" type ": " Characters ", " data ": " ".join(pendingCharacters)}
def _qualifiedName(namespace, localname):
    """Return *localname* qualified by *namespace* for pretty printing.

    The namespace is replaced by its entry in ``constants.prefixes`` when it
    has one; otherwise the namespace value itself is used as the qualifier.
    """
    if namespace in constants.prefixes:
        qualifier = constants.prefixes[namespace]
    else:
        qualifier = namespace
    return " %s %s " % (qualifier, localname)


def pprint(walker):
    """Pretty printer for tree walkers

    Takes a TreeWalker instance and pretty prints the output of walking the
    tree. Character tokens are merged with concatenateCharacterTokens first;
    then one entry is produced per start/empty tag, attribute, comment,
    doctype and character run, each prefixed by the current indentation.
    The indentation grows by two after a start tag and shrinks by two at the
    matching end tag (or immediately after the attributes of an empty tag).

    :arg walker: a TreeWalker instance

    :returns: the entries joined into a single string

    :raises ValueError: if a token has an unrecognised type
    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        # Named tokenType rather than ``type`` so the builtin is not shadowed.
        tokenType = token[" type "]

        if tokenType in (" StartTag ", " EmptyTag "):
            # tag name: only namespaces other than HTML are spelled out
            namespace = token[" namespace "]
            if namespace and namespace != constants.namespaces[" html "]:
                name = _qualifiedName(namespace, token[" name "])
            else:
                name = token[" name "]
            output.append(" %s < %s > " % (" " * indent, name))
            indent += 2

            # attributes (sorted for consistent ordering)
            for (namespace, localname), value in sorted(token[" data "].items()):
                if namespace:
                    name = _qualifiedName(namespace, localname)
                else:
                    name = localname
                output.append(" %s %s = \" %s \" " % (" " * indent, name, value))

            # self-closing: an empty tag has no end tag to undo the indent
            if tokenType == " EmptyTag ":
                indent -= 2

        elif tokenType == " EndTag ":
            indent -= 2

        elif tokenType == " Comment ":
            output.append(" %s <!-- %s --> " % (" " * indent, token[" data "]))

        elif tokenType == " Doctype ":
            padding = " " * indent
            if not token[" name "]:
                output.append(" %s <!DOCTYPE > " % (padding, ))
            elif token[" publicId "]:
                output.append(""" %s <!DOCTYPE %s " %s " " %s " > """ %
                              (padding,
                               token[" name "],
                               token[" publicId "],
                               token[" systemId "] if token[" systemId "] else " "))
            elif token[" systemId "]:
                output.append(""" %s <!DOCTYPE %s " " " %s " > """ %
                              (padding,
                               token[" name "],
                               token[" systemId "]))
            else:
                output.append(" %s <!DOCTYPE %s > " % (padding,
                                                      token[" name "]))

        elif tokenType == " Characters ":
            output.append(" %s \" %s \" " % (" " * indent, token[" data "]))

        elif tokenType == " SpaceCharacters ":
            # Invariant check. Raised explicitly (instead of ``assert False``)
            # so it is not stripped when Python runs with -O.
            raise AssertionError(" concatenateCharacterTokens should have got rid of all Space tokens ")

        else:
            raise ValueError(" Unknown token type, %s " % tokenType)

    return " \n ".join(output)