from __future__ import absolute_import , division , unicode_literals
from six import unichr as chr
from collections import deque , OrderedDict
from sys import version_info
from . constants import spaceCharacters
from . constants import entities
from . constants import asciiLetters , asciiUpper2Lower
from . constants import digits , hexDigits , EOF
from . constants import tokenTypes , tagTokenTypes
from . constants import replacementCharacters
from . _inputstream import HTMLInputStream
from . _trie import Trie
# Prefix tree over the named character references, built once at import time;
# consumeEntity queries it with has_keys_with_prefix() and longest_prefix().
entitiesTrie = Trie(entities)

# Mapping type used for the attributes of an emitted start tag: plain dict on
# Python 3.7 and newer, OrderedDict on older interpreters.
attributeMap = dict if version_info >= (3, 7) else OrderedDict
class HTMLTokenizer ( object ) :
""" This class takes care of tokenizing HTML.
* self.currentToken
  Holds the token that is currently being processed.
* self.state
  Holds a reference to the tokenizer state method to be invoked next.
* self.stream
  Points to the HTMLInputStream object.
"""
def __init__(self, stream, parser=None, **kwargs):
    """Create a tokenizer reading from *stream*.

    :arg stream: input source, handed unchanged to ``HTMLInputStream``
    :arg parser: optional object kept as ``self.parser``; not used by the
        constructor itself
    :arg kwargs: extra keyword arguments forwarded to ``HTMLInputStream``
    """
    self.stream = HTMLInputStream(stream, **kwargs)
    self.parser = parser

    # Initial tokenizer bookkeeping: flags cleared, processing begins in the
    # data state.
    self.escapeFlag = False
    self.escape = False
    self.lastFourChars = []
    self.state = self.dataState

    # No token is under construction yet.
    self.currentToken = None

    super(HTMLTokenizer, self).__init__()
def __iter__(self):
    """Run the state machine, yielding tokens as they become available.

    The current state method is called repeatedly.  After every call that
    returns a true value, entries of ``self.stream.errors`` are yielded as
    ParseError tokens, followed by everything the state placed on
    ``self.tokenQueue``.  Iteration ends as soon as a state method returns a
    false value.
    """
    self.tokenQueue = deque([])
    while True:
        # A false return value from the state method ends tokenization.
        if not self.state():
            return
        # Problems noticed by the input stream are reported first ...
        while self.stream.errors:
            yield {" type ": tokenTypes[" ParseError "],
                   " data ": self.stream.errors.pop(0)}
        # ... then the tokens queued by the state that just ran.
        while self.tokenQueue:
            yield self.tokenQueue.popleft()
def consumeNumberEntity(self, isHex):
    """Consume the digits of a numeric character reference and decode them.

    Characters are read from the stream while they belong to ``hexDigits``
    (when *isHex* is true) or ``digits`` (otherwise).  The caller is expected
    to have verified that at least one such digit is available.

    :arg isHex: true to parse the digits in base 16, false for base 10

    :returns: the text the reference stands for: the entry from
        ``replacementCharacters`` when the code point has one, the U+FFFD
        replacement literal for surrogates and values above 0x10FFFF, and
        ``chr`` of the code point in every other case

    A ParseError token is queued for code points that are not allowed in a
    numeric reference, and another one when the character following the
    digits is not the terminator literal, in which case that character is
    pushed back onto the stream.
    """
    if isHex:
        allowed, radix = hexDigits, 16
    else:
        allowed, radix = digits, 10

    # Collect every character that is a valid digit, stopping at EOF.
    charStack = []
    c = self.stream.char()
    while c is not EOF and c in allowed:
        charStack.append(c)
        c = self.stream.char()

    # Convert the characters consumed to an int.
    charAsInt = int(" ".join(charStack), radix)

    needsDecoding = False
    if charAsInt in replacementCharacters:
        # Certain code points are mapped to a fixed replacement.
        char = replacementCharacters[charAsInt]
        illegal = True
    elif 0xD800 <= charAsInt <= 0xDFFF or charAsInt > 0x10FFFF:
        # Surrogates and values beyond the Unicode range.
        char = " \uFFFD "
        illegal = True
    else:
        needsDecoding = True
        # Here 0 <= charAsInt <= 0x10FFFF, so "low 16 bits are FFFE or FFFF"
        # selects exactly U+FFFE, U+FFFF, U+1FFFE, ... U+10FFFF.  This
        # replaces a frozenset that used to be rebuilt on every call.
        illegal = (0x0001 <= charAsInt <= 0x0008 or
                   0x000E <= charAsInt <= 0x001F or
                   0x007F <= charAsInt <= 0x009F or
                   0xFDD0 <= charAsInt <= 0xFDEF or
                   charAsInt == 0x000B or
                   (charAsInt & 0xFFFE) == 0xFFFE)

    if illegal:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ":
                                " illegal-codepoint-for-numeric-entity ",
                                " datavars ": {" charAsInt ": charAsInt}})

    if needsDecoding:
        try:
            # Try/except needed as UCS-2 Python builds' unichr only works
            # within the BMP.
            char = chr(charAsInt)
        except ValueError:
            # Build the surrogate pair by hand.
            v = charAsInt - 0x10000
            char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

    # Discard the terminator if present.  Otherwise report the error and put
    # the character back on the stream.
    if c != " ; ":
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ":
                                " numeric-entity-without-semicolon "})
        self.stream.unget(c)

    return char
def consumeEntity(self, allowedChar=None, fromAttribute=False):
    """Consume a character reference and emit or store the resulting text.

    :arg allowedChar: optional extra character that, when it is the first
        character read, means no reference is present
    :arg fromAttribute: when true the text is appended to the value of the
        last attribute of ``self.currentToken``; otherwise it is queued as a
        Characters or SpaceCharacters token

    :returns: None
    """
    # Default output for when no entity is matched.
    output = " & "

    first = self.stream.char()
    charStack = [first]
    if (first in spaceCharacters or first in (EOF, " < ", " & ") or
            (allowedChar is not None and allowedChar == first)):
        # Not a reference: leave the character for whoever reads next.
        self.stream.unget(first)

    elif first == " # ":
        # Numeric reference: the next character tells hex from decimal.
        charStack.append(self.stream.char())
        isHex = charStack[-1] in (" x ", " X ")
        if isHex:
            charStack.append(self.stream.char())

        # charStack[-1] should now be the first digit.
        candidate = charStack[-1]
        if candidate in (hexDigits if isHex else digits):
            # At least one digit found, so consume the whole number.
            self.stream.unget(candidate)
            output = self.consumeNumberEntity(isHex)
        else:
            # No digits found.
            self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                    " data ": " expected-numeric-entity "})
            self.stream.unget(charStack.pop())
            output = " & " + " ".join(charStack)

    else:
        # Possibly a named entity.  Keep reading while the characters
        # collected so far are still a prefix of some known entity name.
        while (charStack[-1] is not EOF and
               entitiesTrie.has_keys_with_prefix(" ".join(charStack))):
            charStack.append(self.stream.char())

        # Find the longest entity name the collected text starts with, to
        # take care of "&noti" for instance.
        try:
            entityName = entitiesTrie.longest_prefix(" ".join(charStack[:-1]))
            entityLength = len(entityName)
        except KeyError:
            entityName = None

        if entityName is None:
            self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                    " data ": " expected-named-entity "})
            self.stream.unget(charStack.pop())
            output = " & " + " ".join(charStack)
        else:
            unterminated = entityName[-1] != " ; "
            if unterminated:
                self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                        " data ":
                                        " named-entity-without-semicolon "})

            # Inside an attribute an unterminated name followed by a letter,
            # a digit or the equals literal is kept as literal text.
            keepLiteral = False
            if unterminated and fromAttribute:
                follower = charStack[entityLength]
                keepLiteral = (follower in asciiLetters or
                               follower in digits or
                               follower == " = ")

            if keepLiteral:
                self.stream.unget(charStack.pop())
                output = " & " + " ".join(charStack)
            else:
                output = entities[entityName]
                self.stream.unget(charStack.pop())
                # Characters read past the matched name are kept as text.
                output += " ".join(charStack[entityLength:])

    if fromAttribute:
        self.currentToken[" data "][-1][1] += output
        return

    if output in spaceCharacters:
        tokenType = " SpaceCharacters "
    else:
        tokenType = " Characters "
    self.tokenQueue.append({" type ": tokenTypes[tokenType],
                            " data ": output})
def processEntityInAttribute ( self , allowedChar ) :
""" This method replaces the need for " entityInAttributeValueState " .
"""
self . consumeEntity ( allowedChar = allowedChar , fromAttribute = True )
def emitCurrentToken(self):
    """Queue ``self.currentToken`` for output and switch to the data state.

    For tag tokens the name is first passed through the ``asciiUpper2Lower``
    translation table.  Start tags get their attribute list converted to an
    ``attributeMap``; end tags that carry attributes or the self-closing
    flag produce ParseError tokens ahead of the tag itself.
    """
    token = self.currentToken
    kind = token[" type "]

    if kind in tagTokenTypes:
        token[" name "] = token[" name "].translate(asciiUpper2Lower)

        if kind == tokenTypes[" StartTag "]:
            raw = token[" data "]
            data = attributeMap(raw)
            if len(data) < len(raw):
                # Some attribute was duplicated; re-apply the pairs in
                # reverse order so that the first occurrence wins.
                data.update(raw[::-1])
            token[" data "] = data

        if kind == tokenTypes[" EndTag "]:
            if token[" data "]:
                self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                        " data ": " attributes-in-end-tag "})
            if token[" selfClosing "]:
                self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                        " data ":
                                        " self-closing-flag-on-end-tag "})

    # Add the token to the queue to be yielded.
    self.tokenQueue.append(token)
    self.state = self.dataState
# Below are the various tokenizer states worked out.
def dataState(self):
    """Data state: dispatch on the next character of ordinary content.

    :returns: False when the stream is at EOF (tokenization ends), True
        otherwise
    """
    char = self.stream.char()

    if char == " & ":
        self.state = self.entityDataState
        return True

    if char == " < ":
        self.state = self.tagOpenState
        return True

    if char == " \u0000 ":
        # Reported as an error, but the character is still emitted as-is.
        queue = self.tokenQueue
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \u0000 "})
        return True

    if char is EOF:
        # Tokenization ends.
        return False

    if char in spaceCharacters:
        # A run of space characters is emitted as its own token.
        kind = tokenTypes[" SpaceCharacters "]
        run = char + self.stream.charsUntil(spaceCharacters, True)
        self.tokenQueue.append({" type ": kind, " data ": run})
        return True

    # Anything else: take everything up to the next special character.
    rest = self.stream.charsUntil((" & ", " < ", " \u0000 "))
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": char + rest})
    return True
def entityDataState ( self ) :
self . consumeEntity ( )
self . state = self . dataState
return True
def rcdataState(self):
    """RCDATA state: like the data state, but hands off to RCDATA helpers.

    :returns: False when the stream is at EOF (tokenization ends), True
        otherwise
    """
    char = self.stream.char()

    if char == " & ":
        self.state = self.characterReferenceInRcdata
        return True

    if char == " < ":
        self.state = self.rcdataLessThanSignState
        return True

    if char == EOF:
        # Tokenization ends.
        return False

    queue = self.tokenQueue
    if char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
        return True

    if char in spaceCharacters:
        # A run of space characters is emitted as its own token.
        kind = tokenTypes[" SpaceCharacters "]
        run = char + self.stream.charsUntil(spaceCharacters, True)
        queue.append({" type ": kind, " data ": run})
        return True

    # Anything else: take everything up to the next special character.
    rest = self.stream.charsUntil((" & ", " < ", " \u0000 "))
    queue.append({" type ": tokenTypes[" Characters "],
                  " data ": char + rest})
    return True
def characterReferenceInRcdata ( self ) :
self . consumeEntity ( )
self . state = self . rcdataState
return True
def rawtextState(self):
    """RAWTEXT state: emit text until a less-than literal, NUL or EOF.

    :returns: False when the stream is at EOF (tokenization ends), True
        otherwise
    """
    char = self.stream.char()

    if char == " < ":
        self.state = self.rawtextLessThanSignState
        return True

    if char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue = self.tokenQueue
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
        return True

    if char == EOF:
        # Tokenization ends.
        return False

    # Ordinary text: take everything up to the next special character.
    rest = self.stream.charsUntil((" < ", " \u0000 "))
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": char + rest})
    return True
def scriptDataState(self):
    """Script data state: emit text until a less-than literal, NUL or EOF.

    :returns: False when the stream is at EOF (tokenization ends), True
        otherwise
    """
    char = self.stream.char()

    if char == " < ":
        self.state = self.scriptDataLessThanSignState
        return True

    if char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue = self.tokenQueue
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
        return True

    if char == EOF:
        # Tokenization ends.
        return False

    # Ordinary text: take everything up to the next special character.
    rest = self.stream.charsUntil((" < ", " \u0000 "))
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": char + rest})
    return True
def plaintextState(self):
    """PLAINTEXT state: everything up to EOF is character data.

    :returns: False when the stream is at EOF (tokenization ends), True
        otherwise
    """
    char = self.stream.char()

    if char == EOF:
        # Tokenization ends.
        return False

    queue = self.tokenQueue
    if char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
        return True

    kind = tokenTypes[" Characters "]
    text = char + self.stream.charsUntil(" \u0000 ")
    queue.append({" type ": kind, " data ": text})
    return True
def tagOpenState(self):
    """Tag open state: decide what follows a less-than literal in data.

    :returns: True
    """
    char = self.stream.char()

    if char == " ! ":
        self.state = self.markupDeclarationOpenState
        return True

    if char == " / ":
        self.state = self.closeTagOpenState
        return True

    if char in asciiLetters:
        # Start building a start tag whose name begins with this letter.
        self.currentToken = {" type ": tokenTypes[" StartTag "],
                             " name ": char, " data ": [],
                             " selfClosing ": False,
                             " selfClosingAcknowledged ": False}
        self.state = self.tagNameState
        return True

    queue = self.tokenQueue
    if char == " > ":
        # No tag name at all: report it and emit both characters as text.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " expected-tag-name-but-got-right-bracket "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " <> "})
        self.state = self.dataState
    elif char == " ? ":
        # Report it and let the bogus comment state re-read the character.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " expected-tag-name-but-got-question-mark "})
        self.stream.unget(char)
        self.state = self.bogusCommentState
    else:
        # Not a tag: emit the less-than literal as text and re-read the
        # character in the data state.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " expected-tag-name "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " < "})
        self.stream.unget(char)
        self.state = self.dataState
    return True
def closeTagOpenState(self):
    """Close tag open state: decide what follows the end-tag opener.

    :returns: True
    """
    char = self.stream.char()

    if char in asciiLetters:
        # Start building an end tag whose name begins with this letter.
        self.currentToken = {" type ": tokenTypes[" EndTag "],
                             " name ": char,
                             " data ": [], " selfClosing ": False}
        self.state = self.tagNameState
        return True

    queue = self.tokenQueue
    if char == " > ":
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ":
                      " expected-closing-tag-but-got-right-bracket "})
        self.state = self.dataState
    elif char is EOF:
        # Report it and emit the opener as plain text.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " expected-closing-tag-but-got-eof "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " </ "})
        self.state = self.dataState
    else:
        # Any other character: report it and let the bogus comment state
        # re-read the character.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " expected-closing-tag-but-got-char ",
                      " datavars ": {" data ": char}})
        self.stream.unget(char)
        self.state = self.bogusCommentState
    return True
def tagNameState(self):
    """Tag name state: extend the current tag's name or leave the name.

    :returns: True
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True

    if char == " > ":
        self.emitCurrentToken()
        return True

    if char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " eof-in-tag-name "})
        self.state = self.dataState
        return True

    if char == " / ":
        self.state = self.selfClosingStartTagState
        return True

    if char == " \u0000 ":
        # Error; the replacement literal goes into the name instead.
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " invalid-codepoint "})
        self.currentToken[" name "] += " \uFFFD "
        return True

    # One character at a time on purpose: tag names are very short, so
    # nothing fancier (such as charsUntil) is used here.
    self.currentToken[" name "] += char
    return True
def rcdataLessThanSignState(self):
    """RCDATA less-than sign state: check for the start of an end tag.

    :returns: True
    """
    char = self.stream.char()

    if char == " / ":
        # Possible end tag: reset the buffer that collects its name.
        self.temporaryBuffer = " "
        self.state = self.rcdataEndTagOpenState
        return True

    # Otherwise emit the less-than literal as text and re-read the character
    # in the RCDATA state.
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " < "})
    self.stream.unget(char)
    self.state = self.rcdataState
    return True
def rcdataEndTagOpenState(self):
    """RCDATA end tag open state: expect the first letter of a tag name.

    :returns: True
    """
    char = self.stream.char()

    if char in asciiLetters:
        self.temporaryBuffer += char
        self.state = self.rcdataEndTagNameState
        return True

    # Not a tag name: emit the opener as text and re-read the character in
    # the RCDATA state.
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " </ "})
    self.stream.unget(char)
    self.state = self.rcdataState
    return True
def rcdataEndTagNameState(self):
    """RCDATA end tag name state: collect the name and try to close RCDATA.

    The end tag only counts when the buffered name matches the name of
    ``self.currentToken`` (compared after lower-casing both).

    :returns: True
    """
    appropriate = self.currentToken and self.currentToken[" name "].lower() == self.temporaryBuffer.lower()
    char = self.stream.char()
    isSpace = char in spaceCharacters

    if appropriate and (isSpace or char == " / " or char == " > "):
        # The same end tag token is created whichever terminator was read;
        # only the follow-up differs.
        self.currentToken = {" type ": tokenTypes[" EndTag "],
                             " name ": self.temporaryBuffer,
                             " data ": [], " selfClosing ": False}
        if isSpace:
            self.state = self.beforeAttributeNameState
        elif char == " / ":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        # Not an end tag after all: everything read so far is plain text.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " </ " + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.rcdataState
    return True
def rawtextLessThanSignState(self):
    """RAWTEXT less-than sign state: check for the start of an end tag.

    :returns: True
    """
    char = self.stream.char()

    if char == " / ":
        # Possible end tag: reset the buffer that collects its name.
        self.temporaryBuffer = " "
        self.state = self.rawtextEndTagOpenState
        return True

    # Otherwise emit the less-than literal as text and re-read the character
    # in the RAWTEXT state.
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " < "})
    self.stream.unget(char)
    self.state = self.rawtextState
    return True
def rawtextEndTagOpenState(self):
    """RAWTEXT end tag open state: expect the first letter of a tag name.

    :returns: True
    """
    char = self.stream.char()

    if char in asciiLetters:
        self.temporaryBuffer += char
        self.state = self.rawtextEndTagNameState
        return True

    # Not a tag name: emit the opener as text and re-read the character in
    # the RAWTEXT state.
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " </ "})
    self.stream.unget(char)
    self.state = self.rawtextState
    return True
def rawtextEndTagNameState(self):
    """RAWTEXT end tag name state: collect the name and try to close RAWTEXT.

    The end tag only counts when the buffered name matches the name of
    ``self.currentToken`` (compared after lower-casing both).

    :returns: True
    """
    appropriate = self.currentToken and self.currentToken[" name "].lower() == self.temporaryBuffer.lower()
    char = self.stream.char()
    isSpace = char in spaceCharacters

    if appropriate and (isSpace or char == " / " or char == " > "):
        # The same end tag token is created whichever terminator was read;
        # only the follow-up differs.
        self.currentToken = {" type ": tokenTypes[" EndTag "],
                             " name ": self.temporaryBuffer,
                             " data ": [], " selfClosing ": False}
        if isSpace:
            self.state = self.beforeAttributeNameState
        elif char == " / ":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        # Not an end tag after all: everything read so far is plain text.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " </ " + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.rawtextState
    return True
def scriptDataLessThanSignState(self):
    """Script data less-than sign state: look for an end tag or an escape.

    :returns: True
    """
    char = self.stream.char()

    if char == " / ":
        # Possible end tag: reset the buffer that collects its name.
        self.temporaryBuffer = " "
        self.state = self.scriptDataEndTagOpenState
        return True

    if char == " ! ":
        # Possible escape opener: emit both characters and keep looking.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " <! "})
        self.state = self.scriptDataEscapeStartState
        return True

    # Otherwise emit the less-than literal as text and re-read the character
    # in the script data state.
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " < "})
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
def scriptDataEndTagOpenState(self):
    """Script data end tag open state: expect the first letter of a name.

    :returns: True
    """
    char = self.stream.char()

    if char in asciiLetters:
        self.temporaryBuffer += char
        self.state = self.scriptDataEndTagNameState
        return True

    # Not a tag name: emit the opener as text and re-read the character in
    # the script data state.
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " </ "})
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
def scriptDataEndTagNameState(self):
    """Script data end tag name state: collect the name, try to end script.

    The end tag only counts when the buffered name matches the name of
    ``self.currentToken`` (compared after lower-casing both).

    :returns: True
    """
    appropriate = self.currentToken and self.currentToken[" name "].lower() == self.temporaryBuffer.lower()
    char = self.stream.char()
    isSpace = char in spaceCharacters

    if appropriate and (isSpace or char == " / " or char == " > "):
        # The same end tag token is created whichever terminator was read;
        # only the follow-up differs.
        self.currentToken = {" type ": tokenTypes[" EndTag "],
                             " name ": self.temporaryBuffer,
                             " data ": [], " selfClosing ": False}
        if isSpace:
            self.state = self.beforeAttributeNameState
        elif char == " / ":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        # Not an end tag after all: everything read so far is plain text.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " </ " + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.scriptDataState
    return True
def scriptDataEscapeStartState(self):
    """Script data escape start state: expect the first dash of an escape.

    :returns: True
    """
    char = self.stream.char()

    if char != " - ":
        # No escape sequence: re-read the character as plain script data.
        self.stream.unget(char)
        self.state = self.scriptDataState
        return True

    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " - "})
    self.state = self.scriptDataEscapeStartDashState
    return True
def scriptDataEscapeStartDashState(self):
    """Script data escape start dash state: expect the second dash.

    :returns: True
    """
    char = self.stream.char()

    if char != " - ":
        # No escape sequence: re-read the character as plain script data.
        self.stream.unget(char)
        self.state = self.scriptDataState
        return True

    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " - "})
    self.state = self.scriptDataEscapedDashDashState
    return True
def scriptDataEscapedState(self):
    """Script data escaped state: emit text inside an escaped section.

    :returns: True
    """
    char = self.stream.char()

    if char == " - ":
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " - "})
        self.state = self.scriptDataEscapedDashState
        return True

    if char == " < ":
        self.state = self.scriptDataEscapedLessThanSignState
        return True

    if char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue = self.tokenQueue
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
        return True

    if char == EOF:
        # Hand over to the data state.
        self.state = self.dataState
        return True

    # Ordinary text: take everything up to the next special character.
    rest = self.stream.charsUntil((" < ", " - ", " \u0000 "))
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": char + rest})
    return True
def scriptDataEscapedDashState(self):
    """Script data escaped dash state: one dash seen in an escaped section.

    :returns: True
    """
    char = self.stream.char()

    if char == " - ":
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " - "})
        self.state = self.scriptDataEscapedDashDashState
        return True

    if char == " < ":
        self.state = self.scriptDataEscapedLessThanSignState
        return True

    if char == EOF:
        # Hand over to the data state.
        self.state = self.dataState
        return True

    queue = self.tokenQueue
    if char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
    else:
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": char})
    # Both remaining cases drop back to the plain escaped state.
    self.state = self.scriptDataEscapedState
    return True
def scriptDataEscapedDashDashState(self):
    """Script data escaped dash dash state: two dashes seen while escaped.

    :returns: True
    """
    char = self.stream.char()

    if char == " - ":
        # Further dashes are emitted without leaving this state.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " - "})
        return True

    if char == " < ":
        self.state = self.scriptDataEscapedLessThanSignState
        return True

    if char == " > ":
        # End of the escaped section: back to ordinary script data.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " > "})
        self.state = self.scriptDataState
        return True

    if char == EOF:
        # Hand over to the data state.
        self.state = self.dataState
        return True

    queue = self.tokenQueue
    if char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
    else:
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": char})
    # Both remaining cases drop back to the plain escaped state.
    self.state = self.scriptDataEscapedState
    return True
def scriptDataEscapedLessThanSignState(self):
    """Script data escaped less-than sign state.

    Looks for either an end tag or the start of a nested (double-escaped)
    tag name.

    :returns: True
    """
    char = self.stream.char()

    if char == " / ":
        # Possible end tag: reset the buffer that collects its name.
        self.temporaryBuffer = " "
        self.state = self.scriptDataEscapedEndTagOpenState
        return True

    if char in asciiLetters:
        # Emit what was read and start buffering the name for the
        # double-escape check.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " < " + char})
        self.temporaryBuffer = char
        self.state = self.scriptDataDoubleEscapeStartState
        return True

    # Otherwise emit the less-than literal as text and re-read the character
    # in the escaped state.
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " < "})
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
def scriptDataEscapedEndTagOpenState(self):
    """Script data escaped end tag open state: expect a tag name letter.

    :returns: True
    """
    char = self.stream.char()

    if char in asciiLetters:
        # The buffer is replaced (not extended) with the first letter.
        self.temporaryBuffer = char
        self.state = self.scriptDataEscapedEndTagNameState
        return True

    # Not a tag name: emit the opener as text and re-read the character in
    # the escaped state.
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " </ "})
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
def scriptDataEscapedEndTagNameState(self):
    """Script data escaped end tag name state: collect the end tag name.

    The end tag only counts when the buffered name matches the name of
    ``self.currentToken`` (compared after lower-casing both).

    :returns: True
    """
    appropriate = self.currentToken and self.currentToken[" name "].lower() == self.temporaryBuffer.lower()
    char = self.stream.char()
    isSpace = char in spaceCharacters

    if appropriate and (isSpace or char == " / " or char == " > "):
        # The same end tag token is created whichever terminator was read;
        # only the follow-up differs.
        self.currentToken = {" type ": tokenTypes[" EndTag "],
                             " name ": self.temporaryBuffer,
                             " data ": [], " selfClosing ": False}
        if isSpace:
            self.state = self.beforeAttributeNameState
        elif char == " / ":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif char in asciiLetters:
        self.temporaryBuffer += char
    else:
        # Not an end tag after all: everything read so far is plain text.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": " </ " + self.temporaryBuffer})
        self.stream.unget(char)
        self.state = self.scriptDataEscapedState
    return True
def scriptDataDoubleEscapeStartState(self):
    """Script data double escape start state.

    Buffers letters of a tag name; once the name ends, the double-escaped
    state is entered only if the lower-cased buffer equals the script
    literal.

    :returns: True
    """
    char = self.stream.char()

    if char in spaceCharacters or char in (" / ", " > "):
        # End of the name: emit the terminator and pick the next state.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": char})
        isScript = self.temporaryBuffer.lower() == " script "
        self.state = (self.scriptDataDoubleEscapedState if isScript
                      else self.scriptDataEscapedState)
        return True

    if char in asciiLetters:
        # Still inside the name: emit the letter and remember it.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": char})
        self.temporaryBuffer += char
        return True

    # Anything else is re-read in the escaped state.
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
def scriptDataDoubleEscapedState(self):
    """Script data double escaped state: emit text one character at a time.

    :returns: True
    """
    char = self.stream.char()
    queue = self.tokenQueue

    if char == " - ":
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " - "})
        self.state = self.scriptDataDoubleEscapedDashState
    elif char == " < ":
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " < "})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
    elif char == EOF:
        # Report the unterminated script and hand over to the data state.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " eof-in-script-in-script "})
        self.state = self.dataState
    else:
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": char})
    return True
def scriptDataDoubleEscapedDashState(self):
    """Script data double escaped dash state: one dash has been seen.

    :returns: True
    """
    char = self.stream.char()
    queue = self.tokenQueue

    if char == " - ":
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " - "})
        self.state = self.scriptDataDoubleEscapedDashDashState
    elif char == " < ":
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " < "})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
        self.state = self.scriptDataDoubleEscapedState
    elif char == EOF:
        # Report the unterminated script and hand over to the data state.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " eof-in-script-in-script "})
        self.state = self.dataState
    else:
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": char})
        self.state = self.scriptDataDoubleEscapedState
    return True
def scriptDataDoubleEscapedDashDashState(self):
    """Script data double escaped dash dash state: two dashes seen.

    :returns: True
    """
    char = self.stream.char()
    queue = self.tokenQueue

    if char == " - ":
        # Further dashes are emitted without leaving this state.
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " - "})
    elif char == " < ":
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " < "})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif char == " > ":
        # End of the escaped section: back to ordinary script data.
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " > "})
        self.state = self.scriptDataState
    elif char == " \u0000 ":
        # Error, and the replacement literal is emitted instead.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " invalid-codepoint "})
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": " \uFFFD "})
        self.state = self.scriptDataDoubleEscapedState
    elif char == EOF:
        # Report the unterminated script and hand over to the data state.
        queue.append({" type ": tokenTypes[" ParseError "],
                      " data ": " eof-in-script-in-script "})
        self.state = self.dataState
    else:
        queue.append({" type ": tokenTypes[" Characters "],
                      " data ": char})
        self.state = self.scriptDataDoubleEscapedState
    return True
def scriptDataDoubleEscapedLessThanSignState(self):
    """Script data double escaped less-than sign state.

    :returns: True
    """
    char = self.stream.char()

    if char != " / ":
        # Nothing special: re-read the character in the double escaped
        # state.
        self.stream.unget(char)
        self.state = self.scriptDataDoubleEscapedState
        return True

    # Possible end of the nested script: emit the character and start
    # collecting the tag name in a freshly reset buffer.
    self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                            " data ": " / "})
    self.temporaryBuffer = " "
    self.state = self.scriptDataDoubleEscapeEndState
    return True
def scriptDataDoubleEscapeEndState(self):
    """Script data double escape end state.

    Buffers letters of a tag name; once the name ends, the escaped state is
    re-entered only if the lower-cased buffer equals the script literal,
    otherwise the double-escaped state continues.

    :returns: True
    """
    char = self.stream.char()

    if char in spaceCharacters or char in (" / ", " > "):
        # End of the name: emit the terminator and pick the next state.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": char})
        isScript = self.temporaryBuffer.lower() == " script "
        self.state = (self.scriptDataEscapedState if isScript
                      else self.scriptDataDoubleEscapedState)
        return True

    if char in asciiLetters:
        # Still inside the name: emit the letter and remember it.
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": char})
        self.temporaryBuffer += char
        return True

    # Anything else is re-read in the double escaped state.
    self.stream.unget(char)
    self.state = self.scriptDataDoubleEscapedState
    return True
def beforeAttributeNameState(self):
    """Decide what to do before an attribute name has started.

    Reads one character from the stream and, depending on it:

    * skips a run of space characters;
    * starts a new ``[name, value]`` pair on the current token and moves
      to ``attributeNameState`` (letters and any otherwise unhandled
      character);
    * emits the current token (right-bracket literal);
    * moves to ``selfClosingStartTagState`` (solidus literal);
    * queues a ParseError and still starts an attribute (quote, equals and
      less-than literals; the NUL literal starts the attribute with the
      replacement-character literal instead);
    * queues a ParseError and returns to ``dataState`` on EOF.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def start_attribute(first_piece):
        # New attribute pair: name seeded with ``first_piece``, value unset.
        self.currentToken[" data "].append([first_piece, " "])
        self.state = self.attributeNameState

    char = self.stream.char()
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif char in asciiLetters:
        start_attribute(char)
    elif char == " > ":
        self.emitCurrentToken()
    elif char == " / ":
        self.state = self.selfClosingStartTagState
    elif char in (" ' ", ' " ', " = ", " < "):
        parse_error(" invalid-character-in-attribute-name ")
        start_attribute(char)
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        start_attribute(" \uFFFD ")
    elif char is EOF:
        parse_error(" expected-attribute-name-but-got-eof ")
        self.state = self.dataState
    else:
        start_attribute(char)
    return True
def attributeNameState(self):
    """Accumulate the name of the attribute currently being built.

    Reads one character from the stream. Characters that belong to the
    name (letters plus the following run consumed through ``charsUntil``,
    the replacement-character literal for NUL, quote / less-than literals
    after a ParseError, and any otherwise unhandled character) are
    appended to the name of the last attribute pair and the tokenizer
    stays in this state.

    Every other character leaves the state. On leaving, the finished name
    is translated through ``asciiUpper2Lower`` and compared with the names
    of all earlier pairs on the current token; a ``duplicate-attribute``
    ParseError is queued on the first match. Attributes are not dropped
    here, so values can still be appended to them afterwards.

    A right-bracket literal emits the current token, but only after the
    duplicate check has run while the attribute data is still a list.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    char = self.stream.char()
    still_in_name = False
    emit_when_done = False

    if char == " = ":
        self.state = self.beforeAttributeValueState
    elif char in asciiLetters:
        self.currentToken[" data "][-1][0] += (
            char + self.stream.charsUntil(asciiLetters, True))
        still_in_name = True
    elif char == " > ":
        # Emission is deferred until the duplicate check below has run.
        emit_when_done = True
    elif char in spaceCharacters:
        self.state = self.afterAttributeNameState
    elif char == " / ":
        self.state = self.selfClosingStartTagState
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" data "][-1][0] += " \uFFFD "
        still_in_name = True
    elif char in (" ' ", ' " ', " < "):
        parse_error(" invalid-character-in-attribute-name ")
        self.currentToken[" data "][-1][0] += char
        still_in_name = True
    elif char is EOF:
        parse_error(" eof-in-attribute-name ")
        self.state = self.dataState
    else:
        self.currentToken[" data "][-1][0] += char
        still_in_name = True

    if not still_in_name:
        # Normalise the completed name, then report (but do not remove)
        # a duplicate so the error is raised at the right time.
        self.currentToken[" data "][-1][0] = (
            self.currentToken[" data "][-1][0].translate(asciiUpper2Lower))
        if any(self.currentToken[" data "][-1][0] == earlier
               for earlier, _ in self.currentToken[" data "][:-1]):
            parse_error(" duplicate-attribute ")
        if emit_when_done:
            self.emitCurrentToken()
    return True
def afterAttributeNameState(self):
    """Handle the character that follows a completed attribute name.

    Reads one character from the stream and, depending on it:

    * skips a run of space characters;
    * moves to ``beforeAttributeValueState`` (equals literal);
    * emits the current token (right-bracket literal);
    * moves to ``selfClosingStartTagState`` (solidus literal);
    * starts a new attribute pair and moves to ``attributeNameState``
      (letters and any otherwise unhandled character; NUL and the quote /
      less-than literals do the same after queuing a ParseError, with NUL
      seeding the name with the replacement-character literal);
    * queues a ParseError and returns to ``dataState`` on EOF.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def start_attribute(first_piece):
        self.currentToken[" data "].append([first_piece, " "])
        self.state = self.attributeNameState

    char = self.stream.char()
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif char == " = ":
        self.state = self.beforeAttributeValueState
    elif char == " > ":
        self.emitCurrentToken()
    elif char in asciiLetters:
        start_attribute(char)
    elif char == " / ":
        self.state = self.selfClosingStartTagState
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        start_attribute(" \uFFFD ")
    elif char in (" ' ", ' " ', " < "):
        parse_error(" invalid-character-after-attribute-name ")
        start_attribute(char)
    elif char is EOF:
        parse_error(" expected-end-of-tag-but-got-eof ")
        self.state = self.dataState
    else:
        start_attribute(char)
    return True
def beforeAttributeValueState(self):
    """Choose how the value of the current attribute will be read.

    Reads one character from the stream and, depending on it:

    * skips a run of space characters;
    * moves to the double- or single-quoted value state on the matching
      quote literal;
    * moves to ``attributeValueUnQuotedState`` and pushes the character
      back (ampersand literal);
    * queues a ParseError and emits the current token (right-bracket
      literal);
    * appends to the value of the last attribute pair and moves to
      ``attributeValueUnQuotedState`` (NUL, as the replacement-character
      literal after a ParseError; equals / less-than / backtick literals
      after a ParseError; and any otherwise unhandled character);
    * queues a ParseError and returns to ``dataState`` on EOF.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def begin_unquoted(text):
        self.currentToken[" data "][-1][1] += text
        self.state = self.attributeValueUnQuotedState

    char = self.stream.char()
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif char == " \" ":
        self.state = self.attributeValueDoubleQuotedState
    elif char == " & ":
        # Let the unquoted state re-read the character itself.
        self.state = self.attributeValueUnQuotedState
        self.stream.unget(char)
    elif char == " ' ":
        self.state = self.attributeValueSingleQuotedState
    elif char == " > ":
        parse_error(" expected-attribute-value-but-got-right-bracket ")
        self.emitCurrentToken()
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        begin_unquoted(" \uFFFD ")
    elif char in (" = ", " < ", " ` "):
        parse_error(" equals-in-unquoted-attribute-value ")
        begin_unquoted(char)
    elif char is EOF:
        parse_error(" expected-attribute-value-but-got-eof ")
        self.state = self.dataState
    else:
        begin_unquoted(char)
    return True
def attributeValueDoubleQuotedState(self):
    """Read part of a double-quoted attribute value.

    Reads one character from the stream. The closing quote literal moves
    to ``afterAttributeValueState``; the ampersand literal delegates to
    ``processEntityInAttribute``; NUL queues a ParseError and appends the
    replacement-character literal to the value; EOF queues a ParseError
    and returns to ``dataState``. Any other character is appended to the
    value of the last attribute pair together with whatever ``charsUntil``
    returns for the quote / ampersand / NUL literals.

    Always returns True.
    """
    char = self.stream.char()
    if char == " \" ":
        self.state = self.afterAttributeValueState
        return True
    if char == " & ":
        self.processEntityInAttribute(' " ')
        return True
    if char == " \u0000 ":
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " invalid-codepoint "})
        self.currentToken[" data "][-1][1] += " \uFFFD "
        return True
    if char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " eof-in-attribute-value-double-quote "})
        self.state = self.dataState
        return True

    # Ordinary text: take this character plus the following run in one go.
    self.currentToken[" data "][-1][1] += (
        char + self.stream.charsUntil((" \" ", " & ", " \u0000 ")))
    return True
def attributeValueSingleQuotedState(self):
    """Read part of a single-quoted attribute value.

    Reads one character from the stream. The closing quote literal moves
    to ``afterAttributeValueState``; the ampersand literal delegates to
    ``processEntityInAttribute``; NUL queues a ParseError and appends the
    replacement-character literal to the value; EOF queues a ParseError
    and returns to ``dataState``. Any other character is appended to the
    value of the last attribute pair together with whatever ``charsUntil``
    returns for the quote / ampersand / NUL literals.

    Always returns True.
    """
    char = self.stream.char()
    if char == " ' ":
        self.state = self.afterAttributeValueState
        return True
    if char == " & ":
        self.processEntityInAttribute(" ' ")
        return True
    if char == " \u0000 ":
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " invalid-codepoint "})
        self.currentToken[" data "][-1][1] += " \uFFFD "
        return True
    if char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " eof-in-attribute-value-single-quote "})
        self.state = self.dataState
        return True

    # Ordinary text: take this character plus the following run in one go.
    self.currentToken[" data "][-1][1] += (
        char + self.stream.charsUntil((" ' ", " & ", " \u0000 ")))
    return True
def attributeValueUnQuotedState(self):
    """Read part of an unquoted attribute value.

    Reads one character from the stream and, depending on it:

    * moves to ``beforeAttributeNameState`` (space characters);
    * delegates to ``processEntityInAttribute`` (ampersand literal);
    * emits the current token (right-bracket literal);
    * queues a ParseError and appends the character to the value (quote,
      equals, less-than and backtick literals);
    * queues a ParseError and appends the replacement-character literal
      (NUL);
    * queues a ParseError and returns to ``dataState`` on EOF;
    * otherwise appends the character plus the run returned by
      ``charsUntil`` for the set of delimiters listed below.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
    elif char == " & ":
        self.processEntityInAttribute(" > ")
    elif char == " > ":
        self.emitCurrentToken()
    elif char in (' " ', " ' ", " = ", " < ", " ` "):
        parse_error(" unexpected-character-in-unquoted-attribute-value ")
        self.currentToken[" data "][-1][1] += char
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" data "][-1][1] += " \uFFFD "
    elif char is EOF:
        parse_error(" eof-in-attribute-value-no-quotes ")
        self.state = self.dataState
    else:
        # Everything up to the next delimiter belongs to the value.
        self.currentToken[" data "][-1][1] += char + self.stream.charsUntil(
            frozenset((" & ", " > ", ' " ', " ' ", " = ", " < ", " ` ", " \u0000 "))
            | spaceCharacters)
    return True
def afterAttributeValueState(self):
    """Handle the character that follows a quoted attribute value.

    Reads one character from the stream. Space characters move to
    ``beforeAttributeNameState``; the right-bracket literal emits the
    current token; the solidus literal moves to
    ``selfClosingStartTagState``. EOF and any other character each queue
    their own ParseError and push the character back, ending in
    ``dataState`` and ``beforeAttributeNameState`` respectively.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True
    if char == " > ":
        self.emitCurrentToken()
        return True
    if char == " / ":
        self.state = self.selfClosingStartTagState
        return True

    if char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " unexpected-EOF-after-attribute-value "})
        self.stream.unget(char)
        self.state = self.dataState
    else:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " unexpected-character-after-attribute-value "})
        # Re-read the character as the possible start of another attribute.
        self.stream.unget(char)
        self.state = self.beforeAttributeNameState
    return True
def selfClosingStartTagState(self):
    """Handle the character that follows a solidus inside a tag.

    Reads one character from the stream. The right-bracket literal marks
    the current token as self-closing and emits it. EOF and any other
    character each queue their own ParseError and push the character
    back, ending in ``dataState`` and ``beforeAttributeNameState``
    respectively.

    Always returns True.
    """
    char = self.stream.char()
    if char == " > ":
        self.currentToken[" selfClosing "] = True
        self.emitCurrentToken()
        return True

    if char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " unexpected-EOF-after-solidus-in-tag "})
        self.stream.unget(char)
        self.state = self.dataState
    else:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " unexpected-character-after-solidus-in-tag "})
        # The solidus is ignored; resume attribute parsing at this character.
        self.stream.unget(char)
        self.state = self.beforeAttributeNameState
    return True
def bogusCommentState(self):
    """Turn everything up to the next right-bracket literal into a comment.

    The text returned by ``charsUntil`` (which stops at EOF on its own)
    has every NUL literal replaced with the replacement-character literal
    and is queued as a Comment token. The single character that ended the
    run is then consumed and the tokenizer returns to ``dataState``.

    Always returns True.
    """
    comment_text = self.stream.charsUntil(" > ").replace(" \u0000 ", " \uFFFD ")
    self.tokenQueue.append({" type ": tokenTypes[" Comment "], " data ": comment_text})

    # Swallow the terminator that stopped the run (right bracket or EOF).
    self.stream.char()
    self.state = self.dataState
    return True
def markupDeclarationOpenState(self):
    """Recognise the start of a comment, a doctype or a CDATA section.

    Characters are consumed one at a time and remembered so they can be
    pushed back if nothing matches:

    * two dash literals start an empty Comment token and move to
      ``commentStartState``;
    * the d/D literal followed, case-insensitively, by the remaining
      doctype letters starts a Doctype token (empty name, no public or
      system id, marked correct) and moves to ``doctypeState``;
    * the opening-bracket literal followed by the exact C, D, A, T, A,
      bracket literals moves to ``cdataSectionState``, but only when a
      parser is attached and its innermost open element is not in the
      tree's default namespace.

    Otherwise a ParseError is queued, every consumed character is pushed
    back in reverse order and the tokenizer moves to
    ``bogusCommentState``.

    Always returns True.
    """
    consumed = [self.stream.char()]
    first = consumed[0]

    if first == " - ":
        consumed.append(self.stream.char())
        if consumed[-1] == " - ":
            self.currentToken = {" type ": tokenTypes[" Comment "], " data ": " "}
            self.state = self.commentStartState
            return True
    elif first in (' d ', ' D '):
        for options in ((' o ', ' O '), (' c ', ' C '), (' t ', ' T '),
                        (' y ', ' Y '), (' p ', ' P '), (' e ', ' E ')):
            consumed.append(self.stream.char())
            if consumed[-1] not in options:
                break
        else:
            # Every letter matched.
            self.currentToken = {" type ": tokenTypes[" Doctype "],
                                 " name ": " ",
                                 " publicId ": None, " systemId ": None,
                                 " correct ": True}
            self.state = self.doctypeState
            return True
    elif (first == " [ " and
          self.parser is not None and
          self.parser.tree.openElements and
          self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
        for wanted in [" C ", " D ", " A ", " T ", " A ", " [ "]:
            consumed.append(self.stream.char())
            if consumed[-1] != wanted:
                break
        else:
            self.state = self.cdataSectionState
            return True

    # Nothing matched: report it and replay the input as a bogus comment.
    self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                            " data ": " expected-dashes-or-doctype "})
    for char in reversed(consumed):
        self.stream.unget(char)
    self.state = self.bogusCommentState
    return True
def commentStartState(self):
    """Handle the first character after a comment has been opened.

    Reads one character from the stream and, depending on it:

    * moves to ``commentStartDashState`` (dash literal);
    * queues a ParseError, appends the replacement-character literal to
      the comment data and moves to ``commentState`` (NUL);
    * queues a ParseError, emits the comment token and returns to
      ``dataState`` (right-bracket literal, and EOF with its own code);
    * otherwise appends the character to the comment data and moves to
      ``commentState``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    data = self.stream.char()
    if data == " - ":
        self.state = self.commentStartDashState
    elif data == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" data "] += " \uFFFD "
        # Once data has been appended the comment is no longer "starting".
        # Without this switch a following right bracket would be treated
        # as an incorrect-comment close instead of comment text, unlike
        # the NUL handling in the other comment states.
        self.state = self.commentState
    elif data == " > ":
        parse_error(" incorrect-comment ")
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data is EOF:
        parse_error(" eof-in-comment ")
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken[" data "] += data
        self.state = self.commentState
    return True
def commentStartDashState(self):
    """Handle the character after a single dash at the start of a comment.

    Reads one character from the stream and, depending on it:

    * moves to ``commentEndState`` (dash literal);
    * queues a ParseError, appends the dash-plus-replacement literal to
      the comment data and moves to ``commentState`` (NUL);
    * queues a ParseError, emits the comment token and returns to
      ``dataState`` (right-bracket literal, and EOF with its own code);
    * otherwise appends the dash literal plus the character to the
      comment data and moves to ``commentState``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    data = self.stream.char()
    if data == " - ":
        self.state = self.commentEndState
    elif data == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" data "] += " - \uFFFD "
        # The pending dash has been flushed into the data, so continue as
        # ordinary comment text. Staying here would flush a second dash
        # for the next character, unlike the NUL handling in the other
        # comment states.
        self.state = self.commentState
    elif data == " > ":
        parse_error(" incorrect-comment ")
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data is EOF:
        parse_error(" eof-in-comment ")
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken[" data "] += " - " + data
        self.state = self.commentState
    return True
def commentState(self):
    """Accumulate the body of a comment.

    Reads one character from the stream. The dash literal moves to
    ``commentEndDashState``; NUL queues a ParseError and appends the
    replacement-character literal; EOF queues a ParseError, emits the
    comment token and returns to ``dataState``. Any other character is
    appended to the comment data together with the run returned by
    ``charsUntil`` for the dash / NUL literals.

    Always returns True.
    """
    char = self.stream.char()
    if char == " - ":
        self.state = self.commentEndDashState
        return True
    if char == " \u0000 ":
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " invalid-codepoint "})
        self.currentToken[" data "] += " \uFFFD "
        return True
    if char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " eof-in-comment "})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # Plain comment text: grab the whole run up to the next dash or NUL.
    self.currentToken[" data "] += (
        char + self.stream.charsUntil((" - ", " \u0000 ")))
    return True
def commentEndDashState(self):
    """Handle the character after a single dash inside a comment body.

    Reads one character from the stream. A second dash literal moves to
    ``commentEndState``. NUL queues a ParseError, appends the
    dash-plus-replacement literal and returns to ``commentState``. EOF
    queues a ParseError, emits the comment token and returns to
    ``dataState``. Any other character is appended after a dash literal
    and the tokenizer returns to ``commentState``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    char = self.stream.char()
    if char == " - ":
        self.state = self.commentEndState
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" data "] += " - \uFFFD "
        self.state = self.commentState
    elif char is EOF:
        parse_error(" eof-in-comment-end-dash ")
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        # The lone dash was ordinary text after all.
        self.currentToken[" data "] += " - " + char
        self.state = self.commentState
    return True
def commentEndState(self):
    """Handle the character after two dashes inside a comment.

    Reads one character from the stream and, depending on it:

    * emits the comment token and returns to ``dataState`` (right-bracket
      literal);
    * queues a ParseError, appends the double-dash-plus-replacement
      literal and returns to ``commentState`` (NUL);
    * queues a ParseError and moves to ``commentEndBangState`` (bang
      literal);
    * queues a ParseError and appends the character, staying in this
      state (dash literal);
    * queues a ParseError, emits the comment token and returns to
      ``dataState`` (EOF);
    * otherwise queues a ParseError, appends the double-dash literal plus
      the character and returns to ``commentState``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    char = self.stream.char()
    if char == " > ":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" data "] += " -- \uFFFD "
        self.state = self.commentState
    elif char == " ! ":
        parse_error(" unexpected-bang-after-double-dash-in-comment ")
        self.state = self.commentEndBangState
    elif char == " - ":
        parse_error(" unexpected-dash-after-double-dash-in-comment ")
        self.currentToken[" data "] += char
    elif char is EOF:
        parse_error(" eof-in-comment-double-dash ")
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        # XXX (marker kept from the original source)
        parse_error(" unexpected-char-in-comment ")
        self.currentToken[" data "] += " -- " + char
        self.state = self.commentState
    return True
def commentEndBangState(self):
    """Handle the character after a double dash and bang inside a comment.

    Reads one character from the stream. The right-bracket literal emits
    the comment token and returns to ``dataState``. The dash literal
    appends the dash-dash-bang literal and moves to
    ``commentEndDashState``. NUL queues a ParseError, appends the
    dash-dash-bang-plus-replacement literal and returns to
    ``commentState``. EOF queues a ParseError, emits the comment token
    and returns to ``dataState``. Any other character is appended after
    the dash-dash-bang literal and the tokenizer returns to
    ``commentState``.

    Always returns True.
    """
    char = self.stream.char()
    if char == " > ":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == " - ":
        self.currentToken[" data "] += " --! "
        self.state = self.commentEndDashState
        return True
    if char == " \u0000 ":
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " invalid-codepoint "})
        self.currentToken[" data "] += " --! \uFFFD "
        self.state = self.commentState
        return True
    if char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " eof-in-comment-end-bang-state "})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    self.currentToken[" data "] += " --! " + char
    self.state = self.commentState
    return True
def doctypeState(self):
    """Handle the character directly after the doctype keyword.

    Reads one character from the stream. Space characters move to
    ``beforeDoctypeNameState``. EOF queues a ParseError, marks the
    doctype token as not correct, emits it and returns to ``dataState``.
    Any other character queues a ParseError, is pushed back, and the
    tokenizer moves to ``beforeDoctypeNameState``.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypeNameState
        return True

    if char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " expected-doctype-name-but-got-eof "})
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " need-space-after-doctype "})
        # Carry on as if the separator had been present.
        self.stream.unget(char)
        self.state = self.beforeDoctypeNameState
    return True
def beforeDoctypeNameState(self):
    """Find the first character of the doctype name.

    Reads one character from the stream. Space characters are ignored.
    The right-bracket literal and EOF each queue their own ParseError,
    mark the doctype token as not correct, emit it and return to
    ``dataState``. NUL queues a ParseError and starts the name with the
    replacement-character literal; any other character starts the name
    with itself. Both then move to ``doctypeNameState``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def emit_incorrect(code):
        parse_error(code)
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState

    def begin_name(text):
        self.currentToken[" name "] = text
        self.state = self.doctypeNameState

    char = self.stream.char()
    if char in spaceCharacters:
        pass
    elif char == " > ":
        emit_incorrect(" expected-doctype-name-but-got-right-bracket ")
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        begin_name(" \uFFFD ")
    elif char is EOF:
        emit_incorrect(" expected-doctype-name-but-got-eof ")
    else:
        begin_name(char)
    return True
def doctypeNameState(self):
    """Accumulate the doctype name.

    Reads one character from the stream. Whenever the name ends (space
    characters, the right-bracket literal, or EOF) it is translated
    through ``asciiUpper2Lower``. Space characters then move to
    ``afterDoctypeNameState``; the right-bracket literal emits the token
    and returns to ``dataState``; EOF additionally queues a ParseError and
    marks the token as not correct before emitting it. NUL queues a
    ParseError and appends the replacement-character literal; any other
    character is appended to the name as is.

    Always returns True.
    """
    def lowercase_name():
        self.currentToken[" name "] = self.currentToken[" name "].translate(asciiUpper2Lower)

    char = self.stream.char()
    if char in spaceCharacters:
        lowercase_name()
        self.state = self.afterDoctypeNameState
    elif char == " > ":
        lowercase_name()
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char == " \u0000 ":
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " invalid-codepoint "})
        self.currentToken[" name "] += " \uFFFD "
        self.state = self.doctypeNameState
    elif char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " eof-in-doctype-name "})
        self.currentToken[" correct "] = False
        lowercase_name()
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken[" name "] += char
    return True
def afterDoctypeNameState(self):
    """Look for a public/system keyword or the end of the doctype.

    Reads one character from the stream. Space characters are ignored.
    The right-bracket literal emits the doctype token and returns to
    ``dataState``. EOF marks the token as not correct, queues a
    ParseError, emits the token and returns to ``dataState``.

    For anything else the tokenizer tries, case-insensitively, to match
    the remaining letters of the public keyword (after p/P) or the system
    keyword (after s/S), reading one character per expected letter. A
    full match moves to the corresponding after-keyword state. On any
    failure only the last character read is pushed back, a ParseError
    carrying that character in ``datavars`` is queued, the token is
    marked as not correct and the tokenizer moves to
    ``bogusDoctypeState``.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == " > ":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        self.currentToken[" correct "] = False
        self.stream.unget(char)
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " eof-in-doctype "})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    if char in (" p ", " P "):
        for options in ((" u ", " U "), (" b ", " B "), (" l ", " L "),
                        (" i ", " I "), (" c ", " C ")):
            char = self.stream.char()
            if char not in options:
                break
        else:
            self.state = self.afterDoctypePublicKeywordState
            return True
    elif char in (" s ", " S "):
        for options in ((" y ", " Y "), (" s ", " S "), (" t ", " T "),
                        (" e ", " E "), (" m ", " M ")):
            char = self.stream.char()
            if char not in options:
                break
        else:
            self.state = self.afterDoctypeSystemKeywordState
            return True

    # Everything read before the current character was a letter, which is
    # garbage in a bogus doctype and can be dropped; only the latest
    # character might be a right bracket or EOF, so only it is put back.
    self.stream.unget(char)
    self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                            " data ": " expected-space-or-right-bracket-in-doctype ",
                            " datavars ": {" data ": char}})
    self.currentToken[" correct "] = False
    self.state = self.bogusDoctypeState
    return True
def afterDoctypePublicKeywordState(self):
    """Handle the character directly after the public keyword.

    Reads one character from the stream. Space characters move to
    ``beforeDoctypePublicIdentifierState``. A quote literal queues a
    ParseError, is pushed back, and the tokenizer moves to the same
    state. EOF queues a ParseError, marks the doctype token as not
    correct, emits it and returns to ``dataState``. Any other character
    is pushed back without an error and the tokenizer moves to
    ``beforeDoctypePublicIdentifierState``.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypePublicIdentifierState
        return True
    if char is not EOF:
        if char in (" ' ", ' " '):
            # A quote straight after the keyword is reported but tolerated.
            self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                    " data ": " unexpected-char-in-doctype "})
        self.stream.unget(char)
        self.state = self.beforeDoctypePublicIdentifierState
        return True

    self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                            " data ": " eof-in-doctype "})
    self.currentToken[" correct "] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def beforeDoctypePublicIdentifierState(self):
    """Find the opening quote of the doctype public identifier.

    Reads one character from the stream. Space characters are ignored. A
    double- or single-quote literal resets ``publicId`` and moves to the
    matching quoted public-identifier state. The right-bracket literal
    and EOF each queue their own ParseError, mark the token as not
    correct, emit it and return to ``dataState``. Any other character
    queues a ParseError, marks the token as not correct and moves to
    ``bogusDoctypeState``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def emit_incorrect(code):
        parse_error(code)
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState

    char = self.stream.char()
    if char in spaceCharacters:
        pass
    elif char == " \" ":
        self.currentToken[" publicId "] = " "
        self.state = self.doctypePublicIdentifierDoubleQuotedState
    elif char == " ' ":
        self.currentToken[" publicId "] = " "
        self.state = self.doctypePublicIdentifierSingleQuotedState
    elif char == " > ":
        emit_incorrect(" unexpected-end-of-doctype ")
    elif char is EOF:
        emit_incorrect(" eof-in-doctype ")
    else:
        parse_error(" unexpected-char-in-doctype ")
        self.currentToken[" correct "] = False
        self.state = self.bogusDoctypeState
    return True
def doctypePublicIdentifierDoubleQuotedState(self):
    """Accumulate a double-quoted doctype public identifier.

    Reads one character from the stream. The closing quote literal moves
    to ``afterDoctypePublicIdentifierState``. NUL queues a ParseError and
    appends the replacement-character literal to ``publicId``. The
    right-bracket literal and EOF each queue their own ParseError, mark
    the token as not correct, emit it and return to ``dataState``. Any
    other character is appended to ``publicId``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def emit_incorrect(code):
        parse_error(code)
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState

    char = self.stream.char()
    if char == " \" ":
        self.state = self.afterDoctypePublicIdentifierState
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" publicId "] += " \uFFFD "
    elif char == " > ":
        emit_incorrect(" unexpected-end-of-doctype ")
    elif char is EOF:
        emit_incorrect(" eof-in-doctype ")
    else:
        self.currentToken[" publicId "] += char
    return True
def doctypePublicIdentifierSingleQuotedState(self):
    """Accumulate a single-quoted doctype public identifier.

    Reads one character from the stream. The closing quote literal moves
    to ``afterDoctypePublicIdentifierState``. NUL queues a ParseError and
    appends the replacement-character literal to ``publicId``. The
    right-bracket literal and EOF each queue their own ParseError, mark
    the token as not correct, emit it and return to ``dataState``. Any
    other character is appended to ``publicId``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def emit_incorrect(code):
        parse_error(code)
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState

    char = self.stream.char()
    if char == " ' ":
        self.state = self.afterDoctypePublicIdentifierState
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" publicId "] += " \uFFFD "
    elif char == " > ":
        emit_incorrect(" unexpected-end-of-doctype ")
    elif char is EOF:
        emit_incorrect(" eof-in-doctype ")
    else:
        self.currentToken[" publicId "] += char
    return True
def afterDoctypePublicIdentifierState(self):
    """Handle the character after the closing quote of the public identifier.

    Reads one character from the stream and, depending on it:

    * moves to ``betweenDoctypePublicAndSystemIdentifiersState`` (space
      characters);
    * emits the doctype token and returns to ``dataState`` (right-bracket
      literal);
    * queues a ParseError, resets ``systemId`` and moves to the matching
      quoted system-identifier state (quote literals);
    * queues a ParseError, marks the token as not correct, emits it and
      returns to ``dataState`` (EOF);
    * otherwise queues a ParseError, marks the token as not correct and
      moves to ``bogusDoctypeState``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.betweenDoctypePublicAndSystemIdentifiersState
    elif char == " > ":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char == ' " ':
        # A system identifier with no separating space: reported, accepted.
        parse_error(" unexpected-char-in-doctype ")
        self.currentToken[" systemId "] = " "
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif char == " ' ":
        parse_error(" unexpected-char-in-doctype ")
        self.currentToken[" systemId "] = " "
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    elif char is EOF:
        parse_error(" eof-in-doctype ")
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        parse_error(" unexpected-char-in-doctype ")
        self.currentToken[" correct "] = False
        self.state = self.bogusDoctypeState
    return True
def betweenDoctypePublicAndSystemIdentifiersState(self):
    """Handle characters between the public and system identifiers.

    Reads one character from the stream and, depending on it:

    * ignores space characters;
    * emits the doctype token and returns to ``dataState`` (right-bracket
      literal);
    * resets ``systemId`` and moves to the matching quoted
      system-identifier state (quote literals);
    * queues a ParseError, marks the token as not correct, emits it and
      returns to ``dataState`` (EOF);
    * otherwise queues a ParseError, marks the token as not correct and
      moves to ``bogusDoctypeState``.

    Always returns True.
    """
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == " > ":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data == ' " ':
        self.currentToken[" systemId "] = " "
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif data == " ' ":
        self.currentToken[" systemId "] = " "
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    elif data is EOF:
        # Identity test, matching how every other state detects EOF.
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " eof-in-doctype "})
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " unexpected-char-in-doctype "})
        self.currentToken[" correct "] = False
        self.state = self.bogusDoctypeState
    return True
def afterDoctypeSystemKeywordState(self):
    """Handle the character directly after the system keyword.

    Reads one character from the stream. Space characters move to
    ``beforeDoctypeSystemIdentifierState``. A quote literal queues a
    ParseError, is pushed back, and the tokenizer moves to the same
    state. EOF queues a ParseError, marks the doctype token as not
    correct, emits it and returns to ``dataState``. Any other character
    is pushed back without an error and the tokenizer moves to
    ``beforeDoctypeSystemIdentifierState``.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypeSystemIdentifierState
        return True
    if char is not EOF:
        if char in (" ' ", ' " '):
            # A quote straight after the keyword is reported but tolerated.
            self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                    " data ": " unexpected-char-in-doctype "})
        self.stream.unget(char)
        self.state = self.beforeDoctypeSystemIdentifierState
        return True

    self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                            " data ": " eof-in-doctype "})
    self.currentToken[" correct "] = False
    self.tokenQueue.append(self.currentToken)
    self.state = self.dataState
    return True
def beforeDoctypeSystemIdentifierState(self):
    """Find the opening quote of the doctype system identifier.

    Reads one character from the stream. Space characters are ignored. A
    double- or single-quote literal resets ``systemId`` and moves to the
    matching quoted system-identifier state. The right-bracket literal
    and EOF each queue their own ParseError, mark the token as not
    correct, emit it and return to ``dataState``. Any other character
    queues a ParseError, marks the token as not correct and moves to
    ``bogusDoctypeState``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def emit_incorrect(code):
        parse_error(code)
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState

    char = self.stream.char()
    if char in spaceCharacters:
        pass
    elif char == " \" ":
        self.currentToken[" systemId "] = " "
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif char == " ' ":
        self.currentToken[" systemId "] = " "
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    elif char == " > ":
        emit_incorrect(" unexpected-char-in-doctype ")
    elif char is EOF:
        emit_incorrect(" eof-in-doctype ")
    else:
        parse_error(" unexpected-char-in-doctype ")
        self.currentToken[" correct "] = False
        self.state = self.bogusDoctypeState
    return True
def doctypeSystemIdentifierDoubleQuotedState(self):
    """Accumulate a double-quoted doctype system identifier.

    Reads one character from the stream. The closing quote literal moves
    to ``afterDoctypeSystemIdentifierState``. NUL queues a ParseError and
    appends the replacement-character literal to ``systemId``. The
    right-bracket literal and EOF each queue their own ParseError, mark
    the token as not correct, emit it and return to ``dataState``. Any
    other character is appended to ``systemId``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def emit_incorrect(code):
        parse_error(code)
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState

    char = self.stream.char()
    if char == " \" ":
        self.state = self.afterDoctypeSystemIdentifierState
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" systemId "] += " \uFFFD "
    elif char == " > ":
        emit_incorrect(" unexpected-end-of-doctype ")
    elif char is EOF:
        emit_incorrect(" eof-in-doctype ")
    else:
        self.currentToken[" systemId "] += char
    return True
def doctypeSystemIdentifierSingleQuotedState(self):
    """Accumulate a single-quoted doctype system identifier.

    Reads one character from the stream. The closing quote literal moves
    to ``afterDoctypeSystemIdentifierState``. NUL queues a ParseError and
    appends the replacement-character literal to ``systemId``. The
    right-bracket literal and EOF each queue their own ParseError, mark
    the token as not correct, emit it and return to ``dataState``. Any
    other character is appended to ``systemId``.

    Always returns True.
    """
    def parse_error(code):
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "], " data ": code})

    def emit_incorrect(code):
        parse_error(code)
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState

    char = self.stream.char()
    if char == " ' ":
        self.state = self.afterDoctypeSystemIdentifierState
    elif char == " \u0000 ":
        parse_error(" invalid-codepoint ")
        self.currentToken[" systemId "] += " \uFFFD "
    elif char == " > ":
        emit_incorrect(" unexpected-end-of-doctype ")
    elif char is EOF:
        emit_incorrect(" eof-in-doctype ")
    else:
        self.currentToken[" systemId "] += char
    return True
def afterDoctypeSystemIdentifierState(self):
    """Handle the character after the closing quote of the system identifier.

    Reads one character from the stream. Space characters are ignored.
    The right-bracket literal emits the doctype token and returns to
    ``dataState``. EOF queues a ParseError, marks the token as not
    correct, emits it and returns to ``dataState``. Any other character
    queues a ParseError and moves to ``bogusDoctypeState``; the token's
    ``correct`` flag is left untouched in that case.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == " > ":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                " data ": " eof-in-doctype "})
        self.currentToken[" correct "] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # Trailing junk: reported, but the token is not flagged as incorrect.
    self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                            " data ": " unexpected-char-in-doctype "})
    self.state = self.bogusDoctypeState
    return True
def bogusDoctypeState(self):
    """Discard characters until a broken doctype ends.

    Reads one character from the stream. The right-bracket literal emits
    the current doctype token and returns to ``dataState``. EOF is pushed
    back before the token is emitted and the tokenizer returns to
    ``dataState``. Every other character is dropped and the tokenizer
    stays in this state.

    Always returns True.
    """
    char = self.stream.char()
    finished = char == " > "
    if not finished and char is EOF:
        # XXX EMIT (marker kept from the original source)
        self.stream.unget(char)
        finished = True
    if finished:
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    return True
def cdataSectionState(self):
    """Consume a CDATA section and queue its content as character data.

    Text is read in chunks: everything up to the next closing-bracket
    literal, then everything up to the next right-bracket literal, then
    one more character. Reading stops at EOF, or when the chunk just
    before the right bracket ends with the double-closing-bracket literal
    (those two trailing characters are dropped). Otherwise the consumed
    character is kept and reading continues.

    The chunks are joined, one ``invalid-codepoint`` ParseError is queued
    per NUL literal found and each is replaced with the
    replacement-character literal. A Characters token is queued only when
    the resulting text is non-empty. The tokenizer then returns to
    ``dataState``.

    Always returns True.
    """
    pieces = []
    while True:
        pieces.append(self.stream.charsUntil(" ] "))
        pieces.append(self.stream.charsUntil(" > "))
        char = self.stream.char()
        if char is EOF:
            # Identity test, matching how the other states detect EOF.
            break
        # charsUntil stopped at the right bracket, so that is what we read.
        assert char == " > "
        if pieces[-1][-2:] == " ]] ":
            pieces[-1] = pieces[-1][:-2]
            break
        pieces.append(char)
    data = " ".join(pieces)

    # Deal with null here rather than in the parser
    null_count = data.count(" \u0000 ")
    if null_count:
        for _ in range(null_count):
            self.tokenQueue.append({" type ": tokenTypes[" ParseError "],
                                    " data ": " invalid-codepoint "})
        data = data.replace(" \u0000 ", " \uFFFD ")
    if data:
        self.tokenQueue.append({" type ": tokenTypes[" Characters "],
                                " data ": data})
    self.state = self.dataState
    return True