from __future__ import absolute_import, division, unicode_literals from six import unichr as chr from collections import deque from .constants import spaceCharacters from .constants import entities from .constants import asciiLetters, asciiUpper2Lower from .constants import digits, hexDigits, EOF from .constants import tokenTypes, tagTokenTypes from .constants import replacementCharacters from ._inputstream import HTMLInputStream from ._trie import Trie entitiesTrie = Trie(entities) class HTMLTokenizer(object): """ This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. * self.state Holds a reference to the method to be invoked... XXX * self.stream Points to HTMLInputStream object. """ def __init__(self, stream, parser=None, **kwargs): self.stream = HTMLInputStream(stream, **kwargs) self.parser = parser # Setup the initial tokenizer state self.escapeFlag = False self.lastFourChars = [] self.state = self.dataState self.escape = False # The current token being created self.currentToken = None super(HTMLTokenizer, self).__init__() def __iter__(self): """ This is where the magic happens. We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token is requested. """ self.tokenQueue = deque([]) # Start processing. When EOF is reached self.state will return False # instead of True and the loop will terminate. while self.state(): while self.stream.errors: yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} while self.tokenQueue: yield self.tokenQueue.popleft() def consumeNumberEntity(self, isHex): """This function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. """ allowed = digits radix = 10 if isHex: allowed = hexDigits radix = 16 charStack = [] # Consume all the characters that are in range while making sure we # don't hit an EOF. c = self.stream.char() while c in allowed and c is not EOF: charStack.append(c) c = self.stream.char() # Convert the set of characters consumed to an int. charAsInt = int("".join(charStack), radix) # Certain characters get replaced with others if charAsInt in replacementCharacters: char = replacementCharacters[charAsInt] self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) elif ((0xD800 <= charAsInt <= 0xDFFF) or (charAsInt > 0x10FFFF)): char = "\uFFFD" self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) else: # Should speed up this check somehow (e.g. move the set to a constant) if ((0x0001 <= charAsInt <= 0x0008) or (0x000E <= charAsInt <= 0x001F) or (0x007F <= charAsInt <= 0x009F) or (0xFDD0 <= charAsInt <= 0xFDEF) or charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF])): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) try: # Try/except needed as UCS-2 Python builds' unichar only works # within the BMP. char = chr(charAsInt) except ValueError: v = charAsInt - 0x10000 char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF)) # Discard the ; if present. Otherwise, put it back on the queue and # invoke parseError on parser. if c != ";": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "numeric-entity-without-semicolon"}) self.stream.unget(c) return char def consumeEntity(self, allowedChar=None, fromAttribute=False): # Initialise to the default output for when no entity is matched output = "&" charStack = [self.stream.char()] if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or (allowedChar is not None and allowedChar == charStack[0])): self.stream.unget(charStack[0]) elif charStack[0] == "#": # Read the next character to see if it's hex or decimal hex = False charStack.append(self.stream.char()) if charStack[-1] in ("x", "X"): hex = True charStack.append(self.stream.char()) # charStack[-1] should be the first digit if (hex and charStack[-1] in hexDigits) \ or (not hex and charStack[-1] in digits): # At least one digit found, so consume the whole number self.stream.unget(charStack[-1]) output = self.consumeNumberEntity(hex) else: # No digits found self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-numeric-entity"}) self.stream.unget(charStack.pop()) output = "&" + "".join(charStack) else: # At this point in the process might have named entity. Entities # are stored in the global variable "entities". # # Consume characters and compare to these to a substring of the # entity names in the list until the substring no longer matches. while (charStack[-1] is not EOF): if not entitiesTrie.has_keys_with_prefix("".join(charStack)): break charStack.append(self.stream.char()) # At this point we have a string that starts with some characters # that may match an entity # Try to find the longest entity the string will match to take care # of ¬i for instance. try: entityName = entitiesTrie.longest_prefix("".join(charStack[:-1])) entityLength = len(entityName) except KeyError: entityName = None if entityName is not None: if entityName[-1] != ";": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "named-entity-without-semicolon"}) if (entityName[-1] != ";" and fromAttribute and (charStack[entityLength] in asciiLetters or charStack[entityLength] in digits or charStack[entityLength] == "=")): self.stream.unget(charStack.pop()) output = "&" + "".join(charStack) else: output = entities[entityName] self.stream.unget(charStack.pop()) output += "".join(charStack[entityLength:]) else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-named-entity"}) self.stream.unget(charStack.pop()) output = "&" + "".join(charStack) if fromAttribute: self.currentToken["data"][-1][1] += output else: if output in spaceCharacters: tokenType = "SpaceCharacters" else: tokenType = "Characters" self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) def processEntityInAttribute(self, allowedChar): """This method replaces the need for "entityInAttributeValueState". """ self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) def emitCurrentToken(self): """This method is a generic handler for emitting the tags. It also sets the state to "data" because that's what's needed after a token has been emitted. """ token = self.currentToken # Add token to the queue to be yielded if (token["type"] in tagTokenTypes): token["name"] = token["name"].translate(asciiUpper2Lower) if token["type"] == tokenTypes["EndTag"]: if token["data"]: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "attributes-in-end-tag"}) if token["selfClosing"]: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "self-closing-flag-on-end-tag"}) self.tokenQueue.append(token) self.state = self.dataState # Below are the various tokenizer states worked out. def dataState(self): data = self.stream.char() if data == "&": self.state = self.entityDataState elif data == "<": self.state = self.tagOpenState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\u0000"}) elif data is EOF: # Tokenization ends. return False elif data in spaceCharacters: # Directly after emitting a token you switch back to the "data # state". At that point spaceCharacters are important so they are # emitted separately. self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": data + self.stream.charsUntil(spaceCharacters, True)}) # No need to update lastFourChars here, since the first space will # have already been appended to lastFourChars and will have broken # any sequences else: chars = self.stream.charsUntil(("&", "<", "\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def entityDataState(self): self.consumeEntity() self.state = self.dataState return True def rcdataState(self): data = self.stream.char() if data == "&": self.state = self.characterReferenceInRcdata elif data == "<": self.state = self.rcdataLessThanSignState elif data == EOF: # Tokenization ends. return False elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) elif data in spaceCharacters: # Directly after emitting a token you switch back to the "data # state". At that point spaceCharacters are important so they are # emitted separately. self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": data + self.stream.charsUntil(spaceCharacters, True)}) # No need to update lastFourChars here, since the first space will # have already been appended to lastFourChars and will have broken # any sequences else: chars = self.stream.charsUntil(("&", "<", "\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def characterReferenceInRcdata(self): self.consumeEntity() self.state = self.rcdataState return True def rawtextState(self): data = self.stream.char() if data == "<": self.state = self.rawtextLessThanSignState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) elif data == EOF: # Tokenization ends. return False else: chars = self.stream.charsUntil(("<", "\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def scriptDataState(self): data = self.stream.char() if data == "<": self.state = self.scriptDataLessThanSignState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) elif data == EOF: # Tokenization ends. return False else: chars = self.stream.charsUntil(("<", "\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def plaintextState(self): data = self.stream.char() if data == EOF: # Tokenization ends. return False elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + self.stream.charsUntil("\u0000")}) return True def tagOpenState(self): data = self.stream.char() if data == "!": self.state = self.markupDeclarationOpenState elif data == "/": self.state = self.closeTagOpenState elif data in asciiLetters: self.currentToken = {"type": tokenTypes["StartTag"], "name": data, "data": [], "selfClosing": False, "selfClosingAcknowledged": False} self.state = self.tagNameState elif data == ">": # XXX In theory it could be something besides a tag name. But # do we really care? self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-right-bracket"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"}) self.state = self.dataState elif data == "?": # XXX In theory it could be something besides a tag name. But # do we really care? self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-question-mark"}) self.stream.unget(data) self.state = self.bogusCommentState else: # XXX self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.stream.unget(data) self.state = self.dataState return True def closeTagOpenState(self): data = self.stream.char() if data in asciiLetters: self.currentToken = {"type": tokenTypes["EndTag"], "name": data, "data": [], "selfClosing": False} self.state = self.tagNameState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-right-bracket"}) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-eof"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "": self.emitCurrentToken() elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-tag-name"}) self.state = self.dataState elif data == "/": self.state = self.selfClosingStartTagState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["name"] += "\uFFFD" else: self.currentToken["name"] += data # (Don't use charsUntil here, because tag names are # very short and it's faster to not do anything fancy) return True def rcdataLessThanSignState(self): data = self.stream.char() if data == "/": self.temporaryBuffer = "" self.state = self.rcdataEndTagOpenState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.stream.unget(data) self.state = self.rcdataState return True def rcdataEndTagOpenState(self): data = self.stream.char() if data in asciiLetters: self.temporaryBuffer += data self.state = self.rcdataEndTagNameState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], "name": self.temporaryBuffer, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], "name": self.temporaryBuffer, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], "name": self.temporaryBuffer, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) self.state = self.scriptDataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) self.state = self.scriptDataEscapedState elif data == EOF: self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.state = self.scriptDataEscapedState return True def scriptDataEscapedLessThanSignState(self): data = self.stream.char() if data == "/": self.temporaryBuffer = "" self.state = self.scriptDataEscapedEndTagOpenState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) self.temporaryBuffer = data self.state = self.scriptDataDoubleEscapeStartState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.stream.unget(data) self.state = self.scriptDataEscapedState return True def scriptDataEscapedEndTagOpenState(self): data = self.stream.char() if data in asciiLetters: self.temporaryBuffer = data self.state = self.scriptDataEscapedEndTagNameState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], "name": self.temporaryBuffer, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ""))): self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) if self.temporaryBuffer.lower() == "script": self.state = self.scriptDataDoubleEscapedState else: self.state = self.scriptDataEscapedState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.temporaryBuffer += data else: self.stream.unget(data) self.state = self.scriptDataEscapedState return True def scriptDataDoubleEscapedState(self): data = self.stream.char() if data == "-": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) self.state = self.scriptDataDoubleEscapedDashState elif data == "<": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.state = self.scriptDataDoubleEscapedLessThanSignState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) elif data == EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-script-in-script"}) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) return True def scriptDataDoubleEscapedDashState(self): data = self.stream.char() if data == "-": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) self.state = self.scriptDataDoubleEscapedDashDashState elif data == "<": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.state = self.scriptDataDoubleEscapedLessThanSignState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) self.state = self.scriptDataDoubleEscapedState elif data == EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-script-in-script"}) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.state = self.scriptDataDoubleEscapedState return True def scriptDataDoubleEscapedDashDashState(self): data = self.stream.char() if data == "-": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) elif data == "<": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.state = self.scriptDataDoubleEscapedLessThanSignState elif data == ">": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) self.state = self.scriptDataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) self.state = self.scriptDataDoubleEscapedState elif data == EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-script-in-script"}) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.state = self.scriptDataDoubleEscapedState return True def scriptDataDoubleEscapedLessThanSignState(self): data = self.stream.char() if data == "/": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) self.temporaryBuffer = "" self.state = self.scriptDataDoubleEscapeEndState else: self.stream.unget(data) self.state = self.scriptDataDoubleEscapedState return True def scriptDataDoubleEscapeEndState(self): data = self.stream.char() if data in (spaceCharacters | frozenset(("/", ">"))): self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) if self.temporaryBuffer.lower() == "script": self.state = self.scriptDataEscapedState else: self.state = self.scriptDataDoubleEscapedState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.temporaryBuffer += data else: self.stream.unget(data) self.state = self.scriptDataDoubleEscapedState return True def beforeAttributeNameState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data in asciiLetters: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data == ">": self.emitCurrentToken() elif data == "/": self.state = self.selfClosingStartTagState elif data in ("'", '"', "=", "<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-in-attribute-name"}) self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"].append(["\uFFFD", ""]) self.state = self.attributeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-name-but-got-eof"}) self.state = self.dataState else: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState return True def attributeNameState(self): data = self.stream.char() leavingThisState = True emitToken = False if data == "=": self.state = self.beforeAttributeValueState elif data in asciiLetters: self.currentToken["data"][-1][0] += data +\ self.stream.charsUntil(asciiLetters, True) leavingThisState = False elif data == ">": # XXX If we emit here the attributes are converted to a dict # without being checked and when the code below runs we error # because data is a dict not a list emitToken = True elif data in spaceCharacters: self.state = self.afterAttributeNameState elif data == "/": self.state = self.selfClosingStartTagState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][0] += "\uFFFD" leavingThisState = False elif data in ("'", '"', "<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-in-attribute-name"}) self.currentToken["data"][-1][0] += data leavingThisState = False elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-name"}) self.state = self.dataState else: self.currentToken["data"][-1][0] += data leavingThisState = False if leavingThisState: # Attributes are not dropped at this stage. That happens when the # start tag token is emitted so values can still be safely appended # to attributes, but we do want to report the parse error in time. self.currentToken["data"][-1][0] = ( self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) for name, _ in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "duplicate-attribute"}) break # XXX Fix for above XXX if emitToken: self.emitCurrentToken() return True def afterAttributeNameState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data == "=": self.state = self.beforeAttributeValueState elif data == ">": self.emitCurrentToken() elif data in asciiLetters: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data == "/": self.state = self.selfClosingStartTagState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"].append(["\uFFFD", ""]) self.state = self.attributeNameState elif data in ("'", '"', "<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-after-attribute-name"}) self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-end-of-tag-but-got-eof"}) self.state = self.dataState else: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState return True def beforeAttributeValueState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data == "\"": self.state = self.attributeValueDoubleQuotedState elif data == "&": self.state = self.attributeValueUnQuotedState self.stream.unget(data) elif data == "'": self.state = self.attributeValueSingleQuotedState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-value-but-got-right-bracket"}) self.emitCurrentToken() elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][1] += "\uFFFD" self.state = self.attributeValueUnQuotedState elif data in ("=", "<", "`"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "equals-in-unquoted-attribute-value"}) self.currentToken["data"][-1][1] += data self.state = self.attributeValueUnQuotedState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-value-but-got-eof"}) self.state = self.dataState else: self.currentToken["data"][-1][1] += data self.state = self.attributeValueUnQuotedState return True def attributeValueDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.afterAttributeValueState elif data == "&": self.processEntityInAttribute('"') elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][1] += "\uFFFD" elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-double-quote"}) self.state = self.dataState else: self.currentToken["data"][-1][1] += data +\ self.stream.charsUntil(("\"", "&", "\u0000")) return True def attributeValueSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.afterAttributeValueState elif data == "&": self.processEntityInAttribute("'") elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][1] += "\uFFFD" elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-single-quote"}) self.state = self.dataState else: self.currentToken["data"][-1][1] += data +\ self.stream.charsUntil(("'", "&", "\u0000")) return True def attributeValueUnQuotedState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeAttributeNameState elif data == "&": self.processEntityInAttribute(">") elif data == ">": self.emitCurrentToken() elif data in ('"', "'", "=", "<", "`"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-in-unquoted-attribute-value"}) self.currentToken["data"][-1][1] += data elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][1] += "\uFFFD" elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-no-quotes"}) self.state = self.dataState else: self.currentToken["data"][-1][1] += data + self.stream.charsUntil( frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) return True def afterAttributeValueState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeAttributeNameState elif data == ">": self.emitCurrentToken() elif data == "/": self.state = self.selfClosingStartTagState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-EOF-after-attribute-value"}) self.stream.unget(data) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-after-attribute-value"}) self.stream.unget(data) self.state = self.beforeAttributeNameState return True def selfClosingStartTagState(self): data = self.stream.char() if data == ">": self.currentToken["selfClosing"] = True self.emitCurrentToken() elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-EOF-after-solidus-in-tag"}) self.stream.unget(data) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-after-solidus-in-tag"}) self.stream.unget(data) self.state = self.beforeAttributeNameState return True def bogusCommentState(self): # Make a new comment token and give it as value all the characters # until the first > or EOF (charsUntil checks for EOF automatically) # and emit it. data = self.stream.charsUntil(">") data = data.replace("\u0000", "\uFFFD") self.tokenQueue.append( {"type": tokenTypes["Comment"], "data": data}) # Eat the character directly after the bogus comment which is either a # ">" or an EOF. self.stream.char() self.state = self.dataState return True def markupDeclarationOpenState(self): charStack = [self.stream.char()] if charStack[-1] == "-": charStack.append(self.stream.char()) if charStack[-1] == "-": self.currentToken = {"type": tokenTypes["Comment"], "data": ""} self.state = self.commentStartState return True elif charStack[-1] in ('d', 'D'): matched = True for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'), ('y', 'Y'), ('p', 'P'), ('e', 'E')): charStack.append(self.stream.char()) if charStack[-1] not in expected: matched = False break if matched: self.currentToken = {"type": tokenTypes["Doctype"], "name": "", "publicId": None, "systemId": None, "correct": True} self.state = self.doctypeState return True elif (charStack[-1] == "[" and self.parser is not None and self.parser.tree.openElements and self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace): matched = True for expected in ["C", "D", "A", "T", "A", "["]: charStack.append(self.stream.char()) if charStack[-1] != expected: matched = False break if matched: self.state = self.cdataSectionState return True self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-dashes-or-doctype"}) while charStack: self.stream.unget(charStack.pop()) self.state = self.bogusCommentState return True def commentStartState(self): data = self.stream.char() if data == "-": self.state = self.commentStartDashState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrect-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += data self.state = self.commentState return True def commentStartDashState(self): data = self.stream.char() if data == "-": self.state = self.commentEndState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "-\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrect-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += "-" + data self.state = self.commentState return True def commentState(self): data = self.stream.char() if data == "-": self.state = self.commentEndDashState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "\uFFFD" elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += data + \ self.stream.charsUntil(("-", "\u0000")) return True def commentEndDashState(self): data = self.stream.char() if data == "-": self.state = self.commentEndState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "-\uFFFD" self.state = self.commentState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-end-dash"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += "-" + data self.state = self.commentState return True def commentEndState(self): data = self.stream.char() if data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "--\uFFFD" self.state = self.commentState elif data == "!": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-bang-after-double-dash-in-comment"}) self.state = self.commentEndBangState elif data == "-": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-dash-after-double-dash-in-comment"}) self.currentToken["data"] += data elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-double-dash"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: # XXX self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-comment"}) self.currentToken["data"] += "--" + data self.state = self.commentState return True def commentEndBangState(self): data = self.stream.char() if data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "-": self.currentToken["data"] += "--!" self.state = self.commentEndDashState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "--!\uFFFD" self.state = self.commentState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-end-bang-state"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += "--!" + data self.state = self.commentState return True def doctypeState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeDoctypeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-doctype-name-but-got-eof"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "need-space-after-doctype"}) self.stream.unget(data) self.state = self.beforeDoctypeNameState return True def beforeDoctypeNameState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-doctype-name-but-got-right-bracket"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["name"] = "\uFFFD" self.state = self.doctypeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-doctype-name-but-got-eof"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["name"] = data self.state = self.doctypeNameState return True def doctypeNameState(self): data = self.stream.char() if data in spaceCharacters: self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.state = self.afterDoctypeNameState elif data == ">": self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["name"] += "\uFFFD" self.state = self.doctypeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype-name"}) self.currentToken["correct"] = False self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["name"] += data return True def afterDoctypeNameState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.currentToken["correct"] = False self.stream.unget(data) self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: if data in ("p", "P"): matched = True for expected in (("u", "U"), ("b", "B"), ("l", "L"), ("i", "I"), ("c", "C")): data = self.stream.char() if data not in expected: matched = False break if matched: self.state = self.afterDoctypePublicKeywordState return True elif data in ("s", "S"): matched = True for expected in (("y", "Y"), ("s", "S"), ("t", "T"), ("e", "E"), ("m", "M")): data = self.stream.char() if data not in expected: matched = False break if matched: self.state = self.afterDoctypeSystemKeywordState return True # All the characters read before the current 'data' will be # [a-zA-Z], so they're garbage in the bogus doctype and can be # discarded; only the latest character might be '>' or EOF # and needs to be ungetted self.stream.unget(data) self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-space-or-right-bracket-in-doctype", "datavars": {"data": data}}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def afterDoctypePublicKeywordState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeDoctypePublicIdentifierState elif data in ("'", '"'): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.stream.unget(data) self.state = self.beforeDoctypePublicIdentifierState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.stream.unget(data) self.state = self.beforeDoctypePublicIdentifierState return True def beforeDoctypePublicIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["publicId"] = "" self.state = self.doctypePublicIdentifierDoubleQuotedState elif data == "'": self.currentToken["publicId"] = "" self.state = self.doctypePublicIdentifierSingleQuotedState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def doctypePublicIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.afterDoctypePublicIdentifierState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["publicId"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["publicId"] += data return True def doctypePublicIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.afterDoctypePublicIdentifierState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["publicId"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["publicId"] += data return True def afterDoctypePublicIdentifierState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.betweenDoctypePublicAndSystemIdentifiersState elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == '"': self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def betweenDoctypePublicAndSystemIdentifiersState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == '"': self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def afterDoctypeSystemKeywordState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeDoctypeSystemIdentifierState elif data in ("'", '"'): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.stream.unget(data) self.state = self.beforeDoctypeSystemIdentifierState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.stream.unget(data) self.state = self.beforeDoctypeSystemIdentifierState return True def beforeDoctypeSystemIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def doctypeSystemIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.afterDoctypeSystemIdentifierState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["systemId"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["systemId"] += data return True def doctypeSystemIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.afterDoctypeSystemIdentifierState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["systemId"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["systemId"] += data return True def afterDoctypeSystemIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.state = self.bogusDoctypeState return True def bogusDoctypeState(self): data = self.stream.char() if data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: # XXX EMIT self.stream.unget(data) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: pass return True def cdataSectionState(self): data = [] while True: data.append(self.stream.charsUntil("]")) data.append(self.stream.charsUntil(">")) char = self.stream.char() if char == EOF: break else: assert char == ">" if data[-1][-2:] == "]]": data[-1] = data[-1][:-2] break else: data.append(char) data = "".join(data) # pylint:disable=redefined-variable-type # Deal with null here rather than in the parser nullCount = data.count("\u0000") if nullCount > 0: for _ in range(nullCount): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) data = data.replace("\u0000", "\uFFFD") if data: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.state = self.dataState return True