from __future__ import absolute_import, division, unicode_literals from six import unichr as chr from collections import deque, OrderedDict from sys import version_info from .constants import spaceCharacters from .constants import entities from .constants import asciiLetters, asciiUpper2Lower from .constants import digits, hexDigits, EOF from .constants import tokenTypes, tagTokenTypes from .constants import replacementCharacters from ._inputstream import HTMLInputStream from ._trie import Trie entitiesTrie = Trie(entities) if version_info >= (3, 7): attributeMap = dict else: attributeMap = OrderedDict class HTMLTokenizer(object): """ This class takes care of tokenizing HTML. * self.currentToken Holds the token that is currently being processed. * self.state Holds a reference to the method to be invoked... XXX * self.stream Points to HTMLInputStream object. """ def __init__(self, stream, parser=None, **kwargs): self.stream = HTMLInputStream(stream, **kwargs) self.parser = parser # Setup the initial tokenizer state self.escapeFlag = False self.lastFourChars = [] self.state = self.dataState self.escape = False # The current token being created self.currentToken = None super(HTMLTokenizer, self).__init__() def __iter__(self): """ This is where the magic happens. We do our usually processing through the states and when we have a token to return we yield the token which pauses processing until the next token is requested. """ self.tokenQueue = deque([]) # Start processing. When EOF is reached self.state will return False # instead of True and the loop will terminate. while self.state(): while self.stream.errors: yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)} while self.tokenQueue: yield self.tokenQueue.popleft() def consumeNumberEntity(self, isHex): """This function returns either U+FFFD or the character based on the decimal or hexadecimal representation. It also discards ";" if present. If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked. """ allowed = digits radix = 10 if isHex: allowed = hexDigits radix = 16 charStack = [] # Consume all the characters that are in range while making sure we # don't hit an EOF. c = self.stream.char() while c in allowed and c is not EOF: charStack.append(c) c = self.stream.char() # Convert the set of characters consumed to an int. charAsInt = int("".join(charStack), radix) # Certain characters get replaced with others if charAsInt in replacementCharacters: char = replacementCharacters[charAsInt] self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) elif ((0xD800 <= charAsInt <= 0xDFFF) or (charAsInt > 0x10FFFF)): char = "\uFFFD" self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) else: # Should speed up this check somehow (e.g. move the set to a constant) if ((0x0001 <= charAsInt <= 0x0008) or (0x000E <= charAsInt <= 0x001F) or (0x007F <= charAsInt <= 0x009F) or (0xFDD0 <= charAsInt <= 0xFDEF) or charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF])): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "illegal-codepoint-for-numeric-entity", "datavars": {"charAsInt": charAsInt}}) try: # Try/except needed as UCS-2 Python builds' unichar only works # within the BMP. char = chr(charAsInt) except ValueError: v = charAsInt - 0x10000 char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF)) # Discard the ; if present. Otherwise, put it back on the queue and # invoke parseError on parser. if c != ";": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "numeric-entity-without-semicolon"}) self.stream.unget(c) return char def consumeEntity(self, allowedChar=None, fromAttribute=False): # Initialise to the default output for when no entity is matched output = "&" charStack = [self.stream.char()] if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or (allowedChar is not None and allowedChar == charStack[0])): self.stream.unget(charStack[0]) elif charStack[0] == "#": # Read the next character to see if it's hex or decimal hex = False charStack.append(self.stream.char()) if charStack[-1] in ("x", "X"): hex = True charStack.append(self.stream.char()) # charStack[-1] should be the first digit if (hex and charStack[-1] in hexDigits) \ or (not hex and charStack[-1] in digits): # At least one digit found, so consume the whole number self.stream.unget(charStack[-1]) output = self.consumeNumberEntity(hex) else: # No digits found self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-numeric-entity"}) self.stream.unget(charStack.pop()) output = "&" + "".join(charStack) else: # At this point in the process might have named entity. Entities # are stored in the global variable "entities". # # Consume characters and compare to these to a substring of the # entity names in the list until the substring no longer matches. while (charStack[-1] is not EOF): if not entitiesTrie.has_keys_with_prefix("".join(charStack)): break charStack.append(self.stream.char()) # At this point we have a string that starts with some characters # that may match an entity # Try to find the longest entity the string will match to take care # of ¬i for instance. try: entityName = entitiesTrie.longest_prefix("".join(charStack[:-1])) entityLength = len(entityName) except KeyError: entityName = None if entityName is not None: if entityName[-1] != ";": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "named-entity-without-semicolon"}) if (entityName[-1] != ";" and fromAttribute and (charStack[entityLength] in asciiLetters or charStack[entityLength] in digits or charStack[entityLength] == "=")): self.stream.unget(charStack.pop()) output = "&" + "".join(charStack) else: output = entities[entityName] self.stream.unget(charStack.pop()) output += "".join(charStack[entityLength:]) else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-named-entity"}) self.stream.unget(charStack.pop()) output = "&" + "".join(charStack) if fromAttribute: self.currentToken["data"][-1][1] += output else: if output in spaceCharacters: tokenType = "SpaceCharacters" else: tokenType = "Characters" self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output}) def processEntityInAttribute(self, allowedChar): """This method replaces the need for "entityInAttributeValueState". """ self.consumeEntity(allowedChar=allowedChar, fromAttribute=True) def emitCurrentToken(self): """This method is a generic handler for emitting the tags. It also sets the state to "data" because that's what's needed after a token has been emitted. """ token = self.currentToken # Add token to the queue to be yielded if (token["type"] in tagTokenTypes): token["name"] = token["name"].translate(asciiUpper2Lower) if token["type"] == tokenTypes["StartTag"]: raw = token["data"] data = attributeMap(raw) if len(raw) > len(data): # we had some duplicated attribute, fix so first wins data.update(raw[::-1]) token["data"] = data if token["type"] == tokenTypes["EndTag"]: if token["data"]: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "attributes-in-end-tag"}) if token["selfClosing"]: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "self-closing-flag-on-end-tag"}) self.tokenQueue.append(token) self.state = self.dataState # Below are the various tokenizer states worked out. def dataState(self): data = self.stream.char() if data == "&": self.state = self.entityDataState elif data == "<": self.state = self.tagOpenState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\u0000"}) elif data is EOF: # Tokenization ends. return False elif data in spaceCharacters: # Directly after emitting a token you switch back to the "data # state". At that point spaceCharacters are important so they are # emitted separately. self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": data + self.stream.charsUntil(spaceCharacters, True)}) # No need to update lastFourChars here, since the first space will # have already been appended to lastFourChars and will have broken # any sequences else: chars = self.stream.charsUntil(("&", "<", "\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def entityDataState(self): self.consumeEntity() self.state = self.dataState return True def rcdataState(self): data = self.stream.char() if data == "&": self.state = self.characterReferenceInRcdata elif data == "<": self.state = self.rcdataLessThanSignState elif data == EOF: # Tokenization ends. return False elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) elif data in spaceCharacters: # Directly after emitting a token you switch back to the "data # state". At that point spaceCharacters are important so they are # emitted separately. self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data": data + self.stream.charsUntil(spaceCharacters, True)}) # No need to update lastFourChars here, since the first space will # have already been appended to lastFourChars and will have broken # any sequences else: chars = self.stream.charsUntil(("&", "<", "\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def characterReferenceInRcdata(self): self.consumeEntity() self.state = self.rcdataState return True def rawtextState(self): data = self.stream.char() if data == "<": self.state = self.rawtextLessThanSignState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) elif data == EOF: # Tokenization ends. return False else: chars = self.stream.charsUntil(("<", "\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def scriptDataState(self): data = self.stream.char() if data == "<": self.state = self.scriptDataLessThanSignState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) elif data == EOF: # Tokenization ends. return False else: chars = self.stream.charsUntil(("<", "\u0000")) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + chars}) return True def plaintextState(self): data = self.stream.char() if data == EOF: # Tokenization ends. return False elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data + self.stream.charsUntil("\u0000")}) return True def tagOpenState(self): data = self.stream.char() if data == "!": self.state = self.markupDeclarationOpenState elif data == "/": self.state = self.closeTagOpenState elif data in asciiLetters: self.currentToken = {"type": tokenTypes["StartTag"], "name": data, "data": [], "selfClosing": False, "selfClosingAcknowledged": False} self.state = self.tagNameState elif data == ">": # XXX In theory it could be something besides a tag name. But # do we really care? self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-right-bracket"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"}) self.state = self.dataState elif data == "?": # XXX In theory it could be something besides a tag name. But # do we really care? self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-question-mark"}) self.stream.unget(data) self.state = self.bogusCommentState else: # XXX self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.stream.unget(data) self.state = self.dataState return True def closeTagOpenState(self): data = self.stream.char() if data in asciiLetters: self.currentToken = {"type": tokenTypes["EndTag"], "name": data, "data": [], "selfClosing": False} self.state = self.tagNameState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-right-bracket"}) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-closing-tag-but-got-eof"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "": self.emitCurrentToken() elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-tag-name"}) self.state = self.dataState elif data == "/": self.state = self.selfClosingStartTagState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["name"] += "\uFFFD" else: self.currentToken["name"] += data # (Don't use charsUntil here, because tag names are # very short and it's faster to not do anything fancy) return True def rcdataLessThanSignState(self): data = self.stream.char() if data == "/": self.temporaryBuffer = "" self.state = self.rcdataEndTagOpenState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.stream.unget(data) self.state = self.rcdataState return True def rcdataEndTagOpenState(self): data = self.stream.char() if data in asciiLetters: self.temporaryBuffer += data self.state = self.rcdataEndTagNameState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], "name": self.temporaryBuffer, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], "name": self.temporaryBuffer, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], "name": self.temporaryBuffer, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) self.state = self.scriptDataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) self.state = self.scriptDataEscapedState elif data == EOF: self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.state = self.scriptDataEscapedState return True def scriptDataEscapedLessThanSignState(self): data = self.stream.char() if data == "/": self.temporaryBuffer = "" self.state = self.scriptDataEscapedEndTagOpenState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data}) self.temporaryBuffer = data self.state = self.scriptDataDoubleEscapeStartState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.stream.unget(data) self.state = self.scriptDataEscapedState return True def scriptDataEscapedEndTagOpenState(self): data = self.stream.char() if data in asciiLetters: self.temporaryBuffer = data self.state = self.scriptDataEscapedEndTagNameState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "" and appropriate: self.currentToken = {"type": tokenTypes["EndTag"], "name": self.temporaryBuffer, "data": [], "selfClosing": False} self.emitCurrentToken() self.state = self.dataState elif data in asciiLetters: self.temporaryBuffer += data else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ""))): self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) if self.temporaryBuffer.lower() == "script": self.state = self.scriptDataDoubleEscapedState else: self.state = self.scriptDataEscapedState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.temporaryBuffer += data else: self.stream.unget(data) self.state = self.scriptDataEscapedState return True def scriptDataDoubleEscapedState(self): data = self.stream.char() if data == "-": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) self.state = self.scriptDataDoubleEscapedDashState elif data == "<": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.state = self.scriptDataDoubleEscapedLessThanSignState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) elif data == EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-script-in-script"}) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) return True def scriptDataDoubleEscapedDashState(self): data = self.stream.char() if data == "-": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) self.state = self.scriptDataDoubleEscapedDashDashState elif data == "<": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.state = self.scriptDataDoubleEscapedLessThanSignState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) self.state = self.scriptDataDoubleEscapedState elif data == EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-script-in-script"}) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.state = self.scriptDataDoubleEscapedState return True def scriptDataDoubleEscapedDashDashState(self): data = self.stream.char() if data == "-": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"}) elif data == "<": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"}) self.state = self.scriptDataDoubleEscapedLessThanSignState elif data == ">": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"}) self.state = self.scriptDataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "\uFFFD"}) self.state = self.scriptDataDoubleEscapedState elif data == EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-script-in-script"}) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.state = self.scriptDataDoubleEscapedState return True def scriptDataDoubleEscapedLessThanSignState(self): data = self.stream.char() if data == "/": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"}) self.temporaryBuffer = "" self.state = self.scriptDataDoubleEscapeEndState else: self.stream.unget(data) self.state = self.scriptDataDoubleEscapedState return True def scriptDataDoubleEscapeEndState(self): data = self.stream.char() if data in (spaceCharacters | frozenset(("/", ">"))): self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) if self.temporaryBuffer.lower() == "script": self.state = self.scriptDataEscapedState else: self.state = self.scriptDataDoubleEscapedState elif data in asciiLetters: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.temporaryBuffer += data else: self.stream.unget(data) self.state = self.scriptDataDoubleEscapedState return True def beforeAttributeNameState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data in asciiLetters: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data == ">": self.emitCurrentToken() elif data == "/": self.state = self.selfClosingStartTagState elif data in ("'", '"', "=", "<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-in-attribute-name"}) self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"].append(["\uFFFD", ""]) self.state = self.attributeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-name-but-got-eof"}) self.state = self.dataState else: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState return True def attributeNameState(self): data = self.stream.char() leavingThisState = True emitToken = False if data == "=": self.state = self.beforeAttributeValueState elif data in asciiLetters: self.currentToken["data"][-1][0] += data +\ self.stream.charsUntil(asciiLetters, True) leavingThisState = False elif data == ">": # XXX If we emit here the attributes are converted to a dict # without being checked and when the code below runs we error # because data is a dict not a list emitToken = True elif data in spaceCharacters: self.state = self.afterAttributeNameState elif data == "/": self.state = self.selfClosingStartTagState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][0] += "\uFFFD" leavingThisState = False elif data in ("'", '"', "<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-in-attribute-name"}) self.currentToken["data"][-1][0] += data leavingThisState = False elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-name"}) self.state = self.dataState else: self.currentToken["data"][-1][0] += data leavingThisState = False if leavingThisState: # Attributes are not dropped at this stage. That happens when the # start tag token is emitted so values can still be safely appended # to attributes, but we do want to report the parse error in time. self.currentToken["data"][-1][0] = ( self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) for name, _ in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "duplicate-attribute"}) break # XXX Fix for above XXX if emitToken: self.emitCurrentToken() return True def afterAttributeNameState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data == "=": self.state = self.beforeAttributeValueState elif data == ">": self.emitCurrentToken() elif data in asciiLetters: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data == "/": self.state = self.selfClosingStartTagState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"].append(["\uFFFD", ""]) self.state = self.attributeNameState elif data in ("'", '"', "<"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-character-after-attribute-name"}) self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-end-of-tag-but-got-eof"}) self.state = self.dataState else: self.currentToken["data"].append([data, ""]) self.state = self.attributeNameState return True def beforeAttributeValueState(self): data = self.stream.char() if data in spaceCharacters: self.stream.charsUntil(spaceCharacters, True) elif data == "\"": self.state = self.attributeValueDoubleQuotedState elif data == "&": self.state = self.attributeValueUnQuotedState self.stream.unget(data) elif data == "'": self.state = self.attributeValueSingleQuotedState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-value-but-got-right-bracket"}) self.emitCurrentToken() elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][1] += "\uFFFD" self.state = self.attributeValueUnQuotedState elif data in ("=", "<", "`"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "equals-in-unquoted-attribute-value"}) self.currentToken["data"][-1][1] += data self.state = self.attributeValueUnQuotedState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-attribute-value-but-got-eof"}) self.state = self.dataState else: self.currentToken["data"][-1][1] += data self.state = self.attributeValueUnQuotedState return True def attributeValueDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.afterAttributeValueState elif data == "&": self.processEntityInAttribute('"') elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][1] += "\uFFFD" elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-double-quote"}) self.state = self.dataState else: self.currentToken["data"][-1][1] += data +\ self.stream.charsUntil(("\"", "&", "\u0000")) return True def attributeValueSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.afterAttributeValueState elif data == "&": self.processEntityInAttribute("'") elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][1] += "\uFFFD" elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-single-quote"}) self.state = self.dataState else: self.currentToken["data"][-1][1] += data +\ self.stream.charsUntil(("'", "&", "\u0000")) return True def attributeValueUnQuotedState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeAttributeNameState elif data == "&": self.processEntityInAttribute(">") elif data == ">": self.emitCurrentToken() elif data in ('"', "'", "=", "<", "`"): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-in-unquoted-attribute-value"}) self.currentToken["data"][-1][1] += data elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"][-1][1] += "\uFFFD" elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-attribute-value-no-quotes"}) self.state = self.dataState else: self.currentToken["data"][-1][1] += data + self.stream.charsUntil( frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters) return True def afterAttributeValueState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeAttributeNameState elif data == ">": self.emitCurrentToken() elif data == "/": self.state = self.selfClosingStartTagState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-EOF-after-attribute-value"}) self.stream.unget(data) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-after-attribute-value"}) self.stream.unget(data) self.state = self.beforeAttributeNameState return True def selfClosingStartTagState(self): data = self.stream.char() if data == ">": self.currentToken["selfClosing"] = True self.emitCurrentToken() elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-EOF-after-solidus-in-tag"}) self.stream.unget(data) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-character-after-solidus-in-tag"}) self.stream.unget(data) self.state = self.beforeAttributeNameState return True def bogusCommentState(self): # Make a new comment token and give it as value all the characters # until the first > or EOF (charsUntil checks for EOF automatically) # and emit it. data = self.stream.charsUntil(">") data = data.replace("\u0000", "\uFFFD") self.tokenQueue.append( {"type": tokenTypes["Comment"], "data": data}) # Eat the character directly after the bogus comment which is either a # ">" or an EOF. self.stream.char() self.state = self.dataState return True def markupDeclarationOpenState(self): charStack = [self.stream.char()] if charStack[-1] == "-": charStack.append(self.stream.char()) if charStack[-1] == "-": self.currentToken = {"type": tokenTypes["Comment"], "data": ""} self.state = self.commentStartState return True elif charStack[-1] in ('d', 'D'): matched = True for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'), ('y', 'Y'), ('p', 'P'), ('e', 'E')): charStack.append(self.stream.char()) if charStack[-1] not in expected: matched = False break if matched: self.currentToken = {"type": tokenTypes["Doctype"], "name": "", "publicId": None, "systemId": None, "correct": True} self.state = self.doctypeState return True elif (charStack[-1] == "[" and self.parser is not None and self.parser.tree.openElements and self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace): matched = True for expected in ["C", "D", "A", "T", "A", "["]: charStack.append(self.stream.char()) if charStack[-1] != expected: matched = False break if matched: self.state = self.cdataSectionState return True self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-dashes-or-doctype"}) while charStack: self.stream.unget(charStack.pop()) self.state = self.bogusCommentState return True def commentStartState(self): data = self.stream.char() if data == "-": self.state = self.commentStartDashState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrect-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += data self.state = self.commentState return True def commentStartDashState(self): data = self.stream.char() if data == "-": self.state = self.commentEndState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "-\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "incorrect-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += "-" + data self.state = self.commentState return True def commentState(self): data = self.stream.char() if data == "-": self.state = self.commentEndDashState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "\uFFFD" elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += data + \ self.stream.charsUntil(("-", "\u0000")) return True def commentEndDashState(self): data = self.stream.char() if data == "-": self.state = self.commentEndState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "-\uFFFD" self.state = self.commentState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-end-dash"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += "-" + data self.state = self.commentState return True def commentEndState(self): data = self.stream.char() if data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "--\uFFFD" self.state = self.commentState elif data == "!": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-bang-after-double-dash-in-comment"}) self.state = self.commentEndBangState elif data == "-": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-dash-after-double-dash-in-comment"}) self.currentToken["data"] += data elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-double-dash"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: # XXX self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-comment"}) self.currentToken["data"] += "--" + data self.state = self.commentState return True def commentEndBangState(self): data = self.stream.char() if data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "-": self.currentToken["data"] += "--!" self.state = self.commentEndDashState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["data"] += "--!\uFFFD" self.state = self.commentState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-comment-end-bang-state"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["data"] += "--!" + data self.state = self.commentState return True def doctypeState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeDoctypeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-doctype-name-but-got-eof"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "need-space-after-doctype"}) self.stream.unget(data) self.state = self.beforeDoctypeNameState return True def beforeDoctypeNameState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-doctype-name-but-got-right-bracket"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["name"] = "\uFFFD" self.state = self.doctypeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-doctype-name-but-got-eof"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["name"] = data self.state = self.doctypeNameState return True def doctypeNameState(self): data = self.stream.char() if data in spaceCharacters: self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.state = self.afterDoctypeNameState elif data == ">": self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["name"] += "\uFFFD" self.state = self.doctypeNameState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype-name"}) self.currentToken["correct"] = False self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["name"] += data return True def afterDoctypeNameState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.currentToken["correct"] = False self.stream.unget(data) self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: if data in ("p", "P"): matched = True for expected in (("u", "U"), ("b", "B"), ("l", "L"), ("i", "I"), ("c", "C")): data = self.stream.char() if data not in expected: matched = False break if matched: self.state = self.afterDoctypePublicKeywordState return True elif data in ("s", "S"): matched = True for expected in (("y", "Y"), ("s", "S"), ("t", "T"), ("e", "E"), ("m", "M")): data = self.stream.char() if data not in expected: matched = False break if matched: self.state = self.afterDoctypeSystemKeywordState return True # All the characters read before the current 'data' will be # [a-zA-Z], so they're garbage in the bogus doctype and can be # discarded; only the latest character might be '>' or EOF # and needs to be ungetted self.stream.unget(data) self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-space-or-right-bracket-in-doctype", "datavars": {"data": data}}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def afterDoctypePublicKeywordState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeDoctypePublicIdentifierState elif data in ("'", '"'): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.stream.unget(data) self.state = self.beforeDoctypePublicIdentifierState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.stream.unget(data) self.state = self.beforeDoctypePublicIdentifierState return True def beforeDoctypePublicIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["publicId"] = "" self.state = self.doctypePublicIdentifierDoubleQuotedState elif data == "'": self.currentToken["publicId"] = "" self.state = self.doctypePublicIdentifierSingleQuotedState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def doctypePublicIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.afterDoctypePublicIdentifierState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["publicId"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["publicId"] += data return True def doctypePublicIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.afterDoctypePublicIdentifierState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["publicId"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["publicId"] += data return True def afterDoctypePublicIdentifierState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.betweenDoctypePublicAndSystemIdentifiersState elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == '"': self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def betweenDoctypePublicAndSystemIdentifiersState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == '"': self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def afterDoctypeSystemKeywordState(self): data = self.stream.char() if data in spaceCharacters: self.state = self.beforeDoctypeSystemIdentifierState elif data in ("'", '"'): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.stream.unget(data) self.state = self.beforeDoctypeSystemIdentifierState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.stream.unget(data) self.state = self.beforeDoctypeSystemIdentifierState return True def beforeDoctypeSystemIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == "\"": self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierDoubleQuotedState elif data == "'": self.currentToken["systemId"] = "" self.state = self.doctypeSystemIdentifierSingleQuotedState elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.currentToken["correct"] = False self.state = self.bogusDoctypeState return True def doctypeSystemIdentifierDoubleQuotedState(self): data = self.stream.char() if data == "\"": self.state = self.afterDoctypeSystemIdentifierState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["systemId"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["systemId"] += data return True def doctypeSystemIdentifierSingleQuotedState(self): data = self.stream.char() if data == "'": self.state = self.afterDoctypeSystemIdentifierState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) self.currentToken["systemId"] += "\uFFFD" elif data == ">": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-end-of-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.currentToken["systemId"] += data return True def afterDoctypeSystemIdentifierState(self): data = self.stream.char() if data in spaceCharacters: pass elif data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"}) self.currentToken["correct"] = False self.tokenQueue.append(self.currentToken) self.state = self.dataState else: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "unexpected-char-in-doctype"}) self.state = self.bogusDoctypeState return True def bogusDoctypeState(self): data = self.stream.char() if data == ">": self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data is EOF: # XXX EMIT self.stream.unget(data) self.tokenQueue.append(self.currentToken) self.state = self.dataState else: pass return True def cdataSectionState(self): data = [] while True: data.append(self.stream.charsUntil("]")) data.append(self.stream.charsUntil(">")) char = self.stream.char() if char == EOF: break else: assert char == ">" if data[-1][-2:] == "]]": data[-1] = data[-1][:-2] break else: data.append(char) data = "".join(data) # pylint:disable=redefined-variable-type # Deal with null here rather than in the parser nullCount = data.count("\u0000") if nullCount > 0: for _ in range(nullCount): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) data = data.replace("\u0000", "\uFFFD") if data: self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data}) self.state = self.dataState return True