diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py index 4748a197..dd2a7a49 100644 --- a/html5lib/_tokenizer.py +++ b/html5lib/_tokenizer.py @@ -7,7 +7,7 @@ from .constants import spaceCharacters from .constants import entities -from .constants import asciiLetters, asciiUpper2Lower +from .constants import asciiLetters from .constants import digits, hexDigits, EOF from .constants import tokenTypes, tagTokenTypes from .constants import replacementCharacters @@ -233,7 +233,7 @@ def emitCurrentToken(self): token = self.currentToken # Add token to the queue to be yielded if (token["type"] in tagTokenTypes): - token["name"] = token["name"].translate(asciiUpper2Lower) + token["name"] = token["name"].lower() if token["type"] == tokenTypes["StartTag"]: raw = token["data"] data = attributeMap(raw) @@ -927,7 +927,7 @@ def attributeNameState(self): # start tag token is emitted so values can still be safely appended # to attributes, but we do want to report the parse error in time. self.currentToken["data"][-1][0] = ( - self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) + self.currentToken["data"][-1][0].lower()) for name, _ in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": @@ -1348,10 +1348,10 @@ def beforeDoctypeNameState(self): def doctypeNameState(self): data = self.stream.char() if data in spaceCharacters: - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.currentToken["name"] = self.currentToken["name"].lower() self.state = self.afterDoctypeNameState elif data == ">": - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.currentToken["name"] = self.currentToken["name"].lower() self.tokenQueue.append(self.currentToken) self.state = self.dataState elif data == "\u0000": @@ -1363,7 +1363,7 @@ def doctypeNameState(self): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-doctype-name"}) self.currentToken["correct"] = False - self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower) + self.currentToken["name"] = self.currentToken["name"].lower() self.tokenQueue.append(self.currentToken) self.state = self.dataState else: diff --git a/html5lib/constants.py b/html5lib/constants.py index fe3e237c..d6f8cef1 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -538,14 +538,11 @@ "tr" ]) -asciiLowercase = frozenset(string.ascii_lowercase) asciiUppercase = frozenset(string.ascii_uppercase) asciiLetters = frozenset(string.ascii_letters) digits = frozenset(string.digits) hexDigits = frozenset(string.hexdigits) -asciiUpper2Lower = {ord(c): ord(c.lower()) for c in string.ascii_uppercase} - # Heading elements need to be ordered headingElements = ( "h1", diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 74d829d9..8967f1e2 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -11,7 +11,7 @@ from . import _utils from .constants import ( - spaceCharacters, asciiUpper2Lower, + spaceCharacters, specialElements, headingElements, cdataElements, rcdataElements, tokenTypes, tagTokenTypes, namespaces, @@ -183,8 +183,7 @@ def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and element.namespace == namespaces["mathml"]): return ("encoding" in element.attributes and - element.attributes["encoding"].translate( - asciiUpper2Lower) in + element.attributes["encoding"].lower() in ("text/html", "application/xhtml+xml")) else: return (element.namespace, element.name) in htmlIntegrationPointElements @@ -520,7 +519,7 @@ def processDoctype(self, token): self.tree.insertDoctype(token) if publicId != "": - publicId = publicId.translate(asciiUpper2Lower) + publicId = publicId.lower() if (not correct or token["name"] != "html" or publicId.startswith( @@ -1165,7 +1164,7 @@ def startTagInput(self, token): framesetOK = self.parser.framesetOK self.startTagVoidFormatting(token) if ("type" in token["data"] and - token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + token["data"]["type"].lower() == "hidden"): # input type=hidden doesn't change framesetOK self.parser.framesetOK = framesetOK @@ -1771,7 +1770,7 @@ def startTagStyleScript(self, token): def startTagInput(self, token): if ("type" in token["data"] and - token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + token["data"]["type"].lower() == "hidden"): self.parser.parseError("unexpected-hidden-input-in-table") self.tree.insertElement(token) # XXX associate with form @@ -2512,11 +2511,11 @@ def processStartTag(self, token): def processEndTag(self, token): nodeIndex = len(self.tree.openElements) - 1 node = self.tree.openElements[-1] - if node.name.translate(asciiUpper2Lower) != token["name"]: + if node.name.lower() != token["name"]: self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) while True: - if node.name.translate(asciiUpper2Lower) == token["name"]: + if node.name.lower() == token["name"]: # XXX this isn't in the spec but it seems necessary if self.parser.phase == self.parser.phases["inTableText"]: self.parser.phase.flushCharacters()