awesome-python
diff --git a/src/tokenizer.py b/src/tokenizer.py
Lines changed: 16 additions & 17 deletions
diff --git a/tests/test_sanitizer.py b/tests/test_sanitizer.py
Lines changed: 2 additions & 2 deletions
@@ -129,7 +129,7 @@ def consumeNumberEntity(self, isHex):
         if isHex:
             allowed = hexDigits
             radix = 16
-
+
         char = u"\uFFFD"
         charStack = []
 
@@ -146,25 +146,24 @@ def consumeNumberEntity(self, isHex):
         # If the integer is between 127 and 160 (so 128 and bigger and 159 and
         # smaller) we need to do the "windows trick".
         if 127 < charAsInt < 160:
-            #XXX - removed parse error from windows 1252 entity for now
-            #we may want to reenable this later
-            #self.tokenQueue.append({"type": "ParseError", "data":
-            #  _("Entity used with illegal number (windows-1252 reference).")})
+            self.tokenQueue.append({"type": "ParseError", "data":
+              _("Entity used with illegal number (windows-1252 reference).")})
 
             charAsInt = entitiesWindows1252[charAsInt - 128]
 
-        # 0 is not a good number.
-        if charAsInt == 0:
-            charAsInt = 65533
-
-        try:
-            # XXX We should have a separate function that does "int" to
-            # "unicodestring" conversion since this doesn't always work
-            # according to hsivonen. Also, unichr has a limitation of 65535
-            char = unichr(charAsInt)
-        except:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _("Numeric entity couldn't be converted to character.")})
+        # 0 is not a good number, neither are illegal Unicode code points.
+        if charAsInt > 0 and charAsInt <= 1114111:
+            try:
+                # XXX We should have a separate function that does "int" to
+                # "unicodestring" conversion since this doesn't always work
+                # according to hsivonen. Also, unichr has a limitation of 65535
+                char = unichr(charAsInt)
+            except:
+                try:
+                    char = eval("u'\\U%08x'" % charAsInt)
+                except:
+                    self.tokenQueue.append({"type": "ParseError", "data":
+                      _("Numeric entity couldn't be converted to character.")})
 
         # Discard the ; if present. Otherwise, put it back on the queue and
         # invoke parseError on parser.
 
@@ -13,7 +13,7 @@ class SanitizeTest(unittest.TestCase):
   def addTest(cls, name, expected, input):
     def test(self, expected=expected, input=input):
         expected = ''.join([token.toxml() for token in html5parser.HTMLParser().
-          parseFragment(expected.encode('utf-8')).childNodes])
+          parseFragment(expected).childNodes])
         self.assertEqual(expected, self.sanitize_html(input))
     setattr(cls, name, test)
   addTest = classmethod(addTest)
@@ -23,7 +23,7 @@ def sanitize_html(self,stream):
        html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
            parseFragment(stream).childNodes])
 
-  def test_should_handle_astral_plane_characters(self):
+  def test_should_handle_astral_plane_characters(self):
     self.assertEqual(u"<p>\U0001d4b5 \U0001d538</p>",
       self.sanitize_html("<p>&#x1d4b5; &#x1d538;</p>"))