awesome-python
diff --git a/src/html5parser.py b/src/html5parser.py
Lines changed: 39 additions & 63 deletions
diff --git a/src/inputstream.py b/src/inputstream.py
Lines changed: 7 additions & 8 deletions
diff --git a/src/tokenizer.py b/src/tokenizer.py
Lines changed: 2 additions & 2 deletions
@@ -71,32 +71,40 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
             "trailingEnd": TrailingEndPhase(self, self.tree)
         }
 
-    def parse(self, stream, encoding=None):
-        """Parse a HTML document into a well-formed tree
-
-        stream - a filelike object or string containing the HTML to be parsed
-
-        The optional encoding parameter must be a string that indicates
-        the encoding.  If specified, that encoding will be used,
-        regardless of any BOM or later declaration (such as in a meta
-        element)
-        """
-
+    def _parse(self, stream, innerHTML=False, container="div",
+               encoding=None):
+        """Reset state, build the tokenizer and pick the starting phase."""
         self.tree.reset()
         self.firstStartTag = False
         self.errors = []
 
-        self.phase = self.phases["initial"]
+        # Whole documents keep the <meta> encoding prescan; fragments skip it.
+        self.tokenizer = tokenizer.HTMLTokenizer(
+            stream, encoding, parseMeta=not innerHTML)
+        if innerHTML:
+            self.innerHTML = (container or "div").lower()  # None means "div"
+
+            if self.innerHTML in ('title', 'textarea'):
+                self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
+            elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
+                self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
+            elif self.innerHTML == 'plaintext':
+                self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
+            else:
+                # Any other container keeps the tokenizer's initial PCDATA
+                # content model flag, so there is nothing to adjust here.
+                pass
+            self.phase = self.phases["rootElement"]
+            self.phase.insertHtmlElement()
+            self.resetInsertionMode()
+        else:
+            self.innerHTML = False
+            self.phase = self.phases["initial"]
+
         # We only seem to have InBodyPhase testcases where the following is
         # relevant ... need others too
         self.lastPhase = None
 
-        # We don't actually support innerHTML yet but this should allow
-        # assertations
-        self.innerHTML = False
-
-        self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
-
         # XXX This is temporary for the moment so there isn't any other
         # changes needed for the parser to work with the iterable tokenizer
         for token in self.tokenizer:
@@ -115,9 +123,20 @@ def parse(self, stream, encoding=None):
         # When the loop finishes it's EOF
         self.phase.processEOF()
 
+    def parse(self, stream, encoding=None):
+        """Parse an HTML document into a well-formed tree
+
+        stream - a filelike object or string containing the HTML to be parsed
+
+        The optional encoding parameter must be a string that indicates
+        the encoding.  If specified, that encoding will be used,
+        regardless of any BOM or later declaration (such as in a meta
+        element)
+        """
+        self._parse(stream, innerHTML=False, encoding=encoding)
         return self.tree.getDocument()
 
-    def parseFragment(self, stream, container=None, encoding=None):
+    def parseFragment(self, stream, container="div", encoding=None):
         """Parse a HTML fragment into a well-formed tree fragment
         
         container - name of the element we're setting the innerHTML property
@@ -130,50 +149,7 @@ def parseFragment(self, stream, container=None, encoding=None):
         regardless of any BOM or later declaration (such as in a meta
         element)
         """
-
-        self.tree.reset()
-        self.firstStartTag = False
-        self.errors = []
-
-        self.innerHTML = container and container.lower() or 'div'
-
-        self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
-        if self.innerHTML in ('title', 'textarea'):
-            self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
-        elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
-            self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
-        elif self.innerHTML == 'plaintext':
-            self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
-        else:
-            # contentModelFlag already is PCDATA
-            #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
-            pass
-
-        self.phase = self.phases["rootElement"]
-        self.phase.insertHtmlElement()
-        self.resetInsertionMode()
-        # We only seem to have InBodyPhase testcases where the following is
-        # relevant ... need others too
-        self.lastPhase = None
-
-        # XXX This is temporary for the moment so there isn't any other
-        # changes needed for the parser to work with the iterable tokenizer
-        for token in self.tokenizer:
-            token = self.normalizeToken(token)
-            type = token["type"]
-            method = getattr(self.phase, "process%s" % type, None)
-            if type in ("Characters", "SpaceCharacters", "Comment"):
-                method(token["data"])
-            elif type in ("StartTag", "Doctype"):
-                method(token["name"], token["data"])
-            elif type == "EndTag":
-                method(token["name"])
-            else:
-                self.parseError(token["data"])
-
-        # When the loop finishes it's EOF
-        self.phase.processEOF()
-
+        self._parse(stream, True, container=container, encoding=encoding)
         return self.tree.getFragment()
 
     def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
 
@@ -14,7 +14,7 @@ class HTMLInputStream(object):
 
     """
 
-    def __init__(self, source, encoding=None, chardet=True):
+    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         """Initialises the HTMLInputStream.
 
         HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -26,6 +26,8 @@ def __init__(self, source, encoding=None, chardet=True):
         the encoding.  If specified, that encoding will be used,
         regardless of any BOM or later declaration (such as in a meta
         element)
+        
+        parseMeta - Look for a <meta> element containing encoding information
 
         """
         # List of where new lines occur
@@ -41,12 +43,9 @@ def __init__(self, source, encoding=None, chardet=True):
         #Encoding to use if no other information can be found
         self.defaultEncoding = "windows-1252"
 
-        #Autodetect encoding if no other information can be found?
-        self.chardet = chardet
-        
         #Detect encoding iff no explicit "transport level" encoding is supplied
         if encoding is None or not isValidEncoding(encoding):
-            encoding = self.detectEncoding()
+            encoding = self.detectEncoding(parseMeta, chardet)
         self.charEncoding = encoding
 
         # Read bytes from stream decoding them into Unicode
@@ -79,17 +78,17 @@ def openStream(self, source):
             stream = cStringIO.StringIO(str(source))
         return stream
 
-    def detectEncoding(self):
+    def detectEncoding(self, parseMeta=True, chardet=True):
 
         #First look for a BOM
         #This will also read past the BOM if present
         encoding = self.detectBOM()
         #If there is no BOM need to look for meta elements with encoding 
         #information
-        if encoding is None:
+        if encoding is None and parseMeta:
             encoding = self.detectEncodingMeta()
         #Guess with chardet, if avaliable
-        if encoding is None and self.chardet:
+        if encoding is None and chardet:
             try:
                 import chardet
                 buffer = self.rawStream.read()
 
@@ -32,8 +32,8 @@ class HTMLTokenizer(object):
 
     # XXX need to fix documentation
 
-    def __init__(self, stream, encoding=None):
-        self.stream = HTMLInputStream(stream, encoding)
+    def __init__(self, stream, encoding=None, parseMeta=True):
+        self.stream = HTMLInputStream(stream, encoding, parseMeta)
 
         self.states = {
             "data":self.dataState,