Slight refactoring of fragment parsing · awesome-python/html5lib-python@7e25bda · GitHub
[go: up one dir, main page]

Skip to content

Commit 7e25bda

Browse files
committed
Slight refactoring of fragment parsing
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40543
1 parent 3ba4136 commit 7e25bda

File tree

3 files changed

+48
-73
lines changed

3 files changed

+48
-73
lines changed

src/html5parser.py

Lines changed: 39 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -71,32 +71,40 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
7171
"trailingEnd": TrailingEndPhase(self, self.tree)
7272
}
7373

74-
def parse(self, stream, encoding=None):
75-
"""Parse a HTML document into a well-formed tree
76-
77-
stream - a filelike object or string containing the HTML to be parsed
78-
79-
The optional encoding parameter must be a string that indicates
80-
the encoding. If specified, that encoding will be used,
81-
regardless of any BOM or later declaration (such as in a meta
82-
element)
83-
"""
84-
74+
def _parse(self, stream, innerHTML=False, container="div",
75+
encoding=None):
76+
8577
self.tree.reset()
8678
self.firstStartTag = False
8779
self.errors = []
8880

89-
self.phase = self.phases["initial"]
81+
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
82+
parseMeta=innerHTML)
83+
84+
if innerHTML:
85+
self.innerHTML = container.lower()
86+
87+
if self.innerHTML in ('title', 'textarea'):
88+
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
89+
elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
90+
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
91+
elif self.innerHTML == 'plaintext':
92+
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
93+
else:
94+
# contentModelFlag already is PCDATA
95+
#self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
96+
pass
97+
self.phase = self.phases["rootElement"]
98+
self.phase.insertHtmlElement()
99+
self.resetInsertionMode()
100+
else:
101+
self.innerHTML = False
102+
self.phase = self.phases["initial"]
103+
90104
# We only seem to have InBodyPhase testcases where the following is
91105
# relevant ... need others too
92106
self.lastPhase = None
93107

94-
# We don't actually support innerHTML yet but this should allow
95-
# assertions
96-
self.innerHTML = False
97-
98-
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
99-
100108
# XXX This is temporary for the moment so there isn't any other
101109
# changes needed for the parser to work with the iterable tokenizer
102110
for token in self.tokenizer:
@@ -115,9 +123,20 @@ def parse(self, stream, encoding=None):
115123
# When the loop finishes it's EOF
116124
self.phase.processEOF()
117125

126+
def parse(self, stream, encoding=None):
127+
"""Parse a HTML document into a well-formed tree
128+
129+
stream - a filelike object or string containing the HTML to be parsed
130+
131+
The optional encoding parameter must be a string that indicates
132+
the encoding. If specified, that encoding will be used,
133+
regardless of any BOM or later declaration (such as in a meta
134+
element)
135+
"""
136+
self._parse(stream, innerHTML=False, encoding=encoding)
118137
return self.tree.getDocument()
119138

120-
def parseFragment(self, stream, container=None, encoding=None):
139+
def parseFragment(self, stream, container="div", encoding=None):
121140
"""Parse a HTML fragment into a well-formed tree fragment
122141
123142
container - name of the element we're setting the innerHTML property
@@ -130,50 +149,7 @@ def parseFragment(self, stream, container=None, encoding=None):
130149
regardless of any BOM or later declaration (such as in a meta
131150
element)
132151
"""
133-
134-
self.tree.reset()
135-
self.firstStartTag = False
136-
self.errors = []
137-
138-
self.innerHTML = container and container.lower() or 'div'
139-
140-
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
141-
if self.innerHTML in ('title', 'textarea'):
142-
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
143-
elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
144-
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
145-
elif self.innerHTML == 'plaintext':
146-
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
147-
else:
148-
# contentModelFlag already is PCDATA
149-
#self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
150-
pass
151-
152-
self.phase = self.phases["rootElement"]
153-
self.phase.insertHtmlElement()
154-
self.resetInsertionMode()
155-
# We only seem to have InBodyPhase testcases where the following is
156-
# relevant ... need others too
157-
self.lastPhase = None
158-
159-
# XXX This is temporary for the moment so there isn't any other
160-
# changes needed for the parser to work with the iterable tokenizer
161-
for token in self.tokenizer:
162-
token = self.normalizeToken(token)
163-
type = token["type"]
164-
method = getattr(self.phase, "process%s" % type, None)
165-
if type in ("Characters", "SpaceCharacters", "Comment"):
166-
method(token["data"])
167-
elif type in ("StartTag", "Doctype"):
168-
method(token["name"], token["data"])
169-
elif type == "EndTag":
170-
method(token["name"])
171-
else:
172-
self.parseError(token["data"])
173-
174-
# When the loop finishes it's EOF
175-
self.phase.processEOF()
176-
152+
self._parse(stream, True, container=container, encoding=encoding)
177153
return self.tree.getFragment()
178154

179155
def parseError(self, data="XXX ERROR MESSAGE NEEDED"):

src/inputstream.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class HTMLInputStream(object):
1414
1515
"""
1616

17-
def __init__(self, source, encoding=None, chardet=True):
17+
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
1818
"""Initialises the HTMLInputStream.
1919
2020
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -26,6 +26,8 @@ def __init__(self, source, encoding=None, chardet=True):
2626
the encoding. If specified, that encoding will be used,
2727
regardless of any BOM or later declaration (such as in a meta
2828
element)
29+
30+
parseMeta - Look for a <meta> element containing encoding information
2931
3032
"""
3133
# List of where new lines occur
@@ -41,12 +43,9 @@ def __init__(self, source, encoding=None, chardet=True):
4143
#Encoding to use if no other information can be found
4244
self.defaultEncoding = "windows-1252"
4345

44-
#Autodetect encoding if no other information can be found?
45-
self.chardet = chardet
46-
4746
#Detect encoding iff no explicit "transport level" encoding is supplied
4847
if encoding is None or not isValidEncoding(encoding):
49-
encoding = self.detectEncoding()
48+
encoding = self.detectEncoding(parseMeta, chardet)
5049
self.charEncoding = encoding
5150

5251
# Read bytes from stream decoding them into Unicode
@@ -79,17 +78,17 @@ def openStream(self, source):
7978
stream = cStringIO.StringIO(str(source))
8079
return stream
8180

82-
def detectEncoding(self):
81+
def detectEncoding(self, parseMeta=True, chardet=True):
8382

8483
#First look for a BOM
8584
#This will also read past the BOM if present
8685
encoding = self.detectBOM()
8786
#If there is no BOM need to look for meta elements with encoding
8887
#information
89-
if encoding is None:
88+
if encoding is None and parseMeta:
9089
encoding = self.detectEncodingMeta()
9190
#Guess with chardet, if available
92-
if encoding is None and self.chardet:
91+
if encoding is None and chardet:
9392
try:
9493
import chardet
9594
buffer = self.rawStream.read()

src/tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ class HTMLTokenizer(object):
3232

3333
# XXX need to fix documentation
3434

35-
def __init__(self, stream, encoding=None):
36-
self.stream = HTMLInputStream(stream, encoding)
35+
def __init__(self, stream, encoding=None, parseMeta=True):
36+
self.stream = HTMLInputStream(stream, encoding, parseMeta)
3737

3838
self.states = {
3939
"data":self.dataState,

0 commit comments

Comments
 (0)
0