31
31
# Terminators of processing instructions ("<? ... >") and comments
# ("<!-- ... -->", tolerating whitespace before the closing ">").
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatetagend too;
#  2) if you change tagfind/attrfind and/or locatetagend the parser will
#     explode, so don't do it.
# see the HTML5 specs section "13.2.5.6 Tag open state",
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state

# Tag name (group 1) followed by any run of whitespace and of slashes that
# are not part of a closing "/>".
tagfind_tolerant = re.compile(
    r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')

# One attribute: group 1 is the name, group 2 the "=value" part (if any) and
# group 3 the value exactly as written, including any enclosing quotes.
attrfind_tolerant = re.compile(r"""
  (
    (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
   )
  (=                                # value indicator
    ('[^']*'                        # LITA-enclosed value
    |"[^"]*"                        # LIT-enclosed value
    |(?!['"])[^>\t\n\r\f ]*         # bare value
    )
   )?
  (?:[\t\n\r\f ]|/(?!>))*           # possibly followed by a space
""", re.VERBOSE)

# Everything from the tag name up to and including the closing ">" when one
# is present.  It has no capturing groups and is matched starting at the tag
# name, i.e. just after "<" or "</".
locatetagend = re.compile(r"""
  [a-zA-Z][^\t\n\r\f />]*           # tag name
  [\t\n\r\f /]*                     # optional whitespace before attribute name
  (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]*  # attribute name
    (?:=                            # value indicator
      (?:'[^']*'                    # LITA-enclosed value
        |"[^"]*"                    # LIT-enclosed value
        |(?!['"])[^>\t\n\r\f ]*     # bare value
       )
     )?
    [\t\n\r\f /]*                   # possibly followed by a space
   )*
   >?
""", re.VERBOSE)
69
+ # The following variables are not used, but are temporarily left for
70
+ # backward compatibility.
43
71
locatestarttagend_tolerant = re .compile (r"""
44
72
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
45
73
(?:[\s/]* # optional whitespace before attribute name
56
84
\s* # trailing whitespace
57
85
""" , re .VERBOSE )
58
86
endendtag = re .compile ('>' )
59
- # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
60
- # </ and the tag name, so maybe this should be fixed
61
87
endtagfind = re .compile (r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>' )
62
88
63
89
# Character reference processing logic specific to attribute values
@@ -141,7 +167,8 @@ def get_starttag_text(self):
141
167
142
168
def set_cdata_mode(self, elem):
    """Switch the parser into CDATA-content mode for element *elem*.

    Stores the lowercased element name in ``self.cdata_elem`` and replaces
    ``self.interesting`` with a pattern that finds the next ``</`` + name
    that is immediately followed by whitespace, ``/`` or ``>``.  Matching
    is ASCII case-insensitive.
    """
    self.cdata_elem = elem.lower()
    # Escape the name so regex metacharacters in it are matched literally
    # rather than being interpreted as pattern syntax.
    self.interesting = re.compile(
        r'</%s(?=[\t\n\r\f />])' % re.escape(self.cdata_elem),
        re.IGNORECASE | re.ASCII)
145
172
146
173
def clear_cdata_mode (self ):
147
174
self .interesting = interesting_normal
@@ -166,7 +193,7 @@ def goahead(self, end):
166
193
# & near the end and see if it's followed by a space or ;.
167
194
amppos = rawdata .rfind ('&' , max (i , n - 34 ))
168
195
if (amppos >= 0 and
169
- not re .compile (r'[\s ;]' ).search (rawdata , amppos )):
196
+ not re .compile (r'[\t\n\r\f ;]' ).search (rawdata , amppos )):
170
197
break # wait till we get all the text
171
198
j = n
172
199
else :
@@ -310,7 +337,7 @@ def parse_html_declaration(self, i):
310
337
return self .parse_bogus_comment (i )
311
338
312
339
# Internal -- parse bogus comment, return length or -1 if not terminated
313
- # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
340
+ # see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
314
341
def parse_bogus_comment (self , i , report = 1 ):
315
342
rawdata = self .rawdata
316
343
assert rawdata [i :i + 2 ] in ('<!' , '</' ), ('unexpected call to '
@@ -336,6 +363,8 @@ def parse_pi(self, i):
336
363
337
364
# Internal -- handle starttag, return end or -1 if not terminated
338
365
def parse_starttag (self , i ):
366
+ # See the HTML5 specs section "13.2.5.8 Tag name state"
367
+ # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
339
368
self .__starttag_text = None
340
369
endpos = self .check_for_whole_start_tag (i )
341
370
if endpos < 0 :
@@ -381,76 +410,42 @@ def parse_starttag(self, i):
381
410
# or -1 if incomplete.
382
411
def check_for_whole_start_tag(self, i):
    """Return the index just past the tag starting at *i*, or -1.

    *i* is the index of the opening "<" in ``self.rawdata``; matching
    starts at ``i + 1``, where the tag name is expected.  -1 is returned
    when the text matched so far does not end with ">", i.e. the tag is
    not yet complete in the buffer.
    """
    rawdata = self.rawdata
    # Skip the "<" and match the tag name, its attributes and, when
    # present, the closing ">".
    match = locatetagend.match(rawdata, i+1)
    assert match
    j = match.end()
    # The pattern's trailing ">" is optional, so a successful match that
    # does not end with ">" means the tag is still incomplete.
    if rawdata[j-1] != ">":
        return -1
    return j
414
419
415
420
# Internal -- parse endtag, return end or -1 if incomplete
416
421
def parse_endtag(self, i):
    """Handle the end tag starting at *i*; return its end or -1.

    ``self.rawdata[i:i+2]`` must be "</".  Returns -1 when the tag is not
    yet complete in the buffer.  "</>" is skipped without calling any
    handler, and "</" not followed by an ASCII letter is delegated to
    ``parse_bogus_comment``.  Otherwise ``handle_endtag`` is called with
    the lowercased tag name, CDATA mode is cleared and the index just past
    the closing ">" is returned.
    """
    # See the HTML5 specs section "13.2.5.7 End tag open state"
    # https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
    rawdata = self.rawdata
    assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
    if rawdata.find('>', i+2) < 0:  # fast check
        return -1
    if not endtagopen.match(rawdata, i):  # </ + letter
        if rawdata[i+2:i+3] == '>':  # </> is ignored
            # "missing-end-tag-name" parser error
            return i+3
        else:
            return self.parse_bogus_comment(i)

    # Locate the real end of the tag, skipping over any ">" that sits
    # inside a quoted attribute value.
    match = locatetagend.match(rawdata, i+2)
    assert match
    j = match.end()
    if rawdata[j-1] != ">":
        return -1

    # find the name: "13.2.5.8 Tag name state"
    # https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
    match = tagfind_tolerant.match(rawdata, i+2)
    assert match
    tag = match.group(1).lower()
    self.handle_endtag(tag)
    self.clear_cdata_mode()
    return j
454
449
455
450
# Overridable -- finish processing of start+end tag: <tag.../>
456
451
def handle_startendtag (self , tag , attrs ):
0 commit comments