Update encoding detection to spec, make the entire encoding detection… · gsnedders/html5lib-python@47b430e · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 47b430e

Browse files
committed
Update encoding detection to spec, make the entire encoding detection case-insensitive (it is effectively in spec), and fix bug shown by the update in the encoding stream.
1 parent 19f48dd commit 47b430e

File tree

1 file changed

+37
-35
lines changed

1 file changed

+37
-35
lines changed

src/html5lib/inputstream.py

Lines changed: 37 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ class EncodingBytes(str):
471471
If the position is ever greater than the string length then an exception is
472472
raised"""
473473
def __new__(self, value):
474-
return str.__new__(self, value)
474+
return str.__new__(self, value.lower())
475475

476476
def __init__(self, value):
477477
self._position=-1
@@ -539,14 +539,12 @@ def skipUntil(self, chars):
539539
self._position = p
540540
return None
541541

542-
def matchBytes(self, bytes, lower=False):
542+
def matchBytes(self, bytes):
543543
"""Look for a sequence of bytes at the start of a string. If the bytes
544544
are found return True and advance the position to the byte after the
545545
match. Otherwise return False and leave the position alone"""
546546
p = self.position
547547
data = self[p:p+len(bytes)]
548-
if lower:
549-
data = data.lower()
550548
rv = data.startswith(bytes)
551549
if rv:
552550
self.position += len(bytes)
@@ -557,6 +555,9 @@ def jumpTo(self, bytes):
557555
a match is found advance the position to the last byte of the match"""
558556
newPosition = self[self.position:].find(bytes)
559557
if newPosition > -1:
558+
# XXX: This is ugly, but I can't see a nicer way to fix this.
559+
if self._position == -1:
560+
self._position = 0
560561
self._position += (newPosition + len(bytes)-1)
561562
return True
562563
else:
@@ -581,7 +582,7 @@ def getEncoding(self):
581582
for byte in self.data:
582583
keepParsing = True
583584
for key, method in methodDispatch:
584-
if self.data.matchBytes(key, lower=True):
585+
if self.data.matchBytes(key):
585586
try:
586587
keepParsing = method()
587588
break
@@ -659,72 +660,75 @@ def getAttribute(self):
659660
"""Return a name,value pair for the next attribute in the stream,
660661
if one is found, or None"""
661662
data = self.data
663+
# Step 1 (skip chars)
662664
c = data.skip(spaceCharactersBytes | frozenset("/"))
663-
if c == "<":
664-
data.previous()
665-
return None
666-
elif c == ">" or c is None:
665+
# Step 2
666+
if c in (">", None):
667667
return None
668+
# Step 3
668669
attrName = []
669670
attrValue = []
670-
spaceFound = False
671-
#Step 5 attribute name
671+
#Step 4 attribute name
672672
while True:
673673
if c == "=" and attrName:
674674
break
675675
elif c in spaceCharactersBytes:
676-
spaceFound=True
676+
#Step 6!
677+
c = data.skip()
678+
c = data.next()
677679
break
678-
elif c in ("/", "<", ">"):
680+
elif c in ("/", ">"):
679681
return "".join(attrName), ""
680682
elif c in asciiUppercaseBytes:
681683
attrName.append(c.lower())
684+
elif c == None:
685+
return None
682686
else:
683687
attrName.append(c)
684-
#Step 6
688+
#Step 5
685689
c = data.next()
686690
#Step 7
687-
if spaceFound:
688-
c = data.skip()
689-
#Step 8
690-
if c != "=":
691-
data.previous()
692-
return "".join(attrName), ""
693-
#XXX need to advance position in both spaces and value case
694-
#Step 9
691+
if c != "=":
692+
data.previous()
693+
return "".join(attrName), ""
694+
#Step 8
695695
data.next()
696-
#Step 10
696+
#Step 9
697697
c = data.skip()
698-
#Step 11
698+
#Step 10
699699
if c in ("'", '"'):
700-
#11.1
700+
#10.1
701701
quoteChar = c
702702
while True:
703-
#11.3
703+
#10.2
704704
c = data.next()
705+
#10.3
705706
if c == quoteChar:
706707
data.next()
707708
return "".join(attrName), "".join(attrValue)
708-
#11.4
709+
#10.4
709710
elif c in asciiUppercaseBytes:
710711
attrValue.append(c.lower())
711-
#11.5
712+
#10.5
712713
else:
713714
attrValue.append(c)
714-
elif c in (">", "<"):
715+
elif c == ">":
715716
return "".join(attrName), ""
716717
elif c in asciiUppercaseBytes:
717718
attrValue.append(c.lower())
718719
elif c is None:
719720
return None
720721
else:
721722
attrValue.append(c)
723+
# Step 11
722724
while True:
723725
c = data.next()
724726
if c in spacesAngleBrackets:
725727
return "".join(attrName), "".join(attrValue)
726728
elif c in asciiUppercaseBytes:
727729
attrValue.append(c.lower())
730+
elif c is None:
731+
return None
728732
else:
729733
attrValue.append(c)
730734

@@ -734,10 +738,6 @@ def __init__(self, data):
734738
self.data = data
735739
def parse(self):
736740
try:
737-
#Skip to the first ";"
738-
self.data.jumpTo(";")
739-
self.data.position += 1
740-
self.data.skip()
741741
#Check if the attr name is charset
742742
#otherwise return
743743
self.data.jumpTo("charset")
@@ -753,8 +753,10 @@ def parse(self):
753753
quoteMark = self.data.currentByte
754754
self.data.position += 1
755755
oldPosition = self.data.position
756-
self.data.jumpTo(quoteMark)
757-
return self.data[oldPosition:self.data.position]
756+
if self.data.jumpTo(quoteMark):
757+
return self.data[oldPosition:self.data.position]
758+
else:
759+
return None
758760
else:
759761
#Unquoted value
760762
oldPosition = self.data.position

0 commit comments

Comments
 (0)
0