Avoid reading the whole input when using chardet (by reading chunks o… · awesome-python/html5lib-python@5dea601 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5dea601

Browse files
committed
Avoid reading the whole input when using chardet (by reading chunks of bytes and feeding them to the chardet UniversalDetector until it detects the encoding with enough confidence)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40809
1 parent 32f1321 commit 5dea601

File tree

1 file changed

+14
-4
lines changed

1 file changed

+14
-4
lines changed

src/inputstream.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
4040
#Number of bytes to use when looking for a meta element with
4141
#encoding information
4242
self.numBytesMeta = 512
43+
#Number of bytes to read per chunk when detecting the encoding with chardet
44+
self.numBytesChardet = 100
4345
#Encoding to use if no other information can be found
4446
self.defaultEncoding = "windows-1252"
4547

@@ -84,10 +86,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):
8486
#Guess with chardet, if available
8587
if encoding is None and chardet:
8688
try:
87-
import chardet
88-
buffer = self.rawStream.read()
89-
encoding = chardet.detect(buffer)['encoding']
90-
self.seek(buffer, 0)
89+
from chardet.universaldetector import UniversalDetector
90+
buffers = []
91+
detector = UniversalDetector()
92+
while not detector.done:
93+
buffer = self.rawStream.read(self.numBytesChardet)
94+
if not buffer:
95+
break
96+
buffers.append(buffer)
97+
detector.feed(buffer)
98+
detector.close()
99+
encoding = detector.result['encoding']
100+
self.seek("".join(buffers), 0)
91101
except ImportError:
92102
pass
93103
# If all else fails use the default encoding

0 commit comments

Comments
 (0)
0