Added code to check declared encodings first and check them · littletao08/python-readability@3a43a3f · GitHub

Commit 3a43a3f

Added code to check declared encodings first and check them
The declared-encoding check is adapted from kennethreitz/requests/utils.py. I also added some superset-encoding overrides for encodings I have found on Chinese pages that are mishandled by chardet or by the page's own charset declaration.
1 parent 1a4d369 commit 3a43a3f
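
As a quick illustration of the superset problem the commit message describes (a sketch, not part of the commit; the sample character is only an example): GB18030 is a superset of GB2312, so a page that declares gb2312 can still contain characters the declared codec cannot represent, which is why gb2312 is mapped up to gb18030.

# -*- coding: utf-8 -*-
# U+9555 (the "rong" in Zhu Rongji's name) exists in GBK/GB18030 but not in
# GB2312, so trusting the declared charset verbatim can fail on such a page.
ch = u'\u9555'

ch.encode('gb18030')        # succeeds: the superset covers the character
try:
    ch.encode('gb2312')     # fails: not representable in the declared charset
except UnicodeEncodeError:
    print('not representable in gb2312')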

File tree: 1 file changed, +36 −9 lines


readability/encoding.py

Lines changed: 36 additions & 9 deletions
@@ -2,20 +2,47 @@
 import chardet
 
 def get_encoding(page):
+    # Regex for XML and HTML Meta charset declaration
+    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+
+    declared_encodings = (charset_re.findall(page) +
+                          pragma_re.findall(page) +
+                          xml_re.findall(page))
+
+    # Try any declared encodings
+    if len(declared_encodings) > 0:
+        for declared_encoding in declared_encodings:
+            try:
+                page.decode(custom_decode(declared_encoding))
+                return custom_decode(declared_encoding)
+            except UnicodeDecodeError:
+                pass
+
+    # Fallback to chardet if declared encodings fail
     text = re.sub('</?[^>]*>\s*', ' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
         return enc # can't guess
-    try:
-        diff = text.decode(enc, 'ignore').encode(enc)
-        sizes = len(diff), len(text)
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
-            return enc
-    except UnicodeDecodeError:
-        pass
     res = chardet.detect(text)
     enc = res['encoding']
     #print '->', enc, "%.2f" % res['confidence']
-    if enc == 'MacCyrillic':
-        enc = 'cp1251'
+    enc = custom_decode(enc)
     return enc
+
+def custom_decode(encoding):
+    """Overrides encoding when the charset declaration
+    or charset detection names a subset of a larger
+    charset. Created because of issues with Chinese websites."""
+    encoding = encoding.lower()
+    alternates = {
+        'big5': 'big5hkscs',
+        'gb2312': 'gb18030',
+        'ascii': 'utf-8',
+        'maccyrillic': 'cp1251',  # key lowercased so the lower()-cased lookup can match
+    }
+    if encoding in alternates:
+        return alternates[encoding]
+    else:
+        return encoding
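
A minimal usage sketch of the new behaviour (assuming Python 2, which this module targets since it decodes byte strings, and assuming the module is importable as readability.encoding; the sample page is hypothetical): a page that declares gb2312 but is actually encoded as GB18030 now resolves to the superset encoding instead of failing or being misdetected.

# -*- coding: utf-8 -*-
from readability.encoding import get_encoding

# Hypothetical page: the meta tag declares gb2312, but the body contains
# U+9555, which only the GB18030 superset can represent.
page = (u'<html><head><meta charset="gb2312"></head>'
        u'<body>\u9555</body></html>').encode('gb18030')

# charset_re finds 'gb2312', custom_decode() upgrades it to 'gb18030', and the
# trial decode succeeds, so the superset name is returned.
print(get_encoding(page))  # -> gb18030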

0 commit comments
