import chardet

def get_encoding(page):
+    # Regexes for the XML and HTML meta charset declarations
+    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+
+    declared_encodings = (charset_re.findall(page) +
+                          pragma_re.findall(page) +
+                          xml_re.findall(page))
+
+    # Try any declared encodings
+    if len(declared_encodings) > 0:
+        for declared_encoding in declared_encodings:
+            try:
+                page.decode(custom_decode(declared_encoding))
+                return custom_decode(declared_encoding)
+            # a declared name may be unknown to Python, which raises
+            # LookupError rather than UnicodeDecodeError
+            except (UnicodeDecodeError, LookupError):
+                pass
+
+    # Fall back to chardet if the declared encodings fail
    text = re.sub(r'</?[^>]*>\s*', ' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        return enc  # can't guess
-    try:
-        diff = text.decode(enc, 'ignore').encode(enc)
-        sizes = len(diff), len(text)
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01:  # 99% of utf-8
-            return enc
-    except UnicodeDecodeError:
-        pass
    res = chardet.detect(text)
    enc = res['encoding'] or enc  # chardet may return None; keep the utf-8 default
    #print '->', enc, "%.2f" % res['confidence']
-    if enc == 'MacCyrillic':
-        enc = 'cp1251'
+    enc = custom_decode(enc)
    return enc
+
+def custom_decode(encoding):
+    """Overrides the encoding when the declared or detected
+    charset is a subset of a larger charset. Created because
+    of issues with Chinese websites."""
+    encoding = encoding.lower()
+    alternates = {
+        'big5': 'big5hkscs',
+        'gb2312': 'gb18030',
+        'ascii': 'utf-8',
+        'maccyrillic': 'cp1251',  # lowercase key, since encoding was lower()ed
+    }
+    if encoding in alternates:
+        return alternates[encoding]
+    else:
+        return encoding
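
A quick sanity check of the new path, as a sketch (Python 2, since the module treats page as a raw byte string; the sample markup is made up):

    # the declared gb2312 charset is widened to gb18030 by custom_decode()
    page = '<html><head><meta charset="gb2312"></head><body>hi</body></html>'
    assert get_encoding(page) == 'gb18030'

    # names outside the alternates map pass through custom_decode() unchanged
    assert custom_decode('Big5') == 'big5hkscs'
    assert custom_decode('UTF-8') == 'utf-8'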
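Why the except clause above also catches LookupError (an addition here, not in the original commit): a page can declare a charset name Python has no codec for, and decode() then raises LookupError instead of UnicodeDecodeError:

    >>> '<html></html>'.decode('no-such-charset')
    Traceback (most recent call last):
      ...
    LookupError: unknown encoding: no-such-charset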