Fixes regex declaration in get_encoding. · Harry0201/python-readability@046d2c1 · GitHub
[go: up one dir, main page]

Skip to content

Commit 046d2c1

Browse files
committed
Fixes regex declaration in get_encoding.
Since get_encoding() is only called when the input is *not* already unicode, we need to declare the regexes as bytes patterns so they continue to work in Python 3.
1 parent ce7ca26 commit 046d2c1

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

readability/encoding.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33

44
def get_encoding(page):
55
# Regex for XML and HTML Meta charset declaration
6-
charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
7-
pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
8-
xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
6+
charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
7+
pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
8+
xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
99

1010
declared_encodings = (charset_re.findall(page) +
1111
pragma_re.findall(page) +
@@ -21,7 +21,7 @@ def get_encoding(page):
2121
pass
2222

2323
# Fallback to chardet if declared encodings fail
24-
text = re.sub('</?[^>]*>\s*', ' ', page)
24+
text = re.sub(b'</?[^>]*>\s*', b' ', page)
2525
enc = 'utf-8'
2626
if not text.strip() or len(text) < 10:
2727
return enc # can't guess

0 commit comments

Comments
 (0)
0