Merge pull request #64 from martinth/master · AramZS/python-readability@1546587 · GitHub
[go: up one dir, main page]

Skip to content

Commit 1546587

Browse files
committed
Merge pull request buriy#64 from martinth/master
Added python 3 support (Supported: python 2.6, 2.7, 3.3, 3.4). Thanks a lot to @martinth
2 parents 83a7ce6 + 386e48d commit 1546587

File tree

10 files changed

+84
-24
lines changed

10 files changed

+84
-24
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,6 @@ dist
99
/man
1010
nosetests.xml
1111
.coverage
12+
.tox
13+
.idea
14+
.cache

readability/compat/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""
2+
This module contains compatibility helpers for Python 2/3 interoperability.
3+
4+
It mainly exists because there are certain incompatibilities in the Python
5+
syntax that can only be solved by conditionally importing different functions.
6+
"""

readability/compat/three.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
2+
"""
3+
Raise a new exception of type `exc_type` with an existing `traceback`. All
4+
additional (keyword-)arguments are forwarded to `exc_type`
5+
"""
6+
raise exc_type(*args, **kwargs).with_traceback(traceback)

readability/compat/two.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
2+
"""
3+
Raise a new exception of type `exc_type` with an existing `traceback`. All
4+
additional (keyword-)arguments are forwarded to `exc_type`
5+
"""
6+
raise exc_type(*args, **kwargs), None, traceback

readability/encoding.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,33 @@
11
import re
22
import chardet
3+
import sys
34

45
def get_encoding(page):
56
# Regex for XML and HTML Meta charset declaration
6-
charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
7-
pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
8-
xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
7+
charset_re = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
8+
pragma_re = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
9+
xml_re = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
910

1011
declared_encodings = (charset_re.findall(page) +
1112
pragma_re.findall(page) +
1213
xml_re.findall(page))
1314

1415
# Try any declared encodings
15-
if len(declared_encodings) > 0:
16-
for declared_encoding in declared_encodings:
17-
try:
18-
page.decode(custom_decode(declared_encoding))
19-
return custom_decode(declared_encoding)
20-
except UnicodeDecodeError:
21-
pass
16+
for declared_encoding in declared_encodings:
17+
try:
18+
if sys.version_info[0] == 3:
19+
# declared_encoding will actually be bytes but .decode() only
20+
# accepts `str` type. Decode blindly with ascii because no one should
21+
# ever use non-ascii characters in the name of an encoding.
22+
declared_encoding = declared_encoding.decode('ascii', 'replace')
23+
24+
page.decode(custom_decode(declared_encoding))
25+
return custom_decode(declared_encoding)
26+
except UnicodeDecodeError:
27+
pass
2228

2329
# Fallback to chardet if declared encodings fail
24-
text = re.sub('</?[^>]*>\s*', ' ', page)
30+
text = re.sub(b'</?[^>]*>\s*', b' ', page)
2531
enc = 'utf-8'
2632
if not text.strip() or len(text) < 10:
2733
return enc # can't guess

readability/htmls.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88

99
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
1010

11+
if sys.version_info[0] == 2:
12+
str = unicode
13+
1114
def build_doc(page):
12-
if isinstance(page, unicode):
15+
if isinstance(page, str):
1316
enc = None
1417
page_unicode = page
1518
else:
@@ -33,7 +36,7 @@ def normalize_entities(cur_title):
3336
u'\u00BB': '"',
3437
u'&quot;': '"',
3538
}
36-
for c, r in entities.iteritems():
39+
for c, r in entities.items():
3740
if c in cur_title:
3841
cur_title = cur_title.replace(c, r)
3942

@@ -105,7 +108,7 @@ def shorten_title(doc):
105108

106109
def get_body(doc):
107110
[ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
108-
raw_html = unicode(tostring(doc.body or doc))
111+
raw_html = str(tostring(doc.body or doc))
109112
cleaned = clean_attributes(raw_html)
110113
try:
111114
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?

readability/readability.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
from __future__ import print_function
23
import logging
34
import re
45
import sys
@@ -19,6 +20,8 @@
1920

2021
log = logging.getLogger()
2122

23+
if sys.version_info[0] == 2:
24+
str = unicode
2225

2326
REGEXES = {
2427
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@@ -80,11 +83,12 @@ def text_length(i):
8083
def compile_pattern(elements):
8184
if not elements:
8285
return None
83-
if isinstance(elements, regexp_type):
86+
elif isinstance(elements, regexp_type):
8487
return elements
85-
if isinstance(elements, basestring):
88+
else:
89+
# assume string or string like object
8690
elements = elements.split(',')
87-
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
91+
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
8892

8993
class Document:
9094
"""Class to build a etree document out of html."""
@@ -194,9 +198,13 @@ def summary(self, html_partial=False):
194198
continue
195199
else:
196200
return cleaned_article
197-
except StandardError, e:
201+
except Exception as e:
198202
log.exception('error getting summary: ')
199-
raise Unparseable(str(e)), None, sys.exc_info()[2]
203+
if sys.version_info[0] == 2:
204+
from .compat.two import raise_with_traceback
205+
else:
206+
from .compat.three import raise_with_traceback
207+
raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))
200208

201209
def get_article(self, candidates, best_candidate, html_partial=False):
202210
# Now that we have the top candidate, look through its siblings for
@@ -389,7 +397,7 @@ def transform_misused_divs_into_paragraphs(self):
389397
# This results in incorrect results in case there is an <img>
390398
# buried within an <a> for example
391399
if not REGEXES['divToPElementsRe'].search(
392-
unicode(''.join(map(tostring, list(elem))))):
400+
str(''.join(map(str, map(tostring, list(elem)))))):
393401
#self.debug("Altering %s to p" % (describe(elem)))
394402
elem.tag = "p"
395403
#print "Fixed element "+describe(elem)
@@ -612,18 +620,18 @@ def main():
612620

613621
file = None
614622
if options.url:
615-
import urllib
616-
file = urllib.urlopen(options.url)
623+
import urllib.request, urllib.parse, urllib.error
624+
file = urllib.request.urlopen(options.url)
617625
else:
618626
file = open(args[0], 'rt')
619627
enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
620628
try:
621-
print Document(file.read(),
629+
print(Document(file.read(),
622630
debug=options.verbose,
623631
url=options.url,
624632
positive_keywords = options.positive_keywords,
625633
negative_keywords = options.negative_keywords,
626-
).summary().encode(enc, 'replace')
634+
).summary().encode(enc, 'replace'))
627635
finally:
628636
file.close()
629637

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-e .

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
from __future__ import print_function
23
from setuptools import setup, find_packages
34
import sys
45

tox.ini

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Tox (http://tox.testrun.org/) is a tool for running tests
2+
# in multiple virtualenvs. This configuration file will run the
3+
# test suite on all supported python versions. To use it, "pip install tox"
4+
# and then run "tox" from this directory.
5+
6+
[tox]
7+
envlist = py26, py27, py33, py34
8+
9+
[testenv]
10+
deps=pytest
11+
# This creates the virtual envs with --site-packages so already packages
12+
# that are already installed will be reused. This is especially useful on
13+
# Windows. Since we use lxml instead of compiling it locally (which in turn
14+
# requires a Compiler and the build dependencies), you can download
15+
# it from http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml and install it via
16+
# $PYTHONDIR\Scripts\pip.exe install *.whl
17+
sitepackages=True
18+
commands =
19+
pip install -r requirements.txt
20+
py.test

0 commit comments

Comments
 (0)