Adds Python 3.4 support. · Harry0201/python-readability@aa4132f · GitHub
[go: up one dir, main page]

Skip to content

Commit aa4132f

Browse files
committed
Adds Python 3.4 support.
Code now supports Python 2.6, 2.7 and 3.4. Python 3.3 isn't supported because of some issues with the parser and the difference between the old and new `raise` syntax.
1 parent 13cca1d commit aa4132f

File tree

4 files changed

+35
-16
lines changed

4 files changed

+35
-16
lines changed

readability/htmls.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88

99
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
1010

11+
if sys.version_info[0] == 2:
12+
str = unicode
13+
1114
def build_doc(page):
12-
if isinstance(page, unicode):
15+
if isinstance(page, str):
1316
enc = None
1417
page_unicode = page
1518
else:
@@ -33,7 +36,7 @@ def normalize_entities(cur_title):
3336
u'\u00BB': '"',
3437
u'"': '"',
3538
}
36-
for c, r in entities.iteritems():
39+
for c, r in list(entities.items()):
3740
if c in cur_title:
3841
cur_title = cur_title.replace(c, r)
3942

@@ -105,7 +108,7 @@ def shorten_title(doc):
105108

106109
def get_body(doc):
107110
[ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
108-
raw_html = unicode(tostring(doc.body or doc))
111+
raw_html = str(tostring(doc.body or doc))
109112
cleaned = clean_attributes(raw_html)
110113
try:
111114
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?

readability/readability.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
from __future__ import print_function
23
import logging
34
import re
45
import sys
@@ -20,6 +21,8 @@
2021
logging.basicConfig(level=logging.INFO)
2122
log = logging.getLogger()
2223

24+
if sys.version_info[0] == 2:
25+
str = unicode
2326

2427
REGEXES = {
2528
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@@ -81,11 +84,12 @@ def text_length(i):
8184
def compile_pattern(elements):
8285
if not elements:
8386
return None
84-
if isinstance(elements, regexp_type):
87+
elif isinstance(elements, regexp_type):
8588
return elements
86-
if isinstance(elements, basestring):
89+
else:
90+
# assume string or string like object
8791
elements = elements.split(',')
88-
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
92+
return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U)
8993

9094
class Document:
9195
"""Class to build a etree document out of html."""
@@ -195,9 +199,20 @@ def summary(self, html_partial=False):
195199
continue
196200
else:
197201
return cleaned_article
198-
except StandardError, e:
202+
except Exception as e:
199203
log.exception('error getting summary: ')
200-
raise Unparseable(str(e)), None, sys.exc_info()[2]
204+
if sys.version_info[0] == 2:
205+
# This is the only reason why we can't support Python 3.3:
206+
# 3.3s parser fails to accept the old syntax (although this
207+
# code never runs) which would require write this line as:
208+
# write this line as
209+
# Unparseable(str(e))
210+
# but then we loose the traceback information. 3.4 on the
211+
# other hand accepts the old syntax and would only complain
212+
# at runtime.
213+
raise Unparseable(str(e)), None, sys.exc_info()[2]
214+
else:
215+
raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
201216

202217
def get_article(self, candidates, best_candidate, html_partial=False):
203218
# Now that we have the top candidate, look through its siblings for
@@ -247,7 +262,7 @@ def get_article(self, candidates, best_candidate, html_partial=False):
247262
return output
248263

249264
def select_best_candidate(self, candidates):
250-
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
265+
sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True)
251266
for candidate in sorted_candidates[:5]:
252267
elem = candidate['elem']
253268
self.debug("Top 5 : %6.3f %s" % (
@@ -388,7 +403,7 @@ def transform_misused_divs_into_paragraphs(self):
388403
# This results in incorrect results in case there is an <img>
389404
# buried within an <a> for example
390405
if not REGEXES['divToPElementsRe'].search(
391-
unicode(''.join(map(tostring, list(elem))))):
406+
str(''.join(map(str, map(tostring, list(elem)))))):
392407
#self.debug("Altering %s to p" % (describe(elem)))
393408
elem.tag = "p"
394409
#print "Fixed element "+describe(elem)
@@ -609,18 +624,18 @@ def main():
609624

610625
file = None
611626
if options.url:
612-
import urllib
613-
file = urllib.urlopen(options.url)
627+
import urllib.request, urllib.parse, urllib.error
628+
file = urllib.request.urlopen(options.url)
614629
else:
615630
file = open(args[0], 'rt')
616631
enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
617632
try:
618-
print Document(file.read(),
633+
print(Document(file.read(),
619634
debug=options.verbose,
620635
url=options.url,
621636
positive_keywords = options.positive_keywords,
622637
negative_keywords = options.negative_keywords,
623-
).summary().encode(enc, 'replace')
638+
).summary().encode(enc, 'replace'))
624639
finally:
625640
file.close()
626641

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
from __future__ import print_function
23
from setuptools import setup, find_packages
34
import sys
45

@@ -8,7 +9,7 @@
89
mac_ver = platform.mac_ver()[0]
910
mac_ver_no = int(mac_ver.split('.')[1])
1011
if mac_ver_no < 9:
11-
print "Using lxml<2.4"
12+
print("Using lxml<2.4")
1213
lxml_requirement = "lxml<2.4"
1314

1415
setup(

tox.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# and then run "tox" from this directory.
55

66
[tox]
7-
envlist = py26, py27
7+
envlist = py26, py27, py34
88

99
[testenv]
1010
deps=pytest

0 commit comments

Comments
 (0)
0