WIP: update to support python2 and python3 · Harry0201/python-readability@8048160 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8048160

Browse files
committed
WIP: update to support python2 and python3
1 parent 71294f0 commit 8048160

File tree

4 files changed

+86
-55
lines changed

4 files changed

+86
-55
lines changed

readability/cleaners.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
1-
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
1+
# -*- encoding: utf-8 -*-
2+
3+
# strip out a set of nuisance html attributes that can mess up rendering
4+
# in RSS feeds
5+
26
import re
37
from lxml.html.clean import Cleaner
48

5-
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
9+
bad_attrs = ['width', 'height', 'style',
10+
'[-a-z]*color', 'background[-a-z]*', 'on*']
611
single_quoted = "'[^']+'"
712
double_quoted = '"[^"]+"'
813
non_space = '[^ "\'>]+'
9-
htmlstrip = re.compile("<" # open
10-
"([^>]+) " # prefix
11-
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
12-
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
14+
htmlstrip = re.compile("<" # open
15+
"([^>]+) " # prefix
16+
"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
17+
'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
1318
"([^>]*)" # postfix
1419
">" # end
1520
, re.I)
@@ -20,13 +25,15 @@ def clean_attributes(html):
2025
return html
2126

2227
def normalize_spaces(s):
23-
if not s: return ''
28+
if not s:
29+
return ''
2430
"""replace any sequence of whitespace
2531
characters with a single space"""
2632
return ' '.join(s.split())
2733

2834
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
2935
style=True, links=True, meta=False, add_nofollow=False,
30-
page_structure=False, processing_instructions=True, embedded=False,
31-
frames=False, forms=False, annoying_tags=False, remove_tags=None,
36+
page_structure=False, processing_instructions=True,
37+
embedded=False, frames=False, forms=False,
38+
annoying_tags=False, remove_tags=None,
3239
remove_unknown_tags=False, safe_attrs_only=False)

readability/encoding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import chardet
33
import logging
44

5-
log = logging.getLogger('readbility.encoding')
5+
log = logging.getLogger(__name__)
66

77

88
RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', re.I)

readability/htmls.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import lxml.html
66
import re
77

8-
log = logging.getLogger('readability.htmls')
8+
log = logging.getLogger(__name__)
99

1010
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
1111

readability/readability.py

Lines changed: 68 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,17 @@
88
from lxml.html import document_fromstring
99
from lxml.html import fragment_fromstring
1010

11-
from cleaners import clean_attributes
12-
from cleaners import html_cleaner
13-
from htmls import build_doc
14-
from htmls import get_body
15-
from htmls import get_title
16-
from htmls import shorten_title
11+
from .cleaners import clean_attributes
12+
from .cleaners import html_cleaner
13+
from .htmls import build_doc
14+
from .htmls import get_body
15+
from .htmls import get_title
16+
from .htmls import shorten_title
1717
from encoding import get_encoding
1818
from debug import describe, text_content, open_in_browser
1919

2020
log = logging.getLogger('readbility.readability')
21+
StandardError = Exception in python3
2122

2223
REGEXES = {
2324
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
@@ -68,7 +69,8 @@ def compile_pattern(elements):
6869
return None
6970
if isinstance(elements, regexp_type):
7071
return elements
71-
if isinstance(elements, basestring):
72+
73+
if isinstance(elements, _basestring):
7274
elements = elements.split(',')
7375
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
7476

@@ -78,7 +80,8 @@ class Document:
7880
TEXT_LENGTH_THRESHOLD = 25
7981
RETRY_LENGTH = 250
8082

81-
def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
83+
def __init__(self, input, positive_keywords=None, negative_keywords=None,
84+
**options):
8285
"""Generate the document
8386
8487
:param input: string of the html content.
@@ -88,8 +91,11 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None, **opti
8891
- min_text_length:
8992
- retry_length:
9093
- url: will allow adjusting links to be absolute
91-
- positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
92-
- negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
94+
- positive_keywords: the list of positive search patterns in
95+
classes and ids, for example: ["news-item", "block"]
96+
- negative_keywords: the list of negative
97+
search patterns in classes
98+
and ids, for example: ["mysidebar", "related", "ads"]
9399
Also positive_keywords and negative_keywords could be a regexp.
94100
"""
95101
self.input = input
@@ -184,7 +190,7 @@ def summary(self, html_partial=False):
184190
continue
185191
else:
186192
return cleaned_article
187-
except StandardError, e:
193+
except StandardError as e:
188194
log.exception('error getting summary: ')
189195
raise Unparseable(str(e)), None, sys.exc_info()[2]
190196

@@ -208,7 +214,9 @@ def get_article(self, candidates, best_candidate, html_partial=False):
208214
if sibling is best_elem:
209215
append = True
210216
sibling_key = sibling # HashableElement(sibling)
211-
if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
217+
if sibling_key in candidates and \
218+
candidates[sibling_key]['content_score'] >= \
219+
sibling_score_threshold:
212220
append = True
213221

214222
if sibling.tag == "p":
@@ -218,30 +226,37 @@ def get_article(self, candidates, best_candidate, html_partial=False):
218226

219227
if node_length > 80 and link_density < 0.25:
220228
append = True
221-
elif node_length <= 80 and link_density == 0 and re.search('\.( |$)', node_content):
229+
elif node_length <= 80 \
230+
and link_density == 0 \
231+
and re.search('\.( |$)', node_content):
222232
append = True
223233

224234
if append:
225-
# We don't want to append directly to output, but to the div
235+
# We don't want to append directly to output, but the div
226236
# in html->body->div
227237
if html_partial:
228238
output.append(sibling)
229239
else:
230240
output.getchildren()[0].getchildren()[0].append(sibling)
231-
#if output is not None:
232-
# output.append(best_elem)
241+
# if output is not None:
242+
# output.append(best_elem)
233243
return output
234244

235245
def select_best_candidate(self, candidates):
236246
if not candidates:
237247
return None
238248

239-
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
249+
sorted_candidates = sorted(
250+
candidates.values(),
251+
key=lambda x: x['content_score'],
252+
reverse=True
253+
)
254+
240255
for candidate in sorted_candidates[:5]:
241256
elem = candidate['elem']
242-
log.info("Top 5 : %6.3f %s: %s" % (
257+
log.info("Top 5 : %6.3f %s" % (
243258
candidate['content_score'],
244-
describe(elem), text_content(elem)))
259+
describe(elem)))
245260

246261
best_candidate = sorted_candidates[0]
247262
return best_candidate
@@ -279,7 +294,8 @@ def score_paragraphs(self, ):
279294
candidates[parent_node] = self.score_node(parent_node)
280295
ordered.append(parent_node)
281296

282-
if grand_parent_node is not None and grand_parent_node not in candidates:
297+
if grand_parent_node is not None and \
298+
grand_parent_node not in candidates:
283299
candidates[grand_parent_node] = self.score_node(
284300
grand_parent_node)
285301
ordered.append(grand_parent_node)
@@ -318,16 +334,20 @@ def class_weight(self, e):
318334
if REGEXES['positiveRe'].search(feature):
319335
weight += 25
320336

321-
if self.positive_keywords and self.positive_keywords.search(feature):
337+
if self.positive_keywords and self.positive_keywords.search(
338+
feature):
322339
weight += 25
323340

324-
if self.negative_keywords and self.negative_keywords.search(feature):
341+
if self.negative_keywords and self.negative_keywords.search(
342+
feature):
325343
weight -= 25
326344

327-
if self.positive_keywords and self.positive_keywords.match('tag-' + e.tag):
345+
if self.positive_keywords and self.positive_keywords.match(
346+
'tag-' + e.tag):
328347
weight += 25
329348

330-
if self.negative_keywords and self.negative_keywords.match('tag-' + e.tag):
349+
if self.negative_keywords and self.negative_keywords.match(
350+
'tag-' + e.tag):
331351
weight -= 25
332352

333353
return weight
@@ -365,33 +385,33 @@ def transform_misused_divs_into_paragraphs(self):
365385
for elem in self.tags(self.html, 'div'):
366386
# transform <div>s that do not contain other block elements into
367387
# <p>s
368-
#FIXME: The current implementation ignores all descendants that
388+
# FIXME: The current implementation ignores all descendants that
369389
# are not direct children of elem
370390
# This results in incorrect results in case there is an <img>
371391
# buried within an <a> for example
372392
if not REGEXES['divToPElementsRe'].search(
373393
unicode(''.join(map(tostring, list(elem))))):
374-
#self.debug("Altering %s to p" % describe(elem))
394+
# self.debug("Altering %s to p" % describe(elem))
375395
elem.tag = "p"
376-
#self.debug("Fixed element "+describe(elem))
396+
# self.debug("Fixed element "+describe(elem))
377397

378398
for elem in self.tags(self.html, 'div'):
379399
if elem.text and elem.text.strip():
380400
p = fragment_fromstring('<p/>')
381401
p.text = elem.text
382402
elem.text = None
383403
elem.insert(0, p)
384-
#print "Appended "+tounicode(p)+" to "+describe(elem)
404+
# print "Appended "+tounicode(p)+" to "+describe(elem)
385405

386406
for pos, child in reversed(list(enumerate(elem))):
387407
if child.tail and child.tail.strip():
388408
p = fragment_fromstring('<p/>')
389409
p.text = child.tail
390410
child.tail = None
391411
elem.insert(pos + 1, p)
392-
#print "Inserted "+tounicode(p)+" to "+describe(elem)
412+
# print "Inserted "+tounicode(p)+" to "+describe(elem)
393413
if child.tag == 'br':
394-
#print 'Dropped <br> at '+describe(elem)
414+
# print 'Dropped <br> at '+describe(elem)
395415
child.drop_tree()
396416

397417
def tags(self, node, *tag_names):
@@ -407,7 +427,8 @@ def reverse_tags(self, node, *tag_names):
407427
def sanitize(self, node, candidates):
408428
MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
409429
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
410-
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
430+
if self.class_weight(header) < 0 or \
431+
self.get_link_density(header) > 0.33:
411432
header.drop_tree()
412433

413434
for elem in self.tags(node, "form", "iframe", "textarea"):
@@ -421,7 +442,7 @@ def sanitize(self, node, candidates):
421442
weight = self.class_weight(el)
422443
if el in candidates:
423444
content_score = candidates[el]['content_score']
424-
#print '!',el, '-> %6.3f' % content_score
445+
# print '!',el, '-> %6.3f' % content_score
425446
else:
426447
content_score = 0
427448
tag = el.tag
@@ -443,24 +464,26 @@ def sanitize(self, node, candidates):
443464
parent_node = el.getparent()
444465
if parent_node is not None:
445466
if parent_node in candidates:
446-
content_score = candidates[parent_node]['content_score']
467+
content_score = candidates[
468+
parent_node]['content_score']
447469
else:
448470
content_score = 0
449-
#if parent_node is not None:
450-
#pweight = self.class_weight(parent_node) + content_score
451-
#pname = describe(parent_node)
452-
#else:
453-
#pweight = 0
454-
#pname = "no parent"
471+
# if parent_node is not None:
472+
# pweight = self.class_weight(parent_node) + content_score
473+
# pname = describe(parent_node)
474+
# else:
475+
# pweight = 0
476+
# pname = "no parent"
455477
to_remove = False
456478
reason = ""
457479

458-
#if el.tag == 'div' and counts["img"] >= 1:
459-
# continue
480+
# if el.tag == 'div' and counts["img"] >= 1:
481+
# continue
460482
if content_length and counts["img"] * 100 >= content_length:
461483
reason = "too many images (%s) for text " % counts["img"]
462484
to_remove = True
463-
elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
485+
elif counts["li"] > counts["p"] \
486+
and tag != "ul" and tag != "ol":
464487
reason = "more <li>s than <p>s"
465488
to_remove = True
466489
elif counts["input"] > (counts["p"] / 3):
@@ -544,7 +567,7 @@ def sanitize(self, node, candidates):
544567

545568
for el in ([node] + [n for n in node.iter()]):
546569
if not self.options.get('attributes', None):
547-
#el.attrib = {} #FIXME:Checkout the effects of disabling this
570+
# el.attrib = {} #FIXME:Checkout the effects of disabling this
548571
pass
549572

550573
self.html = node
@@ -612,7 +635,8 @@ def main():
612635
file = urllib.urlopen(options.url)
613636
else:
614637
file = open(args[0], 'rt')
615-
output_encoding = sys.__stdout__.encoding or 'utf-8' # XXX: a hack, better set PYTHONIOENCODING explicitly
638+
output_encoding = sys.__stdout__.encoding or 'utf-8'
639+
# XXX: a hack, better set PYTHONIOENCODING explicitly
616640
html = file.read() # bytes object
617641
encoding = get_encoding(html)
618642
html = html.decode(encoding)

0 commit comments

Comments
 (0)