10000 PEP8 again ... · Harry0201/python-readability@99efa5c · GitHub

Commit 99efa5c

PEP8 again ...
1 parent a012fd2 commit 99efa5c

1 file changed: +41 −19 lines changed

src/readability_lxml/readability.py

Lines changed: 41 additions & 19 deletions
@@ -108,6 +108,7 @@ def tags(node, *tag_names):
         for e in node.findall('.//%s' % tag_name):
             yield e

+
 def class_weight(e):
     weight = 0
     if e.get('class', None):
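
Note: most of the 41 added lines in this commit are bare blank lines like the
one above. pycodestyle's E302 check expects two blank lines before each
top-level def, and this file had only one. A minimal illustration (not code
from this repository):

    def first():
        pass


    def second():  # two blank lines above, so E302 passes
        pass
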
@@ -126,6 +127,7 @@ def class_weight(e):

     return weight

+
 def score_node(elem):
     content_score = class_weight(elem)
     name = elem.tag.lower()
@@ -146,7 +148,8 @@ def score_node(elem):
 def transform_misused_divs_into_paragraphs(doc):
     for elem in tags(doc, 'div'):
         # transform <div>s that do not contain other block elements into <p>s
-        if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+        if not REGEXES['divToPElementsRe'].search(
+                unicode(''.join(map(tostring, list(elem))))):
             logging.debug("Altering %s to p" % (describe(elem)))
             elem.tag = "p"
             #print "Fixed element "+describe(elem)
@@ -166,12 +169,15 @@ def transform_misused_divs_into_paragraphs(doc):
                 p.text = child.tail
                 child.tail = None
                 elem.insert(pos + 1, p)
-                logging.debug("Inserted %s to %s" % (tounicode(p), describe(elem)))
+                logging.debug("Inserted %s to %s" % (
+                    tounicode(p),
+                    describe(elem)))
                 #print "Inserted "+tounicode(p)+" to "+describe(elem)
             if child.tag == 'br':
                 #print 'Dropped <br> at '+describe(elem)
                 child.drop_tree()

+
 def remove_unlikely_candidates(doc):
     for elem in doc.iter():
         s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
@@ -184,6 +190,7 @@ def remove_unlikely_candidates(doc):
             logging.debug("Removing unlikely candidate - %s" % describe(elem))
             elem.drop_tree()

+
 def get_link_density(elem):
     link_length = 0
     for i in elem.findall(".//a"):
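
For context, get_link_density (whose first lines appear above) measures how
much of an element's text is anchor text. A rough sketch of the idea under
lxml, written as a simplified form rather than this module's exact code:

    from lxml.html import fragment_fromstring

    def link_density_sketch(elem):
        # characters of text inside <a> descendants
        link_length = sum(len(a.text_content()) for a in elem.findall('.//a'))
        total_length = max(len(elem.text_content()), 1)  # avoid dividing by zero
        return float(link_length) / total_length

    nav = fragment_fromstring('<div><a href="#">home</a> <a href="#">next</a></div>')
    print(link_density_sketch(nav))  # ~0.9: almost all of the text is link text
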
@@ -232,17 +239,23 @@ def score_paragraphs(doc, min_text_len):
         if grand_parent_node is not None:
             candidates[grand_parent_node]['content_score'] += content_score / 2.0

-    # Scale the final candidates score based on link density. Good content should have a
-    # relatively small link density (5% or less) and be mostly unaffected by this operation.
+    # Scale the final candidates score based on link density. Good content
+    # should have a relatively small link density (5% or less) and be mostly
+    # unaffected by this operation.
     for elem in ordered:
         candidate = candidates[elem]
         ld = get_link_density(elem)
         score = candidate['content_score']
-        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
+        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
+            score,
+            describe(elem),
+            ld,
+            score * (1 - ld)))
         candidate['content_score'] *= (1 - ld)

     return candidates

+
 def select_best_candidate(candidates):
     sorted_candidates = sorted(candidates.values(),
                                key=lambda x: x['content_score'],
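
The reflowed debug line above also documents the scoring rule: each
candidate's score is scaled by (1 - link density). Two hypothetical
candidates show the effect:

    for score, ld in [(40.0, 0.05), (40.0, 0.60)]:
        print("%6.3f -> %6.3f" % (score, score * (1 - ld)))
    # 40.000 -> 38.000  (article-like, ~5% links: barely touched)
    # 40.000 -> 16.000  (nav-like, 60% links: heavily penalized)
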
@@ -266,6 +279,7 @@ def reverse_tags(node, *tag_names):
         for e in reversed(node.findall('.//%s' % tag_name)):
             yield e

+
 def sanitize(node, candidates, min_text_len):
     for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
         if class_weight(header) < 0 or get_link_density(header) > 0.33:
@@ -293,10 +307,11 @@ def sanitize(node, candidates, min_text_len):
         elif el.text_content().count(",") < 10:
             counts = {}
             for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
-                counts[kind] = len(el.findall('.//%s' %kind))
+                counts[kind] = len(el.findall('.//%s' % kind))
             counts["li"] -= 100

-            content_length = text_length(el)  # Count the text length excluding any surrounding whitespace
+            # Count the text length excluding any surrounding whitespace
+            content_length = text_length(el)
             link_density = get_link_density(el)
             parent_node = el.getparent()
             if parent_node is not None:
@@ -347,26 +362,26 @@ def sanitize(node, candidates, min_text_len):

                 #find x non empty preceding and succeeding siblings
                 i, j = 0, 0
-                x  = 1
+                x = 1
                 siblings = []
                 for sib in el.itersiblings():
                     #logging.debug(sib.text_content())
                     sib_content_length = text_length(sib)
                     if sib_content_length:
-                        i =+ 1
+                        i += 1
                         siblings.append(sib_content_length)
                         if i == x:
                             break
                 for sib in el.itersiblings(preceding=True):
                     #logging.debug(sib.text_content())
                     sib_content_length = text_length(sib)
                     if sib_content_length:
-                        j =+ 1
+                        j += 1
                         siblings.append(sib_content_length)
                         if j == x:
                             break
                 #logging.debug(str(siblings))
-                if siblings and sum(siblings) > 1000 :
+                if siblings and sum(siblings) > 1000:
                     to_remove = False
                     logging.debug("Allowing %s" % describe(el))
                     for desnode in tags(el, "table", "ul", "div"):
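
Two changes in this hunk fix behavior rather than style: "i =+ 1" parses as
"i = (+1)", so the sibling counters were assigned 1 instead of incremented.
A quick demonstration:

    i = 5
    i =+ 1    # assignment of unary plus one
    print(i)  # 1

    j = 5
    j += 1    # augmented assignment
    print(j)  # 6

With x = 1 the loops happened to stop after the first non-empty sibling
either way, so the fix matters if x is ever raised above 1.
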
@@ -388,21 +403,22 @@ def sanitize(node, candidates, min_text_len):


 def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
-    # Now that we have the top candidate, look through its siblings for content that might also be related.
-    # Things like preambles, content split by ads that we removed, etc.
-
+    # Now that we have the top candidate, look through its siblings for
+    # content that might also be related. Things like preambles, content
+    # split by ads that we removed, etc.
     sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
     if enclose_with_html_tag:
         output = document_fromstring('<div/>')
     else:
         output = fragment_fromstring('<div/>')
     best_elem = best_candidate['elem']
     for sibling in best_elem.getparent().getchildren():
-        #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
+        #if isinstance(sibling, NavigableString): continue#in lxml there no
+        # concept of simple text
         append = False
         if sibling is best_elem:
             append = True
-        sibling_key = sibling #HashableElement(sibling)
+        sibling_key = sibling  # HashableElement(sibling)

         # Print out sibling information for debugging.
         if sibling_key in candidates:
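
For reference, the sibling_score_threshold visible above is 20% of the best
candidate's score with a floor of 10; it is used to decide which siblings get
appended to the output. Two hypothetical best scores:

    for best in (120.0, 30.0):
        print(max([10, best * 0.2]))
    # 24.0  (20% of a strong candidate)
    # 10    (the floor wins for weak candidates)
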
@@ -476,16 +492,15 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
             of_acceptable_length = len(cleaned_article or '') >= retry_len
             if ruthless and not of_acceptable_length:
                 ruthless = False
-                continue # try again
+                continue  # try again
             else:
                 return Summary(confidence=confidence,
                                html=cleaned_article,
                                short_title=shorten_title(doc),
                                title=get_title(doc))

     except StandardError as e:
-        #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
-        logging.exception('error getting summary: ' )
+        logging.exception('error getting summary: ')
         raise Unparseable(str(e)), None, sys.exc_info()[2]


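
The unchanged "raise Unparseable(str(e)), None, sys.exc_info()[2]" above is
Python 2's three-expression raise: it substitutes a new exception type while
the third expression keeps the original traceback. Together with
StandardError this pins the module to Python 2; a port would need
"raise Unparseable(str(e)) from e" instead. A sketch of the Python 2 pattern,
with Unparseable standing in for the module's own exception class:

    import sys

    class Unparseable(Exception):
        pass

    try:
        1 / 0
    except StandardError as e:  # Python 2 only; use Exception on Python 3
        raise Unparseable(str(e)), None, sys.exc_info()[2]
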
@@ -533,6 +548,7 @@ def clean_segment_number(segments, index, segment):
     else:
         return segment

+
 def clean_segment_index(segments, index, segment):
     if index == (len(segments) - 1) and segment.lower() == 'index':
         return None
@@ -555,6 +571,7 @@ def clean_segment_short(segments, index, segment):
     else:
         return segment

+
 def clean_segment(segments, index, segment):
     """
     Cleans a single segment of a URL to find the base URL. The base URL is as
@@ -613,6 +630,7 @@ def __init__(self, link_text, href):
         self.href = href
         self.score = 0

+
 def same_domain(lhs, rhs):
     split_lhs = urlparse.urlsplit(lhs)
     split_rhs = urlparse.urlsplit(rhs)
@@ -625,6 +643,7 @@ def same_domain(lhs, rhs):
 def strip_trailing_slash(s):
     return re.sub(r'/$', '', s)

+
 def eval_href(parsed_urls, url, base_url, link):
     raw_href = link.get('href')
     if raw_href is None:
@@ -644,13 +663,15 @@ def eval_href(parsed_urls, url, base_url, link):

     return raw_href, href, True

+
 def eval_link_text(link):
     link_text = clean(link.text_content() or '')
     if REGEXES['extraneous'].search(link_text) or len(link_text) > 25:
         return link_text, False
     else:
         return link_text, True

+
 def find_or_create_page(candidates, href, link_text):
     '''
     Finds or creates a candidate page object for a next-page href. If one
@@ -666,6 +687,7 @@ def find_or_create_page(candidates, href, link_text):
         candidates[href] = candidate
         return candidate, True

+
 def eval_possible_next_page_link(
         parsed_urls, url, base_url, candidates, link):

0 commit comments