from lxml.html import document_fromstring
from lxml.html import fragment_fromstring

- from cleaners import clean_attributes
- from cleaners import html_cleaner
- from htmls import build_doc
- from htmls import get_body
- from htmls import get_title
- from htmls import shorten_title
+ from .cleaners import clean_attributes
+ from .cleaners import html_cleaner
+ from .htmls import build_doc
+ from .htmls import get_body
+ from .htmls import get_title
+ from .htmls import shorten_title
from encoding import get_encoding
from debug import describe, text_content, open_in_browser

log = logging.getLogger('readbility.readability')
+ StandardError = Exception  # in python3

REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
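
A note on the added StandardError alias: StandardError was the common base class of built-in exceptions on Python 2 and no longer exists on Python 3, so the alias keeps the "except StandardError" clause later in this file working on both interpreters. The same idea written defensively, as a sketch (the try/except form is an illustration, not what this commit does):

    try:
        StandardError             # present on Python 2
    except NameError:             # name removed in Python 3
        StandardError = Exception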
@@ -68,7 +69,8 @@ def compile_pattern(elements):
        return None
    if isinstance(elements, regexp_type):
        return elements
-     if isinstance(elements, basestring):
+
+     if isinstance(elements, _basestring):
        elements = elements.split(',')
    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)

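The new _basestring name is presumably a module-level compatibility alias, since the basestring builtin is Python-2-only; its definition is not shown in this diff. A minimal sketch of how such an alias is typically written (assumed, not confirmed by this commit):

    try:
        _basestring = basestring  # Python 2: covers str and unicode
    except NameError:
        _basestring = str         # Python 3: str is the only string type

With that alias in place, compile_pattern("news-item,block") and compile_pattern(["news-item", "block"]) both collapse to a single alternation regexp over the escaped, lower-cased keywords.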
@@ -78,7 +80,8 @@ class Document:
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

-     def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
+     def __init__(self, input, positive_keywords=None, negative_keywords=None,
+                  **options):
        """Generate the document

        :param input: string of the html content.
@@ -88,8 +91,11 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None, **opti
        - min_text_length:
        - retry_length:
        - url: will allow adjusting links to be absolute
-         - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
-         - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
+         - positive_keywords: the list of positive search patterns in
+           classes and ids, for example: ["news-item", "block"]
+         - negative_keywords: the list of negative search patterns
+           in classes and ids, for example:
+           ["mysidebar", "related", "ads"]
        Also positive_keywords and negative_keywords could be a regexp.
        """
        self.input = input
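For context, a short usage sketch of the constructor documented above (file name and keyword choices invented for illustration):

    html = open("article.html").read()
    doc = Document(html,
                   positive_keywords=["news-item", "block"],
                   negative_keywords=["mysidebar", "related", "ads"])
    print(doc.summary())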
@@ -184,7 +190,7 @@ def summary(self, html_partial=False):
                    continue
                else:
                    return cleaned_article
-         except StandardError, e:
+         except StandardError as e:
            log.exception('error getting summary: ')
            raise Unparseable(str(e)), None, sys.exc_info()[2]

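The "except StandardError as e" spelling is accepted by both Python 2.6+ and Python 3, while the removed comma form is Python-2-only. The three-expression raise on the following context line is itself still Python-2-only syntax; a cross-version equivalent would look like this sketch (using the third-party six helper, which this project is not shown to depend on):

    import sys
    import six

    six.reraise(Unparseable, Unparseable(str(e)), sys.exc_info()[2])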
@@ -208,7 +214,9 @@ def get_article(self, candidates, best_candidate, html_partial=False):
            if sibling is best_elem:
                append = True
            sibling_key = sibling  # HashableElement(sibling)
-             if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
+             if sibling_key in candidates and \
+                 candidates[sibling_key]['content_score'] >= \
+                 sibling_score_threshold:
                append = True

            if sibling.tag == "p":
@@ -218,30 +226,37 @@ def get_article(self, candidates, best_candidate, html_partial=False):

                if node_length > 80 and link_density < 0.25:
                    append = True
-                 elif node_length <= 80 and link_density == 0 and re.search('\.( |$)', node_content):
+                 elif node_length <= 80 \
+                     and link_density == 0 \
+                     and re.search('\.( |$)', node_content):
                    append = True

            if append:
-                 # We don't want to append directly to output, but to the div
+                 # We don't want to append directly to output, but the div
                # in html->body->div
                if html_partial:
                    output.append(sibling)
                else:
                    output.getchildren()[0].getchildren()[0].append(sibling)
-         #if output is not None:
-         #    output.append(best_elem)
+         # if output is not None:
+         #     output.append(best_elem)
        return output

    def select_best_candidate(self, candidates):
        if not candidates:
            return None

-         sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
+         sorted_candidates = sorted(
+             candidates.values(),
+             key=lambda x: x['content_score'],
+             reverse=True
+         )
+
        for candidate in sorted_candidates[:5]:
            elem = candidate['elem']
-             log.info("Top 5 : %6.3f %s: %s" % (
+             log.info("Top 5 : %6.3f %s" % (
                candidate['content_score'],
-                 describe(elem), text_content(elem)))
+                 describe(elem)))

        best_candidate = sorted_candidates[0]
        return best_candidate
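Reading the sibling-append heuristic in this hunk: a sibling longer than 80 characters is kept when links make up less than a quarter of its text, and a shorter one is kept only if it is link-free and ends like a sentence, i.e. contains a period followed by a space or the end of the string. A toy check of that second branch (values invented):

    import re

    node_content = "A short closing remark."   # 23 characters, no links
    link_density = 0
    if len(node_content) <= 80 and link_density == 0 \
            and re.search(r'\.( |$)', node_content):
        print("sibling appended")              # the trailing period matches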
@@ -279,7 +294,8 @@ def score_paragraphs(self, ):
                candidates[parent_node] = self.score_node(parent_node)
                ordered.append(parent_node)

-             if grand_parent_node is not None and grand_parent_node not in candidates:
+             if grand_parent_node is not None and \
+                 grand_parent_node not in candidates:
                candidates[grand_parent_node] = self.score_node(
                    grand_parent_node)
                ordered.append(grand_parent_node)
@@ -318,16 +334,20 @@ def class_weight(self, e):
            if REGEXES['positiveRe'].search(feature):
                weight += 25

-             if self.positive_keywords and self.positive_keywords.search(feature):
+             if self.positive_keywords and self.positive_keywords.search(
+                     feature):
                weight += 25

-             if self.negative_keywords and self.negative_keywords.search(feature):
+             if self.negative_keywords and self.negative_keywords.search(
+                     feature):
                weight -= 25

-         if self.positive_keywords and self.positive_keywords.match('tag-' + e.tag):
+         if self.positive_keywords and self.positive_keywords.match(
+                 'tag-' + e.tag):
            weight += 25

-         if self.negative_keywords and self.negative_keywords.match('tag-' + e.tag):
+         if self.negative_keywords and self.negative_keywords.match(
+                 'tag-' + e.tag):
            weight -= 25

        return weight
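Because compile_pattern also matches against the synthetic string 'tag-' + e.tag, callers can bias scoring by element name as well as by class and id. A hedged illustration (keyword choices invented):

    doc = Document(html,
                   positive_keywords=["article", "tag-article"],  # +25 for matching class/id, +25 for <article> tags
                   negative_keywords=["sidebar", "tag-aside"])    # -25 for sidebar-ish class/id and <aside> tags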
@@ -365,33 +385,33 @@ def transform_misused_divs_into_paragraphs(self):
        for elem in self.tags(self.html, 'div'):
            # transform <div>s that do not contain other block elements into
            # <p>s
-             #FIXME: The current implementation ignores all descendants that
+             # FIXME: The current implementation ignores all descendants that
            # are not direct children of elem
            # This results in incorrect results in case there is an <img>
            # buried within an <a> for example
            if not REGEXES['divToPElementsRe'].search(
                    unicode(''.join(map(tostring, list(elem))))):
-                 #self.debug("Altering %s to p" % describe(elem))
+                 # self.debug("Altering %s to p" % describe(elem))
                elem.tag = "p"
-                 #self.debug("Fixed element "+describe(elem))
+                 # self.debug("Fixed element "+describe(elem))

        for elem in self.tags(self.html, 'div'):
            if elem.text and elem.text.strip():
                p = fragment_fromstring('<p/>')
                p.text = elem.text
                elem.text = None
                elem.insert(0, p)
-                 #print "Appended "+tounicode(p)+" to "+describe(elem)
+                 # print "Appended "+tounicode(p)+" to "+describe(elem)

            for pos, child in reversed(list(enumerate(elem))):
                if child.tail and child.tail.strip():
                    p = fragment_fromstring('<p/>')
                    p.text = child.tail
                    child.tail = None
                    elem.insert(pos + 1, p)
-                     #print "Inserted "+tounicode(p)+" to "+describe(elem)
+                     # print "Inserted "+tounicode(p)+" to "+describe(elem)
                if child.tag == 'br':
-                     #print 'Dropped <br> at '+describe(elem)
+                     # print 'Dropped <br> at '+describe(elem)
                    child.drop_tree()

    def tags(self, node, *tag_names):
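The net effect of the two loops above: div text and element tails are promoted into real <p> nodes so the scorer can treat them as paragraphs, and stray <br> separators are dropped. A before/after sketch (markup invented):

    # before: <div>intro<br/>tail text</div>
    # after:  <div><p>intro</p><p>tail text</p></div>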
@@ -407,7 +427,8 @@ def reverse_tags(self, node, *tag_names):
    def sanitize(self, node, candidates):
        MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
        for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
-             if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
+             if self.class_weight(header) < 0 or \
+                 self.get_link_density(header) > 0.33:
                header.drop_tree()

        for elem in self.tags(node, "form", "iframe", "textarea"):
@@ -421,7 +442,7 @@ def sanitize(self, node, candidates):
            weight = self.class_weight(el)
            if el in candidates:
                content_score = candidates[el]['content_score']
-                 #print '!',el, '-> %6.3f' % content_score
+                 # print '!',el, '-> %6.3f' % content_score
            else:
                content_score = 0
            tag = el.tag
@@ -443,24 +464,26 @@ def sanitize(self, node, candidates):
                parent_node = el.getparent()
                if parent_node is not None:
                    if parent_node in candidates:
-                         content_score = candidates[parent_node]['content_score']
+                         content_score = candidates[
+                             parent_node]['content_score']
                    else:
                        content_score = 0
-                 #if parent_node is not None:
-                     #pweight = self.class_weight(parent_node) + content_score
-                     #pname = describe(parent_node)
-                 #else:
-                     #pweight = 0
-                     #pname = "no parent"
+                 # if parent_node is not None:
+                     # pweight = self.class_weight(parent_node) + content_score
+                     # pname = describe(parent_node)
+                 # else:
+                     # pweight = 0
+                     # pname = "no parent"
                to_remove = False
                reason = ""

-                 #if el.tag == 'div' and counts["img"] >= 1:
-                 #    continue
+                 # if el.tag == 'div' and counts["img"] >= 1:
+                 #     continue
                if content_length and counts["img"] * 100 >= content_length:
                    reason = "too many images (%s) for text " % counts["img"]
                    to_remove = True
-                 elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                 elif counts["li"] > counts["p"] \
+                     and tag != "ul" and tag != "ol":
                    reason = "more <li>s than <p>s"
                    to_remove = True
                elif counts["input"] > (counts["p"] / 3):
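The image test in this hunk effectively demands at least 100 characters of text per image. Worked through with invented numbers: 3 images against 250 characters of text gives 3 * 100 = 300 >= 250, so the element is dropped with reason "too many images (3) for text ".

    counts, content_length = {"img": 3}, 250
    counts["img"] * 100 >= content_length   # True -> element removed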
@@ -544,7 +567,7 @@ def sanitize(self, node, candidates):

        for el in ([node] + [n for n in node.iter()]):
            if not self.options.get('attributes', None):
-                 #el.attrib = {} #FIXME:Checkout the effects of disabling this
+                 # el.attrib = {} #FIXME:Checkout the effects of disabling this
                pass

        self.html = node
@@ -612,7 +635,8 @@ def main():
        file = urllib.urlopen(options.url)
    else:
        file = open(args[0], 'rt')
-     output_encoding = sys.__stdout__.encoding or 'utf-8'  # XXX: a hack, better set PYTHONIOENCODING explicitly
+     output_encoding = sys.__stdout__.encoding or 'utf-8'
+     # XXX: a hack, better set PYTHONIOENCODING explicitly
    html = file.read()  # bytes object
    encoding = get_encoding(html)
    html = html.decode(encoding)
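The tail of main() follows the usual bytes-to-text sequence: read raw bytes, detect the charset with the project's own get_encoding helper, then decode before parsing. Restated as a sketch (the 'utf-8' fallback is an assumption, not shown in the diff):

    raw = open("page.html", "rb").read()   # bytes
    enc = get_encoding(raw) or "utf-8"     # detected charset, assumed fallback
    text = raw.decode(enc)                 # unicode text, ready for Document(text)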