@@ -108,6 +108,7 @@ def tags(node, *tag_names):
         for e in node.findall('.//%s' % tag_name):
             yield e
 
+
 def class_weight(e):
     weight = 0
     if e.get('class', None):
@@ -126,6 +127,7 @@ def class_weight(e):
 
     return weight
 
+
 def score_node(elem):
     content_score = class_weight(elem)
     name = elem.tag.lower()
@@ -146,7 +148,8 @@ def score_node(elem):
 def transform_misused_divs_into_paragraphs(doc):
     for elem in tags(doc, 'div'):
         # transform <div>s that do not contain other block elements into <p>s
-        if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+        if not REGEXES['divToPElementsRe'].search(
+                unicode(''.join(map(tostring, list(elem))))):
             logging.debug("Altering %s to p" % (describe(elem)))
             elem.tag = "p"
             #print "Fixed element "+describe(elem)
@@ -166,12 +169,15 @@ def transform_misused_divs_into_paragraphs(doc):
                 p.text = child.tail
                 child.tail = None
                 elem.insert(pos + 1, p)
-                logging.debug("Inserted %s to %s" % (tounicode(p), describe(elem)))
+                logging.debug("Inserted %s to %s" % (
+                    tounicode(p),
+                    describe(elem)))
                 #print "Inserted "+tounicode(p)+" to "+describe(elem)
             if child.tag == 'br':
                 #print 'Dropped <br> at '+describe(elem)
                 child.drop_tree()
 
+
 def remove_unlikely_candidates(doc):
     for elem in doc.iter():
         s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
@@ -184,6 +190,7 @@ def remove_unlikely_candidates(doc):
             logging.debug("Removing unlikely candidate - %s" % describe(elem))
             elem.drop_tree()
 
+
 def get_link_density(elem):
     link_length = 0
     for i in elem.findall(".//a"):
@@ -232,17 +239,23 @@ def score_paragraphs(doc, min_text_len):
         if grand_parent_node is not None:
             candidates[grand_parent_node]['content_score'] += content_score / 2.0
 
-    # Scale the final candidates score based on link density. Good content should have a
-    # relatively small link density (5% or less) and be mostly unaffected by this operation.
+    # Scale the final candidates score based on link density. Good content
+    # should have a relatively small link density (5% or less) and be mostly
+    # unaffected by this operation.
     for elem in ordered:
         candidate = candidates[elem]
         ld = get_link_density(elem)
         score = candidate['content_score']
-        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score * (1 - ld)))
+        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
+            score,
+            describe(elem),
+            ld,
+            score * (1 - ld)))
         candidate['content_score'] *= (1 - ld)
 
     return candidates
 
+
 def select_best_candidate(candidates):
     sorted_candidates = sorted(candidates.values(),
                                key=lambda x: x['content_score'],
@@ -266,6 +279,7 @@ def reverse_tags(node, *tag_names):
         for e in reversed(node.findall('.//%s' % tag_name)):
             yield e
 
+
 def sanitize(node, candidates, min_text_len):
     for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
         if class_weight(header) < 0 or get_link_density(header) > 0.33:
@@ -293,10 +307,11 @@ def sanitize(node, candidates, min_text_len):
         elif el.text_content().count(",") < 10:
             counts = {}
             for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
-                counts[kind] = len(el.findall('.//%s' % kind))
+                counts[kind] = len(el.findall('.//%s' % kind))
             counts["li"] -= 100
 
-            content_length = text_length(el) # Count the text length excluding any surrounding whitespace
+            # Count the text length excluding any surrounding whitespace
+            content_length = text_length(el)
             link_density = get_link_density(el)
             parent_node = el.getparent()
             if parent_node is not None:
@@ -347,26 +362,26 @@ def sanitize(node, candidates, min_text_len):
 
                 #find x non empty preceding and succeeding siblings
                 i, j = 0, 0
-                x  = 1
+                x = 1
                 siblings = []
                 for sib in el.itersiblings():
                     #logging.debug(sib.text_content())
                     sib_content_length = text_length(sib)
                     if sib_content_length:
-                        i =+ 1
+                        i += 1
                         siblings.append(sib_content_length)
                         if i == x:
                             break
                 for sib in el.itersiblings(preceding=True):
                     #logging.debug(sib.text_content())
                     sib_content_length = text_length(sib)
                     if sib_content_length:
-                        j =+ 1
+                        j += 1
                         siblings.append(sib_content_length)
                         if j == x:
                             break
                 #logging.debug(str(siblings))
-                if siblings and sum(siblings) > 1000:
+                if siblings and sum(siblings) > 1000:
                     to_remove = False
                     logging.debug("Allowing %s" % describe(el))
                     for desnode in tags(el, "table", "ul", "div"):
@@ -388,21 +403,22 @@ def sanitize(node, candidates, min_text_len):
 
 
 def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
-    # Now that we have the top candidate, look through its siblings for content that might also be related.
-    # Things like preambles, content split by ads that we removed, etc.
-
+    # Now that we have the top candidate, look through its siblings for
+    # content that might also be related. Things like preambles, content
+    # split by ads that we removed, etc.
     sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
     if enclose_with_html_tag:
         output = document_fromstring('<div/>')
     else:
         output = fragment_fromstring('<div/>')
     best_elem = best_candidate['elem']
     for sibling in best_elem.getparent().getchildren():
-        #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
+        #if isinstance(sibling, NavigableString): continue#in lxml there no
+        # concept of simple text
         append = False
         if sibling is best_elem:
             append = True
-        sibling_key = sibling # HashableElement(sibling)
+        sibling_key = sibling  # HashableElement(sibling)
 
         # Print out sibling information for debugging.
         if sibling_key in candidates:
@@ -476,16 +492,15 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
             of_acceptable_length = len(cleaned_article or '') >= retry_len
             if ruthless and not of_acceptable_length:
                 ruthless = False
-                continue # try again
+                continue  # try again
             else:
                 return Summary(confidence=confidence,
                                html=cleaned_article,
                                short_title=shorten_title(doc),
                                title=get_title(doc))
 
     except StandardError as e:
-        #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
-        logging.exception('error getting summary: ')
+        logging.exception('error getting summary: ')
         raise Unparseable(str(e)), None, sys.exc_info()[2]
 
 
@@ -533,6 +548,7 @@ def clean_segment_number(segments, index, segment):
     else:
         return segment
 
+
 def clean_segment_index(segments, index, segment):
     if index == (len(segments) - 1) and segment.lower() == 'index':
         return None
@@ -555,6 +571,7 @@ def clean_segment_short(segments, index, segment):
     else:
         return segment
 
+
 def clean_segment(segments, index, segment):
     """
     Cleans a single segment of a URL to find the base URL. The base URL is as
@@ -613,6 +630,7 @@ def __init__(self, link_text, href):
         self.href = href
         self.score = 0
 
+
 def same_domain(lhs, rhs):
     split_lhs = urlparse.urlsplit(lhs)
     split_rhs = urlparse.urlsplit(rhs)
@@ -625,6 +643,7 @@ def same_domain(lhs, rhs):
 def strip_trailing_slash(s):
     return re.sub(r'/$', '', s)
 
+
 def eval_href(parsed_urls, url, base_url, link):
     raw_href = link.get('href')
     if raw_href is None:
@@ -644,13 +663,15 @@ def eval_href(parsed_urls, url, base_url, link):
 
     return raw_href, href, True
 
+
 def eval_link_text(link):
     link_text = clean(link.text_content() or '')
     if REGEXES['extraneous'].search(link_text) or len(link_text) > 25:
         return link_text, False
     else:
         return link_text, True
 
+
 def find_or_create_page(candidates, href, link_text):
     '''
     Finds or creates a candidate page object for a next-page href. If one
@@ -666,6 +687,7 @@ def find_or_create_page(candidates, href, link_text):
         candidates[href] = candidate
         return candidate, True
 
+
 def eval_possible_next_page_link(
         parsed_urls, url, base_url, candidates, link):