@@ -300,8 +300,41 @@ def eval_possible_next_page_link(
300
300
if REGEXES ['prevLink' ].search (link_data ):
301
301
candidate .score -= 200
302
302
303
- # TODO: Score ancestry.
304
- # TODO: Score a bunch of other stuff.
303
+ parent = link .getparent ()
304
+ positive_node_match = False
305
+ negative_node_match = False
306
+ while parent is not None :
307
+ parent_class = parent .get ('class' ) or ''
308
+ parent_id = parent .get ('id' ) or ''
309
+ parent_class_and_id = ' ' .join ([parent_class , parent_id ])
310
+ if not positive_node_match :
311
+ if REGEXES ['page' ].search (parent_class_and_id ):
312
+ positive_node_match = True
313
+ candidate .score += 25
314
+ if not negative_node_match :
315
+ if REGEXES ['negativeRe' ].search (parent_class_and_id ):
316
+ if not REGEXES ['positiveRe' ].search (parent_class_and_id ):
317
+ negative_node_match = True
318
+ candidate .score -= 25
319
+ parent = parent .getparent ()
320
+
321
+ if REGEXES ['page' ].search (href ):
322
+ candidate .score += 25
323
+
324
+ if REGEXES ['extraneous' ].search (href ):
325
+ candidate .score -= 15
326
+
327
+ try :
328
+ link_text_as_int = int (link_text )
329
+
330
+ # Punish 1 since we're either already there, or it's probably before
331
+ # what we want anyways.
332
+ if link_text_as_int == 1 :
333
+ candidate .score -= 10
334
+ else :
335
+ candidate .score += max (0 , 10 - link_text_as_int )
336
+ except ValueError as e :
337
+ pass
305
338
306
339
def find_next_page_link (parsed_urls , url , elem ):
307
340
links = tags (elem , 'a' )
0 commit comments