Add scoring of next page link ancestry and href

jcharum · mitechie · commit 32d1764e83a5 · 2012-04-21T13:14:25.000-04:00
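This adds scoring of each next page link candidate's ancestry (the class and
id attributes of its ancestor elements), its href value, and its link text
when that text is a number, following the readability algorithm.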
diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py
@@ -300,8 +300,41 @@ def eval_possible_next_page_link(
     if REGEXES['prevLink'].search(link_data):
         candidate.score -= 200
 
-    # TODO: Score ancestry.
-    # TODO: Score a bunch of other stuff.
+    parent = link.getparent()
+    positive_node_match = False
+    negative_node_match = False
+    while parent is not None:
+        parent_class = parent.get('class') or ''
+        parent_id = parent.get('id') or ''
+        parent_class_and_id = ' '.join([parent_class, parent_id])
+        if not positive_node_match:
+            if REGEXES['page'].search(parent_class_and_id):
+                positive_node_match = True
+                candidate.score += 25
+        if not negative_node_match:
+            if REGEXES['negativeRe'].search(parent_class_and_id):
+                if not REGEXES['positiveRe'].search(parent_class_and_id):
+                    negative_node_match = True
+                    candidate.score -= 25
+        parent = parent.getparent()
+
+    if REGEXES['page'].search(href):
+        candidate.score += 25
+
+    if REGEXES['extraneous'].search(href):
+        candidate.score -= 15
+
+    try:
+        link_text_as_int = int(link_text)
+
+        # Penalize a link whose text is "1": we're either already on that
+        # page, or it's probably before the page we want anyway.
+        if link_text_as_int == 1:
+            candidate.score -= 10
+        else:
+            candidate.score += max(0, 10 - link_text_as_int)
+    except ValueError as e:
+        pass
 
 def find_next_page_link(parsed_urls, url, elem):
     links = tags(elem, 'a')