Add scoring of next page link ancestry and href · Harry0201/python-readability@32d1764 · GitHub
[go: up one dir, main page]

Skip to content

Commit 32d1764

Browse files
jcharummitechie
authored and committed
Add scoring of next page link ancestry and href
This adds the scoring of next page link candidates' ancestry and href values from the readability algorithm.
1 parent 0951647 commit 32d1764

File tree

1 file changed

+35
-2
lines changed

1 file changed

+35
-2
lines changed

src/readability_lxml/readability.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,41 @@ def eval_possible_next_page_link(
300300
if REGEXES['prevLink'].search(link_data):
301301
candidate.score -= 200
302302

303-
# TODO: Score ancestry.
304-
# TODO: Score a bunch of other stuff.
303+
parent = link.getparent()
304+
positive_node_match = False
305+
negative_node_match = False
306+
while parent is not None:
307+
parent_class = parent.get('class') or ''
308+
parent_id = parent.get('id') or ''
309+
parent_class_and_id = ' '.join([parent_class, parent_id])
310+
if not positive_node_match:
311+
if REGEXES['page'].search(parent_class_and_id):
312+
positive_node_match = True
313+
candidate.score += 25
314+
if not negative_node_match:
315+
if REGEXES['negativeRe'].search(parent_class_and_id):
316+
if not REGEXES['positiveRe'].search(parent_class_and_id):
317+
negative_node_match = True
318+
candidate.score -= 25
319+
parent = parent.getparent()
320+
321+
if REGEXES['page'].search(href):
322+
candidate.score += 25
323+
324+
if REGEXES['extraneous'].search(href):
325+
candidate.score -= 15
326+
327+
try:
328+
link_text_as_int = int(link_text)
329+
330+
# Punish 1 since we're either already there, or it's probably before
331+
# what we want anyways.
332+
if link_text_as_int == 1:
333+
candidate.score -= 10
334+
else:
335+
candidate.score += max(0, 10 - link_text_as_int)
336+
except ValueError as e:
337+
pass
305338

306339
def find_next_page_link(parsed_urls, url, elem):
307340
links = tags(elem, 'a')

0 commit comments

Comments
 (0)
0