add url boost for spectrum · feedly/python-readability@9a0d318 · GitHub
[go: up one dir, main page]

Skip to content
This repository was archived by the owner on Jan 4, 2022. It is now read-only.

Commit 9a0d318

Browse files
committed
add url boost for spectrum
1 parent 0d0503f commit 9a0d318

File tree

3 files changed

+1271
-1
lines changed

3 files changed

+1271
-1
lines changed

readability/readability.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ def score_paragraphs(self):
401401

402402
def class_weight(self, e):
403403
weight = 0
404-
for feature in [e.get("class", None), e.get("id", None)]:
404+
for feature in [e.get("class", None), e.get("id", None), e.get("data-post-url")]:
405405
if feature:
406406
if REGEXES["negativeRe"].search(feature):
407407
weight -= 25
@@ -415,6 +415,9 @@ def class_weight(self, e):
415415
if self.negative_keywords and self.negative_keywords.search(feature):
416416
weight -= 25
417417

418+
if self.url in feature:
419+
weight += 500
420+
418421
if self.positive_keywords and self.positive_keywords.match("tag-" + e.tag):
419422
weight += 25
420423

tests/samples/spectrum.html

Lines changed: 1261 additions & 0 deletions
Large diffs are not rendered by default.

tests/test_article_only.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,9 @@ def test_utf8_kanji(self):
124124
sample = load_sample("utf-8-kanji.sample.html")
125125
doc = Document(sample)
126126
res = doc.summary()
127+
128+
def test_spectrum_ieee(self):
129+
sample = load_sample("spectrum.html")
130+
doc = Document(sample, url="https://spectrum.ieee.org/event-based-camera-chips")
131+
first_sentence = "The pixels in event-based cameras"
132+
self.assertIn(first_sentence, doc.summary())

0 commit comments

Comments
 (0)
0