Clean up tests/changes to merge into 0.3.0.dev · stevejaker/python-readability@d708744 · GitHub
[go: up one dir, main page]

Skip to content

Commit d708744

Browse files
committed
Clean up tests/changes to merge into 0.3.0.dev
1 parent eefb8e1 commit d708744

File tree

4 files changed

+30
-11
lines changed

4 files changed

+30
-11
lines changed

src/readability_lxml/htmls.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,11 @@ def norm_title(title):
5959

6060

6161
def get_title(doc):
62-
title = doc.find('.//title').text
63-
if not title:
64-
return '[no-title]'
62+
title_node = doc.find('.//title')
6563

64+
if not title_node:
65+
return '[no-title]'
66+
title = title_node.text
6667
return norm_title(title)
6768

6869

@@ -74,10 +75,11 @@ def add_match(collection, text, orig):
7475

7576

7677
def shorten_title(doc):
77-
title = doc.find('.//title').text
78-
if not title:
78+
title_node = doc.find('.//title')
79+
if not title_node:
7980
return ''
8081

82+
title = title_node.text
8183
title = orig = norm_title(title)
8284

8385
candidates = set()

src/tests/test_article_only.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ def test_si_sample(self):
2828
sample,
2929
url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
3030
res = doc.summary()
31-
self.assertEqual('<html><body><div><div class', res[0:27])
31+
self.assertEqual('<html><body id="page"><div><div class', res[0:37])
3232

3333
def test_si_sample_html_partial(self):
3434
"""Using the si sample, make sure we can get the article alone."""
3535
sample = load_sample('si-game.sample.html')
3636
doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
3737
res = doc.summary(enclose_with_html_tag=False)
38-
self.assertEqual('<div><div class="', res[0:17])
38+
self.assertEqual('<div id="page"><div class="', res[0:27])
3939

4040
def test_si_sample_full_summary(self):
4141
"""We should parse the doc and get a full summary with confidence"""
@@ -50,7 +50,6 @@ def test_si_sample_full_summary(self):
5050
'res should have an titile attrib')
5151
self.assertTrue(hasattr(res, 'short_title'),
5252
'res should have an short_title attrib')
53-
self.assertEqual('<div><div class="', res.html[0:17])
53+
self.assertEqual('<div id="page"><div class="', res.html[0:27])
5454
self.assertTrue(res.confidence > 50,
5555
'The confidence score should be larger than 50: ' + str(res.confidence))
56-

src/tests/test_readability.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,3 +233,21 @@ def test_basic(self):
233233
for i in deletions:
234234
print('unexpected deletion: %s' % i.xpath('string()'))
235235
self.fail('readability result does not match expected')
236+
237+
238+
class TestIsSuspectedDuplicate(unittest.TestCase):
239+
240+
def setUp(self):
241+
super(TestIsSuspectedDuplicate, self).setUp()
242+
html = load_regression_data('duplicate-page-article.html')
243+
self._article = r.fragment_fromstring(html)
244+
245+
def test_unique(self):
246+
html = load_regression_data('duplicate-page-unique.html')
247+
page = r.fragment_fromstring(html)
248+
self.assertFalse(r.is_suspected_duplicate(self._article, page))
249+
250+
def test_duplicate(self):
251+
html = load_regression_data('duplicate-page-duplicate.html')
252+
page = r.fragment_fromstring(html)
253+
self.assertTrue(r.is_suspected_duplicate(self._article, page))

src/tests/test_sample_articles.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,5 @@ def process_article(article):
2020
sample = load_sample(article)
2121
doc = Document(sample)
2222
res = doc.summary()
23-
failed_msg = "Failed to process the article: " + article
24-
assert '<html><body><div><div class' == res[0:27], failed_msg
23+
failed_msg = "Failed to process the article: " + res[0:37]
24+
assert '<html><body id="page"><div><div class' == res[0:37], failed_msg

0 commit comments

Comments
 (0)
0