Clean up tests/changes to merge into 0.3.0.dev

mitechie · mitechie · commit d708744822a3 · 2012-04-22T00:04:14.000-04:00
diff --git a/src/readability_lxml/htmls.py b/src/readability_lxml/htmls.py
@@ -59,10 +59,11 @@ def norm_title(title):
 
 
 def get_title(doc):
-    title = doc.find('.//title').text
-    if not title:
-        return '[no-title]'
+    # findtext() yields None when <title> is absent, '' when it is empty.
+    title = doc.findtext('.//title')
+    if not title:
+        return '[no-title]'
 
     return norm_title(title)
 
 
@@ -74,10 +75,11 @@ def add_match(collection, text, orig):
 
 
 def shorten_title(doc):
-    title = doc.find('.//title').text
-    if not title:
+    # findtext() gives None for a missing <title> and '' for an empty one.
+    title = doc.findtext('.//title')
+    if not title:
         return ''
 
     title = orig = norm_title(title)
 
     candidates = set()
diff --git a/src/tests/test_article_only.py b/src/tests/test_article_only.py
@@ -28,14 +28,14 @@ def test_si_sample(self):
             sample,
             url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
         res = doc.summary()
-        self.assertEqual('<html><body><div><div class', res[0:27])
+        self.assertEqual('<html><body id="page"><div><div class', res[0:37])
 
     def test_si_sample_html_partial(self):
         """Using the si sample, make sure we can get the article alone."""
         sample = load_sample('si-game.sample.html')
         doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
         res = doc.summary(enclose_with_html_tag=False)
-        self.assertEqual('<div><div class="', res[0:17])
+        self.assertEqual('<div id="page"><div class="', res[0:27])
 
     def test_si_sample_full_summary(self):
         """We should parse the doc and get a full summary with confidence"""
@@ -50,7 +50,6 @@ def test_si_sample_full_summary(self):
                 'res should have an titile attrib')
         self.assertTrue(hasattr(res, 'short_title'),
             'res should have an short_title attrib')
-        self.assertEqual('<div><div class="', res.html[0:17])
+        self.assertEqual('<div id="page"><div class="', res.html[0:27])
         self.assertTrue(res.confidence > 50,
             'The confidence score should be larger than 50: ' + str(res.confidence))
-
diff --git a/src/tests/test_readability.py b/src/tests/test_readability.py
@@ -233,3 +233,21 @@ def test_basic(self):
             for i in deletions:
                 print('unexpected deletion: %s' % i.xpath('string()'))
             self.fail('readability result does not match expected')
+
+
+class TestIsSuspectedDuplicate(unittest.TestCase):
+
+    def setUp(self):
+        super(TestIsSuspectedDuplicate, self).setUp()
+        html = load_regression_data('duplicate-page-article.html')
+        self._article = r.fragment_fromstring(html)
+
+    def test_unique(self):
+        html = load_regression_data('duplicate-page-unique.html')
+        page = r.fragment_fromstring(html)
+        self.assertFalse(r.is_suspected_duplicate(self._article, page))
+
+    def test_duplicate(self):
+        html = load_regression_data('duplicate-page-duplicate.html')
+        page = r.fragment_fromstring(html)
+        self.assertTrue(r.is_suspected_duplicate(self._article, page))
diff --git a/src/tests/test_sample_articles.py b/src/tests/test_sample_articles.py
@@ -20,5 +20,5 @@ def process_article(article):
     sample = load_sample(article)
     doc = Document(sample)
     res = doc.summary()
-    failed_msg = "Failed to process the article: " + article
-    assert '<html><body><div><div class' == res[0:27], failed_msg
+    failed_msg = "Failed to process the article: " + res[0:37]
+    assert '<html><body id="page"><div><div class' == res[0:37], failed_msg