1
1
import unittest
2
2
3
3
from readability_lxml .readability import Document
4
+ from readability_lxml import readability as r
4
5
5
6
6
7
class TestReadabilityDocument (unittest .TestCase ):
@@ -11,3 +12,101 @@ def test_none_input_raises_exception(self):
11
12
12
13
doc = None
13
14
self .assertRaises (ValueError , Document , doc )
15
+
16
+
17
+ class TestFindBaseUrl (unittest .TestCase ):
18
+
19
+ def setUp (self ):
20
+ self .longMessage = True
21
+
22
+ def _assert_url (self , url , expected_base_url , msg = None ):
23
+ actual_base_url = r .find_base_url (url )
24
+ self .assertEqual (expected_base_url , actual_base_url , msg )
25
+
26
+ def _run_urls (self , specs ):
27
+ """
28
+ Asserts expected results on a sequence of specs, where each spec is a
29
+ pair: (URL, expected base URL).
30
+ """
31
+ for spec in specs :
32
+ url = spec [0 ]
33
+ expected = spec [1 ]
34
+ if len (spec ) > 2 :
35
+ msg = spec [2 ]
36
+ else :
37
+ msg = None
38
+ self ._assert_url (url , expected , msg )
39
+
40
+ def test_none (self ):
41
+ self ._assert_url (None , None )
42
+
43
+ def test_no_change (self ):
44
+ url = 'http://foo.com/article'
45
+ self ._assert_url (url , url )
46
+
47
+ def test_extension_stripping (self ):
48
+ specs = [
49
+ (
50
+ 'http://foo.com/article.html' ,
51
+ 'http://foo.com/article' ,
52
+ 'extension should be stripped'
53
+ ),
54
+ (
55
+ 'http://foo.com/path/to/article.html' ,
56
+ 'http://foo.com/path/to/article' ,
57
+ 'extension should be stripped'
58
+ ),
59
+ (
60
+ 'http://foo.com/article.123not' ,
61
+ 'http://foo.com/article.123not' ,
62
+ '123not is not extension'
63
+ ),
64
+ (
65
+ 'http://foo.com/path/to/article.123not' ,
66
+ 'http://foo.com/path/to/article.123not' ,
67
+ '123not is not extension'
68
+ )
69
+ ]
70
+ self ._run_urls (specs )
71
+
72
+ def test_ewcms (self ):
73
+ self ._assert_url (
74
+ 'http://www.ew.com/ew/article/0,,20313460_20369436,00.html' ,
75
+ 'http://www.ew.com/ew/article/0,,20313460_20369436'
76
+ )
77
+
78
+ def test_page_numbers (self ):
79
+ specs = [
80
+ (
81
+ 'http://foo.com/page5.html' ,
82
+ 'http://foo.com' ,
83
+ 'page number should be stripped'
84
+ ),
85
+ (
86
+ 'http://foo.com/path/to/page5.html' ,
87
+ 'http://foo.com/path/to' ,
88
+ 'page number should be stripped'
89
+ ),
90
+ (
91
+ 'http://foo.com/article-5.html' ,
92
+ 'http://foo.com/article' ,
93
+ 'page number should be stripped'
94
+ )
95
+ ]
96
+ self ._run_urls (specs )
97
+
98
+ def test_numbers (self ):
99
+ specs = [
100
+ (
101
+ 'http://foo.com/5.html' ,
102
+ 'http://foo.com' ,
103
+ 'number should be stripped'
104
+ ),
105
+ (
106
+ 'http://foo.com/path/to/5.html' ,
107
+ 'http://foo.com/path/to' ,
108
+ 'number should be stripped'
109
+ )
110
+ ]
111
+ self ._run_urls (specs )
112
+
0 commit comments