Add regression tests for readability results · tim77/python-readability@7980ca8 · GitHub
[go: up one dir, main page]

Skip to content

Commit 7980ca8

Browse files
jcharum authored and mitechie committed
Add regression tests for readability results
These test cases provide a baseline from which we can start improving the readability algorithm and making sure that we do not horribly break anything. Conflicts: src/tests/regression.py
1 parent a700bb8 commit 7980ca8

24 files changed

+8745
-9
lines changed

src/tests/gen_test.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import errno
2+
import os
3+
import os.path
4+
import readability
5+
import sys
6+
import test
7+
import urllib2
8+
import yaml
9+
10+
# Prompt shown before overwriting an existing file; %s is filled with its path.
OVERWRITE_QUESTION = '%s exists; overwrite and continue (y/n)? '
11+
12+
def y_or_n(question):
    """Ask ``question`` on the console until a non-blank answer is given.

    Returns True when the first character of the stripped answer is 'y' or
    'Y', and False for any other non-empty answer.
    """
    answer = ''
    # Blank (or whitespace-only) replies are ignored and the prompt repeats.
    while not answer:
        answer = raw_input(question).strip()
    return answer[0] in ('y', 'Y')
17+
18+
def write_file(test_name, suffix, data):
    """Write ``data`` to the test-data file named ``test_name + suffix``.

    The file is first opened for exclusive creation.  If it already exists,
    the user is asked whether to overwrite it; on a yes the file is truncated
    and rewritten.

    Returns True if the data was written, or False if the user declined to
    overwrite an existing file.  Any OSError other than EEXIST propagates to
    the caller.
    """
    path = os.path.join(test.TEST_DATA_PATH, test_name + suffix)
    # Same value as the old-style literal 0644, but the 0o spelling is valid
    # on Python 2.6+ as well as Python 3.
    mode = 0o644
    try:
        # O_EXCL makes the open fail with EEXIST rather than silently
        # clobbering an existing file.
        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, mode)
    except OSError as e:
        if e.errno != errno.EEXIST:
            # Bare raise re-raises with the original traceback intact.
            raise
        if not y_or_n(OVERWRITE_QUESTION % path):
            return False
        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, mode)
    # The context manager flushes and closes the file (and the underlying
    # descriptor) even if the write fails, instead of leaking it.
    with os.fdopen(fd, 'w') as f:
        f.write(data)
    return True
34+
35+
def gen_test(url, test_name, test_description):
    """Generate the files for one regression test case.

    Three files are written, in this order, via write_file():
      1. a YAML spec holding ``url`` and ``test_description``,
      2. the page body fetched from ``url``,
      3. the ``html`` of the readability summary of that page.

    Returns True if all three were written.  Returns False as soon as
    write_file() reports that a file was not written; files written before
    that point are left in place.
    """
    spec = yaml.dump(
        {'url': url, 'test_description': test_description},
        default_flow_style=False
    )
    if not write_file(test_name, test.YAML_EXTENSION, spec):
        return False

    # The page is only fetched once the spec has been written.
    orig = urllib2.urlopen(url).read()
    if not write_file(test_name, test.ORIGINAL_SUFFIX, orig):
        return False

    summary = readability.Document(orig).summary()
    if not write_file(test_name, test.READABLE_SUFFIX, summary.html):
        return False

    return True
51+
52+
# Command-line help text; %s is filled with the program name.
USAGE = '''
usage: %s <url> <test name> <test description>
'''


def usage(prog_name):
    """Print the usage message with ``prog_name`` substituted in."""
    message = USAGE % prog_name
    print(message)
58+
59+
def main():
    """Command-line entry point: generate a test case from sys.argv.

    Expects exactly three arguments (url, test name, test description).
    With any other argument count the usage message is printed and nothing
    is generated.  If gen_test() reports failure, a notice is printed.
    """
    argv = sys.argv
    # argv[0] is the program name, so three arguments means four entries.
    if len(argv) != 4:
        usage(argv[0])
        return
    url, test_name, test_description = argv[1:]
    if not gen_test(url, test_name, test_description):
        print('test was not fully generated')
69+
70+
# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()

src/tests/regression.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
"""
2+
This module provides a regression test for results of running the readability
3+
algorithm on a variety of different real-world examples. For each page in the
4+
test suite, a benchmark was captured that represents the current readability
5+
results. Note that these are not necessarily ideal results, just the ones used
6+
as a benchmark.
7+
8+
This allows you to tweak and change the readability algorithm and see how it
9+
changes existing results, hopefully for the better.
10+
11+
"""
112
import lxml.html
213
import lxml.html.diff
314
import os
@@ -25,11 +36,14 @@
2536

2637
class ReadabilityTest:
2738

28-
def __init__(self, dir_path, enabled, name, desc, orig_path, rdbl_path):
39+
def __init__(
40+
self, dir_path, enabled, name, desc, notes, orig_path, rdbl_path
41+
):
2942
self.dir_path = dir_path
3043
self.enabled = enabled
3144
self.name = name
3245
self.desc = desc
46+
self.notes = notes
3347
self.orig_path = orig_path
3448
self.rdbl_path = rdbl_path
3549

@@ -63,11 +77,16 @@ def make_readability_test(dir_path, name, spec_dict):
6377
enabled = spec_dict['enabled']
6478
else:
6579
enabled = True
80+
if 'notes' in spec_dict:
81+
notes = spec_dict['notes']
82+
else:
83+
notes = ''
6684
return ReadabilityTest(
6785
dir_path,
6886
enabled,
6987
name,
7088
spec_dict['test_description'],
89+
notes,
7190
make_path(dir_path, name, ORIGINAL_SUFFIX),
7291
make_path(dir_path, name, READABLE_SUFFIX)
7392
)
@@ -99,8 +118,6 @@ def execute_test(test_data):
99118
else:
100119
doc = readability.Document(test_data.orig_html)
101120
summary = doc.summary()
102-
benchmark_doc = (test_data.rdbl_html, 'benchmark')
103-
result_doc = (summary.html, 'result')
104121
diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
105122
return ReadabilityTestResult(test_data, summary.html, diff)
106123

@@ -139,12 +156,14 @@ class ResultSummary():
139156

140157
def __init__(self, result):
141158
doc = lxml.html.fragment_fromstring(result.diff_html)
159+
142160
insertions = doc.xpath('//ins')
143161
insertion_lengths = element_string_lengths(insertions)
144-
deletions = doc.xpath('//del')
145-
deletion_lengths = element_string_lengths(deletions)
146162
self.insertions = sum(insertion_lengths)
147163
self.insertion_blocks = len(insertions)
164+
165+
deletions = doc.xpath('//del')
166+
deletion_lengths = element_string_lengths(deletions)
148167
self.deletions = sum(deletion_lengths)
149168
self.deletion_blocks = len(deletions)
150169
pass
@@ -169,15 +188,17 @@ def output(suffix):
169188
B.A('result', href = output(RESULT_SUFFIX)),
170189
' ',
171190
B.A('diff', href = output(DIFF_SUFFIX))
172-
)
191+
),
192+
B.TD(test.notes)
173193
)
174194
else:
175195
return B.TR(
176196
B.CLASS('skipped'),
177197
B.TD('%s (SKIPPED)' % test.name),
178198
B.TD('N/A'),
179199
B.TD('N/A'),
180-
B.TD('N/A')
200+
B.TD('N/A'),
201+
B.TD(test.notes)
181202
)
182203

183204

@@ -187,7 +208,8 @@ def make_summary_doc(tests_w_results):
187208
B.TH('Test Name'),
188209
B.TH('Inserted (in # of blocks)'),
189210
B.TH('Deleted (in # of blocks)'),
190-
B.TH('Links')
211+
B.TH('Links'),
212+
B.TH('Notes')
191213
)
192214
)
193215
for (test, result) in tests_w_results:
@@ -253,7 +275,7 @@ def write_summary(path, tests_w_results):
253275

254276
def add_css(doc):
255277
style = B.STYLE(CSS, type = 'text/css')
256-
head = B.HEAD(style)
278+
head = B.HEAD(style, content = 'text/html; charset=utf-8')
257279
doc.insert(0, head)
258280

259281

0 commit comments

Comments
 (0)
0