tim77
diff --git a/‎src/tests/gen_test.py
Lines changed: 71 additions & 0 deletions b/‎src/tests/gen_test.py
Lines changed: 71 additions & 0 deletions
diff --git a/‎src/tests/regression.py
Lines changed: 31 additions & 9 deletions b/‎src/tests/regression.py
Lines changed: 31 additions & 9 deletions
@@ -0,0 +1,71 @@
+import errno
+import os
+import os.path
+import readability
+import sys
+import test
+import urllib2
+import yaml
+
+# Prompt shown by write_file when the target file already exists; the %s
+# placeholder is filled with the path of that file.
+OVERWRITE_QUESTION = '%s exists; overwrite and continue (y/n)? '
+
+def y_or_n(question):
+    """
+    Ask a yes/no question on the console.
+
+    The prompt is repeated until the stripped reply is non-empty.  Returns
+    True when the reply starts with 'y' or 'Y' and False for anything else.
+    """
+    answer = ''
+    while not answer:
+        answer = raw_input(question).strip()
+    return answer[0] in ('y', 'Y')
+
+def write_file(test_name, suffix, data):
+    """
+    Write data to the test data file named test_name + suffix.
+
+    The file lives under test.TEST_DATA_PATH and is first opened with O_EXCL,
+    so an existing file is never clobbered silently: the user is asked, and
+    the file is only truncated and rewritten after a 'y' answer.
+
+    Returns True if the data was written and False if the user declined to
+    overwrite an existing file.  Any OSError other than EEXIST is re-raised.
+    """
+    path = os.path.join(test.TEST_DATA_PATH, test_name + suffix)
+    mode = 0o644
+    try:
+        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, mode)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            # A bare raise keeps the original traceback intact.
+            raise
+        if not y_or_n(OVERWRITE_QUESTION % path):
+            return False
+        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, mode)
+    # Close the file deterministically so the data is flushed before the
+    # caller carries on, rather than whenever the object is collected.
+    with os.fdopen(fd, 'w') as f:
+        f.write(data)
+    return True
+
+def gen_test(url, test_name, test_description):
+    """
+    Generate the data files for a new test case.
+
+    Three files are written through write_file, in this order:
+
+      * test_name + test.YAML_EXTENSION: a YAML spec holding the url and
+        the test description
+      * test_name + test.ORIGINAL_SUFFIX: the body downloaded from url
+      * test_name + test.READABLE_SUFFIX: the html of the readability
+        summary of that body
+
+    Returns True when all three files were written.  Returns False as soon
+    as one write is declined; files written before that point are left in
+    place.
+    """
+    spec_dict = {'url': url, 'test_description': test_description}
+    spec = yaml.dump(spec_dict, default_flow_style = False)
+    if not write_file(test_name, test.YAML_EXTENSION, spec):
+        return False
+
+    # urllib2 responses cannot be used in a with statement, so close the
+    # connection explicitly once the body has been read (or reading failed).
+    response = urllib2.urlopen(url)
+    try:
+        orig = response.read()
+    finally:
+        response.close()
+    if not write_file(test_name, test.ORIGINAL_SUFFIX, orig):
+        return False
+
+    rdbl_doc = readability.Document(orig)
+    summary = rdbl_doc.summary()
+    if not write_file(test_name, test.READABLE_SUFFIX, summary.html):
+        return False
+
+    return True
+
+# Command line help; %s is replaced with the program name.
+USAGE = '''
+usage: %s <url> <test name> <test description>
+'''
+
+def usage(prog_name):
+    """Print the command line help, naming the program as prog_name."""
+    message = USAGE % prog_name
+    print(message)
+
+def main():
+    """
+    Command line entry point.
+
+    Expects exactly three arguments: url, test name and test description.
+    With any other argument count the usage text is printed and nothing
+    else happens.  Otherwise gen_test is run and a notice is printed if it
+    reports that the test was not fully generated.
+    """
+    args = sys.argv[1:]
+    if len(args) != 3:
+        usage(sys.argv[0])
+        return
+    url, test_name, test_description = args
+    if not gen_test(url, test_name, test_description):
+        print('test was not fully generated')
+
+if __name__ == '__main__':
+    main()
@@ -1,3 +1,14 @@
+"""
+This module provides a regression test for results of running the readability
+algorithm on a variety of different real-world examples.  For each page in the
+test suite, a benchmark was captured that represents the current readability
+results.  Note that these are not necessarily ideal results, just the ones used
+as a benchmark.
+
+This allows you to tweak and change the readability algorithm and see how it
+changes existing results, hopefully for the better.
+
+"""
 import lxml.html
 import lxml.html.diff
 import os
@@ -25,11 +36,14 @@
 
 class ReadabilityTest:
 
-    def __init__(self, dir_path, enabled, name, desc, orig_path, rdbl_path):
+    def __init__(
+            self, dir_path, enabled, name, desc, notes, orig_path, rdbl_path
+            ):
         self.dir_path = dir_path
         self.enabled = enabled
         self.name = name
         self.desc = desc
+        self.notes = notes
         self.orig_path = orig_path
         self.rdbl_path = rdbl_path
 
@@ -63,11 +77,16 @@ def make_readability_test(dir_path, name, spec_dict):
         enabled = spec_dict['enabled']
     else:
         enabled = True
+    if 'notes' in spec_dict:
+        notes = spec_dict['notes']
+    else:
+        notes = ''
     return ReadabilityTest(
             dir_path,
             enabled,
             name,
             spec_dict['test_description'],
+            notes,
             make_path(dir_path, name, ORIGINAL_SUFFIX),
             make_path(dir_path, name, READABLE_SUFFIX)
             )
@@ -99,8 +118,6 @@ def execute_test(test_data):
     else:
         doc = readability.Document(test_data.orig_html)
         summary = doc.summary()
-        benchmark_doc = (test_data.rdbl_html, 'benchmark')
-        result_doc = (summary.html, 'result')
         diff = lxml.html.diff.htmldiff(test_data.rdbl_html, summary.html)
         return ReadabilityTestResult(test_data, summary.html, diff)
 
@@ -139,12 +156,14 @@ class ResultSummary():
 
     def __init__(self, result):
         doc = lxml.html.fragment_fromstring(result.diff_html)
+
         insertions = doc.xpath('//ins')
         insertion_lengths = element_string_lengths(insertions)
-        deletions = doc.xpath('//del')
-        deletion_lengths = element_string_lengths(deletions)
         self.insertions = sum(insertion_lengths)
         self.insertion_blocks = len(insertions)
+
+        deletions = doc.xpath('//del')
+        deletion_lengths = element_string_lengths(deletions)
         self.deletions = sum(deletion_lengths)
         self.deletion_blocks = len(deletions)
         pass
@@ -169,15 +188,17 @@ def output(suffix):
                     B.A('result', href = output(RESULT_SUFFIX)),
                     ' ',
                     B.A('diff', href = output(DIFF_SUFFIX))
-                    )
+                    ),
+                B.TD(test.notes)
                 )
     else:
         return B.TR(
                 B.CLASS('skipped'),
                 B.TD('%s (SKIPPED)' % test.name),
                 B.TD('N/A'),
                 B.TD('N/A'),
-                B.TD('N/A')
+                B.TD('N/A'),
+                B.TD(test.notes)
                 )
 
 
@@ -187,7 +208,8 @@ def make_summary_doc(tests_w_results):
                 B.TH('Test Name'),
                 B.TH('Inserted (in # of blocks)'),
                 B.TH('Deleted (in # of blocks)'),
-                B.TH('Links')
+                B.TH('Links'),
+                B.TH('Notes')
                 )
             )
     for (test, result) in tests_w_results:
@@ -253,7 +275,7 @@ def write_summary(path, tests_w_results):
 
 def add_css(doc):
     style = B.STYLE(CSS, type = 'text/css')
-    head = B.HEAD(style)
+    # A charset declaration only takes effect on a <meta http-equiv> element;
+    # a content attribute on <head> itself is ignored.  The attribute dict is
+    # passed the same way B.CLASS passes one.
+    # NOTE(review): assumes B is lxml.html.builder, which provides META.
+    meta = B.META({
+        'http-equiv': 'Content-Type',
+        'content': 'text/html; charset=utf-8'
+        })
+    head = B.HEAD(meta, style)
     doc.insert(0, head)