diff --git a/scripts/unicode.py b/scripts/unicode.py index 79d7c95..693060a 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -20,7 +20,7 @@ # Since this should not require frequent updates, we just store this # out-of-line and check the unicode.rs file into git. -import fileinput, re, os, sys, operator +import fileinput, re, os, sys preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at @@ -59,7 +59,7 @@ def is_surrogate(n): def fetch(f): if not os.path.exists(os.path.basename(f)): - os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s" + os.system("curl -O http://www.unicode.org/Public/9.0.0/ucd/%s" % f) if not os.path.exists(os.path.basename(f)): @@ -80,7 +80,7 @@ def load_gencats(f): if is_surrogate(cp): continue if range_start >= 0: - for i in xrange(range_start, cp): + for i in range(range_start, cp): udict[i] = data; range_start = -1; if data[1].endswith(", First>"): @@ -150,8 +150,8 @@ def format_table_content(f, content, indent): def load_properties(f, interestingprops): fetch(f) props = {} - re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)") - re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") + re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)") + re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)") for line in fileinput.input(os.path.basename(f)): prop = None @@ -309,7 +309,7 @@ def emit_break_module(f, break_table, break_cats, name): # download and parse all the data fetch("ReadMe.txt") with open("ReadMe.txt") as readme: - pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode" + pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" unicode_version = re.search(pattern, readme.read()).groups() rf.write(""" /// The version of [Unicode](http://www.unicode.org/) @@ -342,7 +342,7 @@ def emit_break_module(f, break_table, break_cats, name): for cat in grapheme_cats: grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]]) grapheme_table.sort(key=lambda w: w[0]) - emit_break_module(rf, grapheme_table, grapheme_cats.keys(), "grapheme") + emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme") rf.write("\n") word_cats = load_properties("auxiliary/WordBreakProperty.txt", []) @@ -350,11 +350,11 @@ def emit_break_module(f, break_table, break_cats, name): for cat in word_cats: word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]]) word_table.sort(key=lambda w: w[0]) - emit_break_module(rf, word_table, word_cats.keys(), "word") + emit_break_module(rf, word_table, list(word_cats.keys()), "word") sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", []) sentence_table = [] for cat in sentence_cats: sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]]) sentence_table.sort(key=lambda w: w[0]) - emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence") + emit_break_module(rf, sentence_table, list(sentence_cats.keys()), "sentence") \ No newline at end of file diff --git a/scripts/unicode_gen_breaktests.py b/scripts/unicode_gen_breaktests.py index 21cb1fa..2b22ebc 100755 --- a/scripts/unicode_gen_breaktests.py +++ b/scripts/unicode_gen_breaktests.py @@ -17,23 +17,23 @@ # # Since this should not require frequent updates, we just store this # out-of-line and check the unicode.rs file into git. +from __future__ import print_function import unicode, re, os, fileinput def load_test_data(f, optsplit=[]): - outls = [] - testRe1 = re.compile("^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$") + testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$") unicode.fetch(f) data = [] for line in fileinput.input(os.path.basename(f)): # lines that include a test start with the ÷ character - if len(line) < 2 or line[0:2] != '÷': + if len(line) < 2 or not line.startswith('÷'): continue m = testRe1.match(line) if not m: - print "error: no match on line where test was expected: %s" % line + print("error: no match on line where test was expected: %s" % line) continue # process the characters in this test case @@ -48,9 +48,9 @@ def load_test_data(f, optsplit=[]): # make sure that we have break info for each break! assert len(chars) - 1 == len(info) - outls.append((chars, info)) + data.append((chars, info)) - return outls + return data def process_split_info(s, c, o): outcs = [] @@ -59,7 +59,7 @@ def process_split_info(s, c, o): # are we on a × or a ÷? isX = False - if s[0:2] == '×': + if s.startswith('×'): isX = True # find each instance of '(÷|×) [x.y] ' @@ -81,10 +81,10 @@ def process_split_info(s, c, o): idx = 1 while idx < len(s): - if s[idx:idx+2] == '×': + if s[idx:].startswith('×'): isX = True break - if s[idx:idx+2] == '÷': + if s[idx:].startswith('÷'): isX = False break idx += 1 @@ -172,7 +172,7 @@ def create_grapheme_data(f): stype = "&'static [(&'static str, &'static [&'static str])]" dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]" f.write(" // official Unicode test data\n") - f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n") + f.write(" // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n") unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True) unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True) @@ -187,7 +187,7 @@ def create_words_data(f): wtype = "&'static [(&'static str, &'static [&'static str])]" f.write(" // official Unicode test data\n") - f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n") + f.write(" // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt\n") unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True) def create_sentence_data(f): diff --git a/src/testdata.rs b/src/testdata.rs index 7ce6b82..f6ecb69 100644 --- a/src/testdata.rs +++ b/src/testdata.rs @@ -1,4 +1,4 @@ -// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT +// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -12,7 +12,7 @@ #![allow(missing_docs, non_upper_case_globals, non_snake_case)] // official Unicode test data - // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt + // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.txt pub const TEST_SAME: &'static [(&'static str, &'static [&'static str])] = &[ ("\u{20}\u{20}", &["\u{20}", "\u{20}"]), ("\u{20}\u{308}\u{20}", &["\u{20}\u{308}", "\u{20}"]), ("\u{20}\u{d}", &["\u{20}", "\u{d}"]), ("\u{20}\u{308}\u{d}", &["\u{20}\u{308}", @@ -516,7 +516,7 @@ ]; // official Unicode test data - // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt + // http://www.unicode.org/Public/9.0.0/ucd/auxiliary/WordBreakTest.txt pub const TEST_WORD: &'static [(&'static str, &'static [&'static str])] = &[ ("\u{1}\u{1}", &["\u{1}", "\u{1}"]), ("\u{1}\u{308}\u{1}", &["\u{1}\u{308}", "\u{1}"]), ("\u{1}\u{d}", &["\u{1}", "\u{d}"]), ("\u{1}\u{308}\u{d}", &["\u{1}\u{308}", "\u{d}"]),