Merge pull request #9 from unicode-rs/is_combining_mark · froydnj/unicode-normalization@cd9a45e · GitHub
[go: up one dir, main page]

Skip to content

Commit cd9a45e

Browse files
committed
Merge pull request unicode-rs#9 from unicode-rs/is_combining_mark
Add a `char::is_combining_mark` function.
2 parents 5ea0cec + 5d0443b commit cd9a45e

File tree

4 files changed

+157
-5
lines changed

4 files changed

+157
-5
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22

33
name = "unicode-normalization"
4-
version = "0.1.1"
4+
version = "0.1.2"
55
authors = ["kwantam <kwantam@gmail.com>"]
66

77
homepage = "https://github.com/unicode-rs/unicode-normalization"

scripts/unicode.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ def load_unicode_data(f):
6969
combines = {}
7070
canon_decomp = {}
7171
compat_decomp = {}
72+
general_category_mark = []
7273

7374
udict = {};
7475
range_start = -1;
@@ -112,9 +113,13 @@ def load_unicode_data(f):
112113
combines[combine] = []
113114
combines[combine].append(code)
114115

116+
if 'M' in [gencat] + expanded_categories.get(gencat, []):
117+
general_category_mark.append(code)
118+
general_category_mark = group_cat(general_category_mark)
119+
115120
combines = to_combines(group_cats(combines))
116121

117-
return (canon_decomp, compat_decomp, combines)
122+
return (canon_decomp, compat_decomp, combines, general_category_mark)
118123

119124
def group_cats(cats):
120125
cats_out = {}
@@ -225,7 +230,7 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
225230
format_table_content(f, data, 8)
226231
f.write("\n ];\n\n")
227232

228-
def emit_norm_module(f, canon, compat, combine, norm_props):
233+
def emit_norm_module(f, canon, compat, combine, norm_props, general_category_mark):
229234
canon_keys = canon.keys()
230235
canon_keys.sort()
231236

@@ -309,6 +314,31 @@ def comp_pfun(char):
309314
+ " }\n")
310315

311316
f.write("""
317+
fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool {
318+
use std::cmp::Ordering::{Equal, Less, Greater};
319+
r.binary_search_by(|&(lo, hi)| {
320+
if lo <= c && c <= hi {
321+
Equal
322+
} else if hi < c {
323+
Less
324+
} else {
325+
Greater
326+
}
327+
})
328+
.is_ok()
329+
}
330+
331+
/// Return whether the given character is a combining mark (`General_Category=Mark`)
332+
pub fn is_combining_mark(c: char) -> bool {
333+
bsearch_range_table(c, general_category_mark)
334+
}
335+
336+
""")
337+
338+
emit_table(f, "general_category_mark", combine, "&'static [(char, char)]", is_pub=False,
339+
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
340+
341+
f.write("""
312342
}
313343
314344
""")
@@ -332,9 +362,11 @@ def comp_pfun(char):
332362
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
333363
334364
""" % unicode_version)
335-
(canon_decomp, compat_decomp, combines) = load_unicode_data("UnicodeData.txt")
365+
(canon_decomp, compat_decomp, combines, general_category_mark) = \
366+
load_unicode_data("UnicodeData.txt")
336367
norm_props = load_properties("DerivedNormalizationProps.txt",
337368
["Full_Composition_Exclusion"])
338369

339370
# normalizations and conversions module
340-
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
371+
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props,
372+
general_category_mark)

src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ pub mod char {
6262

6363
/// Look up the canonical combining class of a character.
6464
pub use tables::normalization::canonical_combining_class;
65+
66+
/// Return whether the given character is a combining mark (`General_Category=Mark`)
67+
pub use tables::normalization::is_combining_mark;
6568
}
6669

6770

src/tables.rs

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2717,5 +2717,122 @@ pub mod normalization {
27172717
bsearch_range_value_table(c, combining_class_table)
27182718
}
27192719

2720+
fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool {
2721+
use std::cmp::Ordering::{Equal, Less, Greater};
2722+
r.binary_search_by(|&(lo, hi)| {
2723+
if lo <= c && c <= hi {
2724+
Equal
2725+
} else if hi < c {
2726+
Less
2727+
} else {
2728+
Greater
2729+
}
2730+
})
2731+
.is_ok()
2732+
}
2733+
2734+
/// Return whether the given character is a combining mark (`General_Category=Mark`)
2735+
pub fn is_combining_mark(c: char) -> bool {
2736+
bsearch_range_table(c, general_category_mark)
2737+
}
2738+
2739+
const general_category_mark: &'static [(char, char)] = &[
2740+
('\u{300}', '\u{314}'), ('\u{315}', '\u{315}'), ('\u{316}', '\u{319}'), ('\u{31a}',
2741+
'\u{31a}'), ('\u{31b}', '\u{31b}'), ('\u{31c}', '\u{320}'), ('\u{321}', '\u{322}'),
2742+
('\u{323}', '\u{326}'), ('\u{327}', '\u{328}'), ('\u{329}', '\u{333}'), ('\u{334}',
2743+
'\u{338}'), ('\u{339}', '\u{33c}'), ('\u{33d}', '\u{344}'), ('\u{345}', '\u{345}'),
2744+
('\u{346}', '\u{346}'), ('\u{347}', '\u{349}'), ('\u{34a}', '\u{34c}'), ('\u{34d}',
2745+
'\u{34e}'), ('\u{350}', '\u{352}'), ('\u{353}', '\u{356}'), ('\u{357}', '\u{357}'),
2746+
('\u{358}', '\u{358}'), ('\u{359}', '\u{35a}'), ('\u{35b}', '\u{35b}'), ('\u{35c}',
2747+
'\u{35c}'), ('\u{35d}', '\u{35e}'), ('\u{35f}', '\u{35f}'), ('\u{360}', '\u{361}'),
2748+
('\u{362}', '\u{362}'), ('\u{363}', '\u{36f}'), ('\u{483}', '\u{487}'), ('\u{591}',
2749+
'\u{591}'), ('\u{592}', '\u{595}'), ('\u{596}', '\u{596}'), ('\u{597}', '\u{599}'),
2750+
('\u{59a}', '\u{59a}'), ('\u{59b}', '\u{59b}'), ('\u{59c}', '\u{5a1}'), ('\u{5a2}',
2751+
'\u{5a7}'), ('\u{5a8}', '\u{5a9}'), ('\u{5aa}', '\u{5aa}'), ('\u{5ab}', '\u{5ac}'),
2752+
('\u{5ad}', '\u{5ad}'), ('\u{5ae}', '\u{5ae}'), ('\u{5af}', '\u{5af}'), ('\u{5b0}',
2753+
'\u{5b0}'), ('\u{5b1}', '\u{5b1}'), ('\u{5b2}', '\u{5b2}'), ('\u{5b3}', '\u{5b3}'),
2754+
('\u{5b4}', '\u{5b4}'), ('\u{5b5}', '\u{5b5}'), ('\u{5b6}', '\u{5b6}'), ('\u{5b7}',
2755+
'\u{5b7}'), ('\u{5b8}', '\u{5b8}'), ('\u{5b9}', '\u{5ba}'), ('\u{5bb}', '\u{5bb}'),
2756+
('\u{5bc}', '\u{5bc}'), ('\u{5bd}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}',
2757+
'\u{5c1}'), ('\u{5c2}', '\u{5c2}'), ('\u{5c4}', '\u{5c4}'), ('\u{5c5}', '\u{5c5}'),
2758+
('\u{5c7}', '\u{5c7}'), ('\u{610}', '\u{617}'), ('\u{618}', '\u{618}'), ('\u{619}',
2759+
'\u{619}'), ('\u{61a}', '\u{61a}'), ('\u{64b}', '\u{64b}'), ('\u{64c}', '\u{64c}'),
2760+
('\u{64d}', '\u{64d}'), ('\u{64e}', '\u{64e}'), ('\u{64f}', '\u{64f}'), ('\u{650}',
2761+
'\u{650}'), ('\u{651}', '\u{651}'), ('\u{652}', '\u{652}'), ('\u{653}', '\u{654}'),
2762+
('\u{655}', '\u{656}'), ('\u{657}', '\u{65b}'), ('\u{65c}', '\u{65c}'), ('\u{65d}',
2763+
'\u{65e}'), ('\u{65f}', '\u{65f}'), ('\u{670}', '\u{670}'), ('\u{6d6}', '\u{6dc}'),
2764+
('\u{6df}', '\u{6e2}'), ('\u{6e3}', '\u{6e3}'), ('\u{6e4}', '\u{6e4}'), ('\u{6e7}',
2765+
'\u{6e8}'), ('\u{6ea}', '\u{6ea}'), ('\u{6eb}', '\u{6ec}'), ('\u{6ed}', '\u{6ed}'),
2766+
('\u{711}', '\u{711}'), ('\u{730}', '\u{730}'), ('\u{731}', '\u{731}'), ('\u{732}',
2767+
'\u{733}'), ('\u{734}', '\u{734}'), ('\u{735}', '\u{736}'), ('\u{737}', '\u{739}'),
2768+
('\u{73a}', '\u{73a}'), ('\u{73b}', '\u{73c}'), ('\u{73d}', '\u{73d}'), ('\u{73e}',
2769+
'\u{73e}'), ('\u{73f}', '\u{741}'), ('\u{742}', '\u{742}'), ('\u{743}', '\u{743}'),
2770+
('\u{744}', '\u{744}'), ('\u{745}', '\u{745}'), ('\u{746}', '\u{746}'), ('\u{747}',
2771+
'\u{747}'), ('\u{748}', '\u{748}'), ('\u{749}', '\u{74a}'), ('\u{7eb}', '\u{7f1}'),
2772+
('\u{7f2}', '\u{7f2}'), ('\u{7f3}', '\u{7f3}'), ('\u{816}', '\u{819}'), ('\u{81b}',
2773+
'\u{823}'), ('\u{825}', '\u{827}'), ('\u{829}', '\u{82d}'), ('\u{859}', '\u{85b}'),
2774+
('\u{8e3}', '\u{8e3}'), ('\u{8e4}', '\u{8e5}'), ('\u{8e6}', '\u{8e6}'), ('\u{8e7}',
2775+
'\u{8e8}'), ('\u{8e9}', '\u{8e9}'), ('\u{8ea}', '\u{8ec}'), ('\u{8ed}', '\u{8ef}'),
2776+
('\u{8f0}', '\u{8f0}'), ('\u{8f1}', '\u{8f1}'), ('\u{8f2}', '\u{8f2}'), ('\u{8f3}',
2777+
'\u{8f5}'), ('\u{8f6}', '\u{8f6}'), ('\u{8f7}', '\u{8f8}'), ('\u{8f9}', '\u{8fa}'),
2778+
('\u{8fb}', '\u{8ff}'), ('\u{93c}', '\u{93c}'), ('\u{94d}', '\u{94d}'), ('\u{951}',
2779+
'\u{951}'), ('\u{952}', '\u{952}'), ('\u{953}', '\u{954}'), ('\u{9bc}', '\u{9bc}'),
2780+
('\u{9cd}', '\u{9cd}'), ('\u{a3c}', '\u{a3c}'), ('\u{a4d}', '\u{a4d}'), ('\u{abc}',
2781+
'\u{abc}'), ('\u{acd}', '\u{acd}'), ('\u{b3c}', '\u{b3c}'), ('\u{b4d}', '\u{b4d}'),
2782+
('\u{bcd}', '\u{bcd}'), ('\u{c4d}', '\u{c4d}'), ('\u{c55}', '\u{c55}'), ('\u{c56}',
2783+
'\u{c56}'), ('\u{cbc}', '\u{cbc}'), ('\u{ccd}', '\u{ccd}'), ('\u{d4d}', '\u{d4d}'),
2784+
('\u{dca}', '\u{dca}'), ('\u{e38}', '\u{e39}'), ('\u{e3a}', '\u{e3a}'), ('\u{e48}',
2785+
'\u{e4b}'), ('\u{eb8}', '\u{eb9}'), ('\u{ec8}', '\u{ecb}'), ('\u{f18}', '\u{f19}'),
2786+
('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('\u{f71}',
2787+
'\u{f71}'), ('\u{f72}', '\u{f72}'), ('\u{f74}', '\u{f74}'), ('\u{f7a}', '\u{f7d}'),
2788+
('\u{f80}', '\u{f80}'), ('\u{f82}', '\u{f83}'), ('\u{f84}', '\u{f84}'), ('\u{f86}',
2789+
'\u{f87}'), ('\u{fc6}', '\u{fc6}'), ('\u{1037}', '\u{1037}'), ('\u{1039}', '\u{103a}'),
2790+
('\u{108d}', '\u{108d}'), ('\u{135d}', '\u{135f}'), ('\u{1714}', '\u{1714}'), ('\u{1734}',
2791+
'\u{1734}'), ('\u{17d2}', '\u{17d2}'), ('\u{17dd}', '\u{17dd}'), ('\u{18a9}', '\u{18a9}'),
2792+
('\u{1939}', '\u{1939}'), ('\u{193a}', '\u{193a}'), ('\u{193b}', '\u{193b}'), ('\u{1a17}',
2793+
'\u{1a17}'), ('\u{1a18}', '\u{1a18}'), ('\u{1a60}', '\u{1a60}'), ('\u{1a75}', '\u{1a7c}'),
2794+
('\u{1a7f}', '\u{1a7f}'), ('\u{1ab0}', '\u{1ab4}'), ('\u{1ab5}', '\u{1aba}'), ('\u{1abb}',
2795+
'\u{1abc}'), ('\u{1abd}', '\u{1abd}'), ('\u{1b34}', '\u{1b34}'), ('\u{1b44}', '\u{1b44}'),
2796+
('\u{1b6b}', '\u{1b6b}'), ('\u{1b6c}', '\u{1b6c}'), ('\u{1b6d}', '\u{1b73}'), ('\u{1baa}',
2797+
'\u{1bab}'), ('\u{1be6}', '\u{1be6}'), ('\u{1bf2}', '\u{1bf3}'), ('\u{1c37}', '\u{1c37}'),
2798+
('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1cd4}'), ('\u{1cd5}', '\u{1cd9}'), ('\u{1cda}',
2799+
'\u{1cdb}'), ('\u{1cdc}', '\u{1cdf}'), ('\u{1ce0}', '\u{1ce0}'), ('\u{1ce2}', '\u{1ce8}'),
2800+
('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('\u{1cf8}', '\u{1cf9}'), ('\u{1dc0}',
2801+
'\u{1dc1}'), ('\u{1dc2}', '\u{1dc2}'), ('\u{1dc3}', '\u{1dc9}'), ('\u{1dca}', '\u{1dca}'),
2802+
('\u{1dcb}', '\u{1dcc}'), ('\u{1dcd}', '\u{1dcd}'), ('\u{1dce}', '\u{1dce}'), ('\u{1dcf}',
2803+
'\u{1dcf}'), ('\u{1dd0}', '\u{1dd0}'), ('\u{1dd1}', '\u{1df5}'), ('\u{1dfc}', '\u{1dfc}'),
2804+
('\u{1dfd}', '\u{1dfd}'), ('\u{1dfe}', '\u{1dfe}'), ('\u{1dff}', '\u{1dff}'), ('\u{20d0}',
2805+
'\u{20d1}'), ('\u{20d2}', '\u{20d3}'), ('\u{20d4}', '\u{20d7}'), ('\u{20d8}', '\u{20da}'),
2806+
('\u{20db}', '\u{20dc}'), ('\u{20e1}', '\u{20e1}'), ('\u{20e5}', '\u{20e6}'), ('\u{20e7}',
2807+
'\u{20e7}'), ('\u{20e8}', '\u{20e8}'), ('\u{20e9}', '\u{20e9}'), ('\u{20ea}', '\u{20eb}'),
2808+
('\u{20ec}', '\u{20ef}'), ('\u{20f0}', '\u{20f0}'), ('\u{2cef}', '\u{2cf1}'), ('\u{2d7f}',
2809+
'\u{2d7f}'), ('\u{2de0}', '\u{2dff}'), ('\u{302a}', '\u{302a}'), ('\u{302b}', '\u{302b}'),
2810+
('\u{302c}', '\u{302c}'), ('\u{302d}', '\u{302d}'), ('\u{302e}', '\u{302f}'), ('\u{3099}',
2811+
'\u{309a}'), ('\u{a66f}', '\u{a66f}'), ('\u{a674}', '\u{a67d}'), ('\u{a69e}', '\u{a69f}'),
2812+
('\u{a6f0}', '\u{a6f1}'), ('\u{a806}', '\u{a806}'), ('\u{a8c4}', '\u{a8c4}'), ('\u{a8e0}',
2813+
'\u{a8f1}'), ('\u{a92b}', '\u{a92d}'), ('\u{a953}', '\u{a953}'), ('\u{a9b3}', '\u{a9b3}'),
2814+
('\u{a9c0}', '\u{a9c0}'), ('\u{aab0}', '\u{aab0}'), ('\u{aab2}', '\u{aab3}'), ('\u{aab4}',
2815+
'\u{aab4}'), ('\u{aab7}', '\u{aab8}'), ('\u{aabe}', '\u{aabf}'), ('\u{aac1}', '\u{aac1}'),
2816+
('\u{aaf6}', '\u{aaf6}'), ('\u{abed}', '\u{abed}'), ('\u{fb1e}', '\u{fb1e}'), ('\u{fe20}',
2817+
'\u{fe26}'), ('\u{fe27}', '\u{fe2d}'), ('\u{fe2e}', '\u{fe2f}'), ('\u{101fd}', '\u{101fd}'),
2818+
('\u{102e0}', '\u{102e0}'), ('\u{10376}', '\u{1037a}'), ('\u{10a0d}', '\u{10a0d}'),
2819+
('\u{10a0f}', '\u{10a0f}'), ('\u{10a38}', '\u{10a38}'), ('\u{10a39}', '\u{10a39}'),
2820+
('\u{10a3a}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{10ae5}', '\u{10ae5}'),
2821+
('\u{10ae6}', '\u{10ae6}'), ('\u{11046}', '\u{11046}'), ('\u{1107f}', '\u{1107f}'),
2822+
('\u{110b9}', '\u{110b9}'), ('\u{110ba}', '\u{110ba}'), ('\u{11100}', '\u{11102}'),
2823+
('\u{11133}', '\u{11134}'), ('\u{11173}', '\u{11173}'), ('\u{111c0}', '\u{111c0}'),
2824+
('\u{111ca}', '\u{111ca}'), ('\u{11235}', '\u{11235}'), ('\u{11236}', '\u{11236}'),
2825+
('\u{112e9}', '\u{112e9}'), ('\u{112ea}', '\u{112ea}'), ('\u{1133c}', '\u{1133c}'),
2826+
('\u{1134d}', '\u{1134d}'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'),
2827+
('\u{114c2}', '\u{114c2}'), ('\u{114c3}', '\u{114c3}'), ('\u{115bf}', '\u{115bf}'),
2828+
('\u{115c0}', '\u{115c0}'), ('\u{1163f}', '\u{1163f}'), ('\u{116b6}', '\u{116b6}'),
2829+
('\u{116b7}', '\u{116b7}'), ('\u{1172b}', '\u{1172b}'), ('\u{16af0}', '\u{16af4}'),
2830+
('\u{16b30}', '\u{16b36}'), ('\u{1bc9e}', '\u{1bc9e}'), ('\u{1d165}', '\u{1d166}'),
2831+
('\u{1d167}', '\u{1d169}'), ('\u{1d16d}', '\u{1d16d}'), ('\u{1d16e}', '\u{1d172}'),
2832+
('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d189}'), ('\u{1d18a}', '\u{1d18b}'),
2833+
('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1e8d0}', '\u{1e8d6}')
2834+
];
2835+
2836+
27202837
}
27212838

0 commit comments

Comments
 (0)
0