@@ -72,9 +72,9 @@ def __init__(self):
72
72
self .canon_comp = self ._compute_canonical_comp ()
73
73
self .canon_fully_decomp , self .compat_fully_decomp = self ._compute_fully_decomposed ()
74
74
75
- self .svar_decomp = {}
76
- self .svar_fully_decomp = {}
77
- self ._load_standardized_variants ()
75
+ self .cjk_compat_variants_decomp = {}
76
+ self .cjk_compat_variants_fully_decomp = {}
77
+ self ._load_cjk_compat_ideograph_variants ()
78
78
79
79
def stats (name , table ):
80
80
count = sum (len (v ) for v in table .values ())
@@ -83,10 +83,10 @@ def stats(name, table):
83
83
print ("Decomposition table stats:" )
84
84
stats ("Canonical decomp" , self .canon_decomp )
85
85
stats ("Compatible decomp" , self .compat_decomp )
86
- stats ("Standardized Variants" , self .svar_decomp )
86
+ stats ("CJK Compat Variants" , self .cjk_compat_variants_decomp )
87
87
stats ("Canonical fully decomp" , self .canon_fully_decomp )
88
88
stats ("Compatible fully decomp" , self .compat_fully_decomp )
89
- stats ("Standardized Variants" , self .svar_fully_decomp )
89
+ stats ("CJK Compat Variants" , self .cjk_compat_variants_fully_decomp )
90
90
91
91
self .ss_leading , self .ss_trailing = self ._compute_stream_safe_tables ()
92
92
@@ -122,38 +122,41 @@ def _load_unicode_data(self):
122
122
if category == 'M' or 'M' in expanded_categories .get (category , []):
123
123
self .general_category_mark .append (char_int )
124
124
125
- def _load_standardized_variants (self ):
125
+ def _load_cjk_compat_ideograph_variants (self ):
126
126
for line in self ._fetch ("StandardizedVariants.txt" ).splitlines ():
127
127
strip_comments = line .split ('#' , 1 )[0 ].strip ()
128
128
if not strip_comments :
129
129
continue
130
130
131
- pieces = strip_comments .split (';' )
132
- assert len (pieces ) == 3
133
-
134
- variation_sequence , description , differences = pieces [0 ], pieces [1 ].strip (), pieces [2 ]
131
+ variation_sequence , description , differences = strip_comments .split (';' )
132
+ description = description .strip ()
135
133
136
134
# Don't use variations that only apply in particular shaping environments.
137
135
if differences :
138
136
continue
139
137
140
138
# Look for entries where the description field is a codepoint name.
141
- if description in self .name_to_char_int :
142
- char_int = self .name_to_char_int [description ]
143
-
144
- assert not char_int in self .combining_classes , "Unexpected: standardized variant with a combining class"
145
- assert not char_int in self .compat_decomp , "Unexpected: standardized variant and compatibility decomposition"
146
- assert len (self .canon_decomp [char_int ]) == 1 , "Unexpected: standardized variant and non-singleton canonical decomposition"
147
- # If we ever need to handle Hangul here, we'll need to handle it separately.
148
- assert not (S_BASE <= char_int < S_BASE + S_COUNT )
149
-
150
- standardized_variant_parts = [int (c , 16 ) for c in variation_sequence .split ()]
151
- for c in standardized_variant_parts :
152
- #assert not never_composes(c) TODO: Re-enable this once #67 lands.
153
- assert not c in self .canon_decomp , "Unexpected: standardized variant is unnormalized (canon)"
154
- assert not c in self .compat_decomp , "Unexpected: standardized variant is unnormalized (compat)"
155
- self .svar_decomp [char_int ] = standardized_variant_parts
156
- self .svar_fully_decomp [char_int ] = standardized_variant_parts
139
+ if description not in self .name_to_char_int :
140
+ continue
141
+
142
+ # Only consider the CJK Compatibility Ideographs.
143
+ if not description .startswith ('CJK COMPATIBILITY IDEOGRAPH-' ):
144
+ continue
145
+
146
+ char_int = self .name_to_char_int [description ]
147
+
148
+ assert not char_int in self .combining_classes , "Unexpected: CJK compat variant with a combining class"
149
+ assert not char_int in self .compat_decomp , "Unexpected: CJK compat variant and compatibility decomposition"
150
+ assert len (self .canon_decomp [char_int ]) == 1 , "Unexpected: CJK compat variant and non-singleton canonical decomposition"
151
+ # If we ever need to handle Hangul here, we'll need to handle it separately.
152
+ assert not (S_BASE <= char_int < S_BASE + S_COUNT )
153
+
154
+ cjk_compat_variant_parts = [int (c , 16 ) for c in variation_sequence .split ()]
155
+ for c in cjk_compat_variant_parts :
156
+ assert not c in self .canon_decomp , "Unexpected: CJK compat variant is unnormalized (canon)"
157
+ assert not c in self .compat_decomp , "Unexpected: CJK compat variant is unnormalized (compat)"
158
+ self .cjk_compat_variants_decomp [char_int ] = cjk_compat_variant_parts
159
+ self .cjk_compat_variants_fully_decomp [char_int ] = cjk_compat_variant_parts
157
160
158
161
def _load_norm_props (self ):
159
162
props = collections .defaultdict (list )
@@ -364,8 +367,8 @@ def gen_composition_table(canon_comp, out):
364
367
out .write (" }\n " )
365
368
out .write ("}\n " )
366
369
367
- def gen_decomposition_tables (canon_decomp , compat_decomp , svar_decomp , out ):
368
- tables = [(canon_decomp , 'canonical' ), (compat_decomp , 'compatibility' ), (svar_decomp , 'svar ' )]
370
+ def gen_decomposition_tables (canon_decomp , compat_decomp , cjk_compat_variants_decomp , out ):
371
+ tables = [(canon_decomp , 'canonical' ), (compat_decomp , 'compatibility' ), (cjk_compat_variants_decomp , 'cjk_compat_variants ' )]
369
372
for table , name in tables :
370
373
gen_mph_data (name + '_decomposed' , table , "(u32, &'static [char])" ,
371
374
lambda k : "(0x{:x}, &[{}])" .format (k ,
@@ -535,7 +538,7 @@ def minimal_perfect_hash(d):
535
538
gen_composition_table (data .canon_comp , out )
536
539
out .write ("\n " )
537
540
538
- gen_decomposition_tables (data .canon_fully_decomp , data .compat_fully_decomp , data .svar_fully_decomp , out )
541
+ gen_decomposition_tables (data .canon_fully_decomp , data .compat_fully_decomp , data .cjk_compat_variants_fully_decomp , out )
539
542
540
543
gen_combining_mark (data .general_category_mark , out )
541
544
out .write ("\n " )
0 commit comments