diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 57492f1..c28d0e6 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -11,7 +11,7 @@ env:
   CARGO_TERM_COLOR: always
   RUST_BACKTRACE: 1
   RUSTFLAGS: -D warnings
-  RUSTDOCFLAGS: -D warnings --cfg docsrs
+  RUSTDOCFLAGS: -D warnings
 
 jobs:
   build:
@@ -43,6 +43,8 @@ jobs:
         run: cd $(find target/package/ -maxdepth 1 -mindepth 1 -type d) && cargo test --no-default-features
       - name: Build docs
         if: matrix.rust == 'nightly'
+        env:
+          RUSTDOCFLAGS: -D warnings --cfg docsrs
         run: cargo doc --all-features --verbose
       - name: Check formatting
         if: matrix.rust == 'stable'
diff --git a/Cargo.toml b/Cargo.toml
index 3545601..bf1a0ae 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -40,4 +40,9 @@ features = ["alloc"]
 
 [features]
 default = ["std"]
+ks_x_1026-1 = []
 std = []
+
+[package.metadata.docs.rs]
+# Enable the optional API through Cargo so rustdoc itself sees it (`rustc-args` only sets RUSTFLAGS).
+features = ["ks_x_1026-1"]
diff --git a/README.md b/README.md
index 7d10e4d..5e169ed 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,8 @@ fn main() {
 
 ## crates.io
 
-You can use this package in your project by adding the following
-to your `Cargo.toml`:
+You can use this package in your project by adding the following to your
+`Cargo.toml`:
 
 ```toml
 [dependencies]
@@ -36,4 +36,15 @@ unicode-normalization = "0.1.23"
 
 ## `no_std` + `alloc` support
 
-This crate is completely `no_std` + `alloc` compatible. This can be enabled by disabling the `std` feature, i.e. specifying `default-features = false` for this crate on your `Cargo.toml`.
+This crate is completely `no_std` + `alloc` compatible. This can be enabled by
+disabling the `std` feature, i.e. specifying `default-features = false` for this
+crate on your `Cargo.toml`.
+ +## KS X 1026-1 + +Korean Standard KS X 1026-1 ([Korean](https://standard.go.kr/KSCI/standardIntro/getStandardSearchView.do?ksNo=KSX1026-1), +[English](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf)) is an ROK government +standard that corrects some defects and makes some changes to the Unicode NFC, +NFKC, and NFKD normalization forms for certain Korean characters. The +`ks_x_1026-1` crate feature (disabled by default) adds methods to support these +alternate normalizations. diff --git a/src/ks_x_1026_1.rs b/src/ks_x_1026_1.rs new file mode 100644 index 0000000..dad327e --- /dev/null +++ b/src/ks_x_1026_1.rs @@ -0,0 +1,233 @@ +//! Annex B + +use core::{ + convert::{TryFrom, TryInto}, + iter::FusedIterator, +}; + +use tinyvec::ArrayVec; + +// § B.1.1 + +use crate::normalize::hangul_constants::{ + L_BASE, L_LAST, N_COUNT, S_BASE, S_COUNT, T_BASE, T_COUNT, T_LAST, V_BASE, V_LAST, +}; + +// § B.1.2 + +fn is_old_jongseong(t: char) -> bool { + match t { + '\u{11C3}'..='\u{11FF}' | '\u{D7CB}'..='\u{D7FB}' => true, + _ => false, + } +} + +/// Iterator that decomposes modern Hangul LV syllables immediately followed by old Hangul T jamo +/// into a 3-character L V T sequences, as specified in KS X 1026-1 annex B.1.5. +#[derive(Clone, Debug)] +pub struct RecomposeHangul { + /// Medial vowel of a decomposed LV syllable + v: Option, + /// Character yielded by inner iterator in last call to its `next()` + last: Option, + inner: I, +} + +impl> Iterator for RecomposeHangul { + type Item = char; + + fn next(&mut self) -> Option { + if let Some(v) = self.v { + // If an LV syllable was decomposed in the last call to `next`, + // yield its medial vowel. 
+ self.v = None; + Some(v) + } else { + let prev = self.last; + self.last = self.inner.next(); + + if let (Some(prev), Some(next)) = (prev, self.last) { + let s_index = u32::from(prev).wrapping_sub(S_BASE); + if s_index < S_COUNT && s_index % T_COUNT == 0 && is_old_jongseong(next) { + // We have an LV syllable followed by an old jongseong, decompose into L V + let l: char = (L_BASE + s_index / N_COUNT).try_into().unwrap(); + self.v = Some((V_BASE + (s_index % N_COUNT) / T_COUNT).try_into().unwrap()); + return Some(l); + } + } + + prev + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (inner_lo, inner_hi) = self.inner.size_hint(); + let add_factor: usize = self.v.map_or(0, |_| 1) + self.last.map_or(0, |_| 1); + ( + inner_lo.saturating_add(add_factor), + inner_hi + .and_then(|h| h.checked_mul(2)) + .and_then(|h| h.checked_add(add_factor)), + ) + } +} + +impl + FusedIterator> FusedIterator for RecomposeHangul {} + +impl> RecomposeHangul { + #[inline] + pub(crate) fn new(mut iter: I) -> Self { + RecomposeHangul { + v: None, + last: iter.next(), + inner: iter, + } + } +} + +// B.2.1 + +static CP_JAMO: [char; 94] = [ + '\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}', '\u{1104}', + '\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}', '\u{111A}', + '\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}', '\u{110C}', + '\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{1161}', '\u{1162}', + '\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}', + '\u{116B}', '\u{116C}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}', + '\u{1173}', '\u{1174}', '\u{1175}', '\u{1160}', '\u{1114}', '\u{1115}', '\u{11C7}', '\u{11C8}', + '\u{11CC}', '\u{11CE}', '\u{11D3}', '\u{11D7}', '\u{11D9}', '\u{111C}', '\u{11DD}', '\u{11DF}', + '\u{111D}', '\u{111E}', '\u{1120}', '\u{1122}', '\u{1123}', 
'\u{1127}', '\u{1129}', '\u{112B}', + '\u{112C}', '\u{112D}', '\u{112E}', '\u{112F}', '\u{1132}', '\u{1136}', '\u{1140}', '\u{1147}', + '\u{114C}', '\u{11F1}', '\u{11F2}', '\u{1157}', '\u{1158}', '\u{1159}', '\u{1184}', '\u{1185}', + '\u{1188}', '\u{1191}', '\u{1192}', '\u{1194}', '\u{119E}', '\u{11A1}', +]; + +// § B.2.2 + +static HW_JAMO: [char; 64] = [ + '\u{1160}', '\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}', + '\u{1104}', '\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}', + '\u{111A}', '\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}', + '\u{110C}', '\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{FFBF}', + '\u{FFC0}', '\u{FFC1}', '\u{1161}', '\u{1162}', '\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}', + '\u{FFC8}', '\u{FFC9}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}', '\u{116B}', '\u{116C}', + '\u{FFD0}', '\u{FFD1}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}', + '\u{FFD8}', '\u{FFD9}', '\u{1173}', '\u{1174}', '\u{1175}', '\u{FFDD}', '\u{FFDE}', '\u{FFDF}', +]; + +// § B.2.3 + +static PC_JAMO: [char; 14] = [ + '\u{1100}', '\u{1102}', '\u{1103}', '\u{1105}', '\u{1106}', '\u{1107}', '\u{1109}', '\u{110B}', + '\u{110C}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', +]; + +// § B.2.4 + +/// Iterator that decomposes compatibility characters containing Hangul jamo +/// in a manner that avoids introducing new nonstandard jamo sequences, +/// as specified in KS X 1026-1 annex B.2.4. +#[derive(Clone, Debug)] +pub struct NormalizeJamoKdkc { + inner: I, + // Buffer for when a character normalizes into multiple. + // Characters are pushed to and popped from the end. + // Length 3 is sufficient, as the longest possible expansion + // is for a parenthesized choseong like U+3200, + // which expands into ['(', , '\u{1160}', ')'] (length 4). + // (There are no parenthesized jungseong or jongseong.) 
+    buf: ArrayVec<[char; 3]>,
+}
+
+impl<I: Iterator<Item = char>> Iterator for NormalizeJamoKdkc<I> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(c) = self.buf.pop() {
+            // Empty buffer before yielding from underlying iterator.
+            Some(c)
+        } else {
+            let ch = self.inner.next()?;
+            // Whether ch is a parenthesized Hangul letter
+            let mut pf = false;
+
+            let uch: u32 = ch.into();
+            let base_jamo: char = match uch {
+                // Hangul compatibility letter
+                0x3131..=0x318E => CP_JAMO[usize::try_from(uch - 0x3131).unwrap()],
+
+                // Parenthesized Hangul letter
+                0x3200..=0x320D => {
+                    pf = true;
+                    self.buf.push(')'); // pushed first so it is popped (yielded) last
+                    PC_JAMO[usize::try_from(uch - 0x3200).unwrap()]
+                }
+
+                // Circled Hangul letter
+                0x3260..=0x326D => PC_JAMO[usize::try_from(uch - 0x3260).unwrap()],
+
+                // Halfwidth Hangul letter
+                0xFFA0..=0xFFDF => HW_JAMO[usize::try_from(uch - 0xFFA0).unwrap()],
+
+                _ => return Some(ch),
+            };
+
+            // Insert fillers; the tables also yield old jamo, so match the full conjoining ranges
+            let first_ret: char = match base_jamo.into() {
+                // `base_jamo` is choseong (modern or old), yield a jungseong filler after
+                L_BASE..=L_LAST | 0x1113..=0x115E => {
+                    self.buf.push('\u{1160}');
+                    base_jamo
+                }
+
+                // `base_jamo` is jungseong (modern or old), yield a choseong filler before
+                V_BASE..=V_LAST | 0x1176..=0x11A6 => {
+                    self.buf.push(base_jamo);
+                    '\u{115F}'
+                }
+
+                // `base_jamo` is jongseong (modern or old), yield both fillers before
+                T_BASE..=T_LAST | 0x11C3..=0x11FF => {
+                    self.buf.push(base_jamo);
+                    self.buf.push('\u{1160}');
+                    '\u{115F}'
+                }
+
+                _ => base_jamo, // U+1160 or a self-mapped unassigned halfwidth slot: as in plain NFKD
+            };
+
+            if pf {
+                // Parenthesized Hangul letter, yield open paren before
+                self.buf.push(first_ret);
+                Some('(')
+            } else {
+                Some(first_ret)
+            }
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (inner_lo, inner_hi) = self.inner.size_hint();
+        let add_factor: usize = self.buf.len();
+        (
+            inner_lo.saturating_add(add_factor),
+            inner_hi
+                .and_then(|h| h.checked_mul(4)) // Why 4?
See comment on `buf` field + .and_then(|h| h.checked_add(add_factor)), + ) + } +} + +impl + FusedIterator> FusedIterator for NormalizeJamoKdkc {} + +impl> NormalizeJamoKdkc { + #[inline] + pub(crate) fn new(iter: I) -> Self { + NormalizeJamoKdkc { + inner: iter, + buf: ArrayVec::new(), + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 8cf4c4a..9a6f293 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,23 +36,36 @@ //! [dependencies] //! unicode-normalization = "0.1.20" //! ``` +//! +//! # KS X 1026-1 +//! +//! Korean Standard KS X 1026-1 ([Korean](https://standard.go.kr/KSCI/standardIntro/getStandardSearchView.do?ksNo=KSX1026-1), +//! [English](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf)) is an ROK government +//! standard that corrects some defects and makes some changes to the Unicode NFC, +//! NFKC, and NFKD normalization forms for certain Korean characters. The +//! `ks_x_1026-1` crate feature (disabled by default) adds methods to support these +//! alternate normalizations. #![deny(missing_docs, unsafe_code)] #![doc( html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" )] +#![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr(not(feature = "std"), no_std)] #[cfg(not(feature = "std"))] extern crate alloc; -#[cfg(feature = "std")] -extern crate core; - extern crate tinyvec; pub use crate::decompose::Decompositions; +#[cfg(feature = "ks_x_1026-1")] +#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] +pub use crate::ks_x_1026_1::NormalizeJamoKdkc; +#[cfg(feature = "ks_x_1026-1")] +#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] +pub use crate::ks_x_1026_1::RecomposeHangul; pub use crate::quick_check::{ is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick, is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick, @@ -60,17 +73,24 @@ pub use crate::quick_check::{ }; pub use 
crate::recompose::Recompositions; pub use crate::replace::Replacements; +pub use crate::standardize_korean_syllables::StandardizeKoreanSyllables; +#[cfg(feature = "ks_x_1026-1")] +#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] +pub use crate::standardize_korean_syllables::StandardizeKoreanSyllablesKsX1026_1; pub use crate::stream_safe::StreamSafe; pub use crate::tables::UNICODE_VERSION; use core::{option, str::Chars}; mod decompose; +#[cfg(feature = "ks_x_1026-1")] +mod ks_x_1026_1; mod lookups; mod normalize; mod perfect_hash; mod quick_check; mod recompose; mod replace; +mod standardize_korean_syllables; mod stream_safe; #[rustfmt::skip] @@ -99,19 +119,19 @@ pub mod char { /// as described in /// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/). pub trait UnicodeNormalization> { - /// Returns an iterator over the string in Unicode Normalization Form D + /// An iterator over the string in Unicode Normalization Form D /// (canonical decomposition). fn nfd(self) -> Decompositions; - /// Returns an iterator over the string in Unicode Normalization Form KD + /// An iterator over the string in Unicode Normalization Form KD /// (compatibility decomposition). fn nfkd(self) -> Decompositions; - /// An Iterator over the string in Unicode Normalization Form C + /// An iterator over the string in Unicode Normalization Form C /// (canonical decomposition followed by canonical composition). fn nfc(self) -> Recompositions; - /// An Iterator over the string in Unicode Normalization Form KC + /// An iterator over the string in Unicode Normalization Form KC /// (compatibility decomposition followed by canonical composition). fn nfkc(self) -> Recompositions; @@ -127,9 +147,51 @@ pub trait UnicodeNormalization> { /// implementations the option to recognize them. 
fn cjk_compat_variants(self) -> Replacements; - /// An Iterator over the string with Conjoining Grapheme Joiner characters - /// inserted according to the Stream-Safe Text Process (UAX15-D4) + /// An iterator over the string with Conjoining Grapheme Joiner characters + /// inserted according to the Stream-Safe Text Process ([UAX15-D4](https://unicode.org/reports/tr15/#UAX15-D4)) fn stream_safe(self) -> StreamSafe; + + /// An iterator over the string with Hangul choseong and jungseong filler characters inserted + /// to ensure that all Korean syllable blocks are in standard form according to [UAX29](https://www.unicode.org/reports/tr29/#Transforming_Into_SKS). + fn standard_korean_syllables(self) -> StandardizeKoreanSyllables; + + /// An iterator over the string in the variant of Unicode Normalization Form KD + /// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode + /// in that it will not produce nonstandard Korean jamo sequences if none were present in the input. + /// (Any string that is in KS X 1026-1 modified NFKD is also in standard Unicode NFKD, + /// but the reverse may not hold.) + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + + fn nfkd_ks_x_1026_1(self) -> Decompositions>; + + /// An iterator over the string in the variant of Unicode Normalization Form C + /// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode + /// in that it will not contain any precomposed LV Hangul syllables immediately followed by conjoining T jamo. + /// (A string that is in KS X 1026-1 modified NFC might not be in standard Unicode NFC, + /// and vice versa.) + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + + fn nfc_ks_x_1026_1(self) -> RecomposeHangul>; + + /// An iterator over the string in the variant of Unicode Normalization Form KC + /// defined by Korean Standard X 1026-1. 
This normalization differs from that defined by Unicode + /// in that it will not produce nonstandard Korean jamo sequences if none were present in the input, + /// and it will also not contain any precomposed LV Hangul syllables immediately followed + /// by conjoining T jamo. + /// (A string that is in KS X 1026-1 modified NFKC might not be in standard Unicode NFKC, + /// and vice versa.) + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + + fn nfkc_ks_x_1026_1(self) -> RecomposeHangul>>; + + /// An iterator over the string with Hangul choseong and jungseong filler characters inserted + /// to ensure that all Korean syllable blocks are in standard form according to KS X 1026-1 § 7.8. + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + fn standard_korean_syllables_ks_x_1026_1(self) -> StandardizeKoreanSyllablesKsX1026_1; } impl<'a> UnicodeNormalization> for &'a str { @@ -162,6 +224,42 @@ impl<'a> UnicodeNormalization> for &'a str { fn stream_safe(self) -> StreamSafe> { StreamSafe::new(self.chars()) } + + #[inline] + fn standard_korean_syllables(self) -> StandardizeKoreanSyllables> { + StandardizeKoreanSyllables::new(self.chars()) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkd_ks_x_1026_1(self) -> Decompositions>> { + decompose::new_compatible(NormalizeJamoKdkc::new(self.chars())) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfc_ks_x_1026_1(self) -> RecomposeHangul>> { + RecomposeHangul::new(self.nfc()) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkc_ks_x_1026_1(self) -> RecomposeHangul>>> { + RecomposeHangul::new(recompose::new_compatible(NormalizeJamoKdkc::new( + self.chars(), + ))) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = 
"ks_x_1026-1")))] + fn standard_korean_syllables_ks_x_1026_1( + self, + ) -> StandardizeKoreanSyllablesKsX1026_1> { + StandardizeKoreanSyllablesKsX1026_1::new(self.chars()) + } } impl UnicodeNormalization> for char { @@ -194,6 +292,44 @@ impl UnicodeNormalization> for char { fn stream_safe(self) -> StreamSafe> { StreamSafe::new(Some(self).into_iter()) } + + #[inline] + fn standard_korean_syllables(self) -> StandardizeKoreanSyllables> { + StandardizeKoreanSyllables::new(Some(self).into_iter()) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkd_ks_x_1026_1(self) -> Decompositions>> { + decompose::new_compatible(NormalizeJamoKdkc::new(Some(self).into_iter())) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfc_ks_x_1026_1(self) -> RecomposeHangul>> { + RecomposeHangul::new(self.nfc()) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkc_ks_x_1026_1( + self, + ) -> RecomposeHangul>>> { + RecomposeHangul::new(recompose::new_compatible(NormalizeJamoKdkc::new( + Some(self).into_iter(), + ))) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + fn standard_korean_syllables_ks_x_1026_1( + self, + ) -> StandardizeKoreanSyllablesKsX1026_1> { + StandardizeKoreanSyllablesKsX1026_1::new(Some(self).into_iter()) + } } impl> UnicodeNormalization for I { @@ -226,4 +362,36 @@ impl> UnicodeNormalization for I { fn stream_safe(self) -> StreamSafe { StreamSafe::new(self) } + + #[inline] + fn standard_korean_syllables(self) -> StandardizeKoreanSyllables { + StandardizeKoreanSyllables::new(self) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkd_ks_x_1026_1(self) -> Decompositions> { + decompose::new_compatible(NormalizeJamoKdkc::new(self)) + } + + #[cfg(feature = "ks_x_1026-1")] 
+ #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfc_ks_x_1026_1(self) -> RecomposeHangul> { + RecomposeHangul::new(self.nfc()) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + #[inline] + fn nfkc_ks_x_1026_1(self) -> RecomposeHangul>> { + RecomposeHangul::new(recompose::new_compatible(NormalizeJamoKdkc::new(self))) + } + + #[cfg(feature = "ks_x_1026-1")] + #[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] + fn standard_korean_syllables_ks_x_1026_1(self) -> StandardizeKoreanSyllablesKsX1026_1 { + StandardizeKoreanSyllablesKsX1026_1::new(self) + } } diff --git a/src/normalize.rs b/src/normalize.rs index 3d64a12..c6134a9 100644 --- a/src/normalize.rs +++ b/src/normalize.rs @@ -106,22 +106,27 @@ pub fn compose(a: char, b: char) -> Option { compose_hangul(a, b).or_else(|| composition_table(a, b)) } -// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior -// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior -const S_BASE: u32 = 0xAC00; -const L_BASE: u32 = 0x1100; -const V_BASE: u32 = 0x1161; -const T_BASE: u32 = 0x11A7; -const L_COUNT: u32 = 19; -const V_COUNT: u32 = 21; -const T_COUNT: u32 = 28; -const N_COUNT: u32 = V_COUNT * T_COUNT; -const S_COUNT: u32 = L_COUNT * N_COUNT; - -const S_LAST: u32 = S_BASE + S_COUNT - 1; -const L_LAST: u32 = L_BASE + L_COUNT - 1; -const V_LAST: u32 = V_BASE + V_COUNT - 1; -const T_LAST: u32 = T_BASE + T_COUNT - 1; +/// Constants from Unicode 15.0.0 Section 3.12 Conjoining Jamo Behavior +/// +/// (also found in KS X 1026-1 annex B.1.1 ). 
+pub mod hangul_constants { + pub const S_BASE: u32 = 0xAC00; + pub const L_BASE: u32 = 0x1100; + pub const V_BASE: u32 = 0x1161; + pub const T_BASE: u32 = 0x11A7; + pub const L_COUNT: u32 = 19; + pub const V_COUNT: u32 = 21; + pub const T_COUNT: u32 = 28; + pub const N_COUNT: u32 = V_COUNT * T_COUNT; + pub const S_COUNT: u32 = L_COUNT * N_COUNT; + + pub const S_LAST: u32 = S_BASE + S_COUNT - 1; + pub const L_LAST: u32 = L_BASE + L_COUNT - 1; + pub const V_LAST: u32 = V_BASE + V_COUNT - 1; + pub const T_LAST: u32 = T_BASE + T_COUNT - 1; +} + +use hangul_constants::*; // Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`, // i.e. `T_BASE + 1 ... T_LAST`. diff --git a/src/standardize_korean_syllables.rs b/src/standardize_korean_syllables.rs new file mode 100644 index 0000000..27540ce --- /dev/null +++ b/src/standardize_korean_syllables.rs @@ -0,0 +1,265 @@ +use core::{iter::FusedIterator, marker::PhantomData}; + +use tinyvec::ArrayVec; + +use crate::normalize::hangul_constants::{N_COUNT, S_BASE, T_COUNT}; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum JamoKind { + L, + V, + T, +} + +impl JamoKind { + fn of(c: char) -> (Option, Option) { + match c { + // L + '\u{1100}'..='\u{115F}' | '\u{A960}'..='\u{A97C}' => { + (Some(JamoKind::L), Some(JamoKind::L)) + } + // V + '\u{1160}'..='\u{11A7}' | '\u{D7B0}'..='\u{D7C6}' => { + (Some(JamoKind::V), Some(JamoKind::V)) + } + // T + '\u{11A8}'..='\u{11FF}' | '\u{D7CB}'..='\u{D7FB}' => { + (Some(JamoKind::T), Some(JamoKind::T)) + } + // LV or LVT + '\u{AC00}'..='\u{D7A3}' => ( + Some(JamoKind::L), + Some(if ((u32::from(c) - S_BASE) % N_COUNT) % T_COUNT == 0 { + // LV + JamoKind::V + } else { + // LVT + JamoKind::T + }), + ), + _ => (None, None), + } + } +} + +trait NormalizeKoreanSyllables { + fn insert_fillers( + next_c: Option, + prev_end_jamo_kind: Option, + next_start_jamo_kind: Option, + buf: &mut ArrayVec<[Option; 3]>, + ) -> Option; +} + +// Used to abstract over UAX29 and KS X 1026-1 rules +#[derive(Clone, 
Debug)]
+struct StandardizeKoreanSyllablesInner<I, N> {
+    prev_end_jamo_kind: Option<JamoKind>, // kind of jamo that ended the previous input character
+    buf: ArrayVec<[Option<char>; 3]>, // pending output, popped from the end; `None` = end of input
+    inner: I,
+    normalizer: PhantomData<N>,
+}
+
+impl<I: Iterator<Item = char>, N: NormalizeKoreanSyllables> Iterator
+    for StandardizeKoreanSyllablesInner<I, N>
+{
+    type Item = char;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(c) = self.buf.pop() {
+            c
+        } else {
+            let next_c = self.inner.next();
+            let prev_end_jamo_kind = self.prev_end_jamo_kind;
+            let (next_start_jamo_kind, next_end_jamo_kind) =
+                next_c.map_or((None, None), JamoKind::of);
+            self.prev_end_jamo_kind = next_end_jamo_kind;
+
+            N::insert_fillers(
+                next_c,
+                prev_end_jamo_kind,
+                next_start_jamo_kind,
+                &mut self.buf,
+            )
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (inner_lo, inner_hi) = self.inner.size_hint();
+        let pending = self.buf.iter().flatten().count(); // skip a buffered end-of-input `None`
+        (
+            inner_lo.saturating_add(pending),
+            inner_hi
+                .and_then(|h| h.checked_mul(4)) // worst case: T after a bare L → Vf Lf Vf T
+                .and_then(|h| h.checked_add(pending + 1)), // + 1: Vf emitted after a trailing L
+        )
+    }
+}
+
+impl<I: Iterator<Item = char> + FusedIterator, N: NormalizeKoreanSyllables> FusedIterator
+    for StandardizeKoreanSyllablesInner<I, N>
+{
+}
+
+impl<I, N> StandardizeKoreanSyllablesInner<I, N> {
+    #[inline]
+    fn new(iter: I) -> Self {
+        Self {
+            prev_end_jamo_kind: None,
+            buf: ArrayVec::new(),
+            inner: iter,
+            normalizer: PhantomData,
+        }
+    }
+}
+
+// UAX 29 normalization
+
+#[derive(Clone, Debug)]
+struct Uax29;
+
+impl NormalizeKoreanSyllables for Uax29 {
+    #[inline]
+    fn insert_fillers(
+        next_c: Option<char>,
+        prev_end_jamo_kind: Option<JamoKind>,
+        next_start_jamo_kind: Option<JamoKind>,
+        buf: &mut ArrayVec<[Option<char>; 3]>,
+    ) -> Option<char> {
+        match (prev_end_jamo_kind, next_start_jamo_kind) {
+            // Insert choseong filler before V not preceded by L or V
+            (None, Some(JamoKind::V)) | (Some(JamoKind::T), Some(JamoKind::V)) => {
+                buf.push(next_c);
+                Some('\u{115F}')
+            }
+            // Insert choseong and jungseong fillers before T preceded by non-jamo
+            (None, Some(JamoKind::T)) => {
+                buf.push(next_c);
+                buf.push(Some('\u{1160}'));
+                Some('\u{115F}')
+            }
+            // Insert V
filler between L and non-jamo + (Some(JamoKind::L), None) => { + buf.push(next_c); + Some('\u{1160}') + } + // For L followed by T, insert V filler, L filler, then another V filler + (Some(JamoKind::L), Some(JamoKind::T)) => { + buf.push(next_c); + buf.push(Some('\u{1160}')); + buf.push(Some('\u{115F}')); + Some('\u{1160}') + } + _ => next_c, + } + } +} + +/// Iterator over a string's characters, with U+115F and U+1160 inserted +/// where needed to ensure all Korean syllable blocks are in standard form +/// by [UAX29 rules](https://www.unicode.org/reports/tr29/#Standard_Korean_Syllables). +#[derive(Clone, Debug)] +pub struct StandardizeKoreanSyllables(StandardizeKoreanSyllablesInner); + +impl StandardizeKoreanSyllables { + #[inline] + pub(crate) fn new(iter: I) -> Self { + Self(StandardizeKoreanSyllablesInner::new(iter)) + } +} + +impl> Iterator for StandardizeKoreanSyllables { + type Item = char; + + fn next(&mut self) -> Option { + self.0.next() + } + + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +impl + FusedIterator> FusedIterator for StandardizeKoreanSyllables {} + +// KS X 1026 1 normalization + +#[cfg(feature = "ks_x_1026-1")] +#[derive(Clone, Debug)] +struct KsX1026_1; + +#[cfg(feature = "ks_x_1026-1")] +impl NormalizeKoreanSyllables for KsX1026_1 { + #[inline] + fn insert_fillers( + next_c: Option, + prev_end_jamo_kind: Option, + next_start_jamo_kind: Option, + buf: &mut ArrayVec<[Option; 3]>, + ) -> Option { + match (prev_end_jamo_kind, next_start_jamo_kind) { + // Insert choseong filler before V preceded by V, T or non-jamo + (None, Some(JamoKind::V)) + | (Some(JamoKind::V), Some(JamoKind::V)) + | (Some(JamoKind::T), Some(JamoKind::V)) => { + buf.push(next_c); + Some('\u{115F}') + } + // Insert choseong and jungseong fillers before T preceded by T or non-jamo + (None, Some(JamoKind::T)) | (Some(JamoKind::T), Some(JamoKind::T)) => { + buf.push(next_c); + buf.push(Some('\u{1160}')); + Some('\u{115F}') + } + // Insert V filler 
between L and non-jamo or other L + (Some(JamoKind::L), None) | (Some(JamoKind::L), Some(JamoKind::L)) => { + buf.push(next_c); + Some('\u{1160}') + } + // For L followed by T, insert V filler, L filler, then another V filler + (Some(JamoKind::L), Some(JamoKind::T)) => { + buf.push(next_c); + buf.push(Some('\u{1160}')); + buf.push(Some('\u{115F}')); + Some('\u{1160}') + } + _ => next_c, + } + } +} + +/// Iterator over a string's characters, with U+115F and U+1160 inserted +/// where needed to ensure all Korean syllable blocks are in standard form +/// by [KS X 1026-1](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf) rules. +#[cfg(feature = "ks_x_1026-1")] +#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))] +#[derive(Clone, Debug)] +pub struct StandardizeKoreanSyllablesKsX1026_1(StandardizeKoreanSyllablesInner); + +#[cfg(feature = "ks_x_1026-1")] +impl StandardizeKoreanSyllablesKsX1026_1 { + #[inline] + pub(crate) fn new(iter: I) -> Self { + Self(StandardizeKoreanSyllablesInner::new(iter)) + } +} + +#[cfg(feature = "ks_x_1026-1")] +impl> Iterator for StandardizeKoreanSyllablesKsX1026_1 { + type Item = char; + + fn next(&mut self) -> Option { + self.0.next() + } + + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +#[cfg(feature = "ks_x_1026-1")] +impl + FusedIterator> FusedIterator + for StandardizeKoreanSyllablesKsX1026_1 +{ +} diff --git a/tests/ks_x_1026_1.rs b/tests/ks_x_1026_1.rs new file mode 100644 index 0000000..55dc81c --- /dev/null +++ b/tests/ks_x_1026_1.rs @@ -0,0 +1,103 @@ +#![cfg(feature = "ks_x_1026-1")] + +use unicode_normalization::UnicodeNormalization; + +macro_rules! 
norm_string { + ($method: ident, $input: expr) => { + $input.$method().collect::() + }; +} + +/// § 6.2 +#[test] +fn compatibility_and_halfwidth_hangul_letters() { + // Compatibility + let orig = "\u{3131}\u{314F}"; + assert_eq!(norm_string!(nfkd, orig), "\u{1100}\u{1161}"); + assert_eq!(norm_string!(nfkc, orig), "\u{AC00}"); + assert_eq!( + norm_string!(nfkd_ks_x_1026_1, orig), + "\u{1100}\u{1160}\u{115F}\u{1161}" + ); + assert_eq!( + norm_string!(nfkc_ks_x_1026_1, orig), + "\u{1100}\u{1160}\u{115F}\u{1161}" + ); + + // Halfwidth + let orig = "\u{FFA1}\u{FFC6}"; + assert_eq!(norm_string!(nfd, orig), "\u{FFA1}\u{FFC6}"); + assert_eq!(norm_string!(nfc, orig), "\u{FFA1}\u{FFC6}"); + assert_eq!(norm_string!(nfkd, orig), "\u{1100}\u{1165}"); + assert_eq!(norm_string!(nfkc, orig), "\u{AC70}"); + assert_eq!(norm_string!(nfc_ks_x_1026_1, orig), "\u{FFA1}\u{FFC6}"); + assert_eq!( + norm_string!(nfkd_ks_x_1026_1, orig), + "\u{1100}\u{1160}\u{115F}\u{1165}" + ); + assert_eq!( + norm_string!(nfkc_ks_x_1026_1, orig), + "\u{1100}\u{1160}\u{115F}\u{1165}" + ); +} + +/// § 6.3 +#[test] +fn hangul_embedded_symbols() { + // Circled + let orig = "\u{3260}"; + assert_eq!(norm_string!(nfd, orig), "\u{3260}"); + assert_eq!(norm_string!(nfc, orig), "\u{3260}"); + assert_eq!(norm_string!(nfkd, orig), "\u{1100}"); + assert_eq!(norm_string!(nfkc, orig), "\u{1100}"); + assert_eq!(norm_string!(nfc_ks_x_1026_1, orig), "\u{3260}"); + assert_eq!(norm_string!(nfkd_ks_x_1026_1, orig), "\u{1100}\u{1160}"); + assert_eq!(norm_string!(nfkc_ks_x_1026_1, orig), "\u{1100}\u{1160}"); + + // Parenthesized + let orig = "\u{3200}"; + assert_eq!(norm_string!(nfd, orig), "\u{3200}"); + assert_eq!(norm_string!(nfc, orig), "\u{3200}"); + assert_eq!(norm_string!(nfkd, orig), "(\u{1100})"); + assert_eq!(norm_string!(nfkc, orig), "(\u{1100})"); + assert_eq!(norm_string!(nfc_ks_x_1026_1, orig), "\u{3200}"); + assert_eq!(norm_string!(nfkd_ks_x_1026_1, orig), "(\u{1100}\u{1160})"); + 
assert_eq!(norm_string!(nfkc_ks_x_1026_1, orig), "(\u{1100}\u{1160})"); +} + +/// § 6.4 +#[test] +fn hangul_syllable_blocks() { + let orig = "\u{1100}\u{1161}\u{11EB}"; + assert_eq!(norm_string!(nfd, orig), "\u{1100}\u{1161}\u{11EB}"); + assert_eq!(norm_string!(nfc, orig), "\u{AC00}\u{11EB}"); + assert_eq!(norm_string!(nfkd, orig), "\u{1100}\u{1161}\u{11EB}"); + assert_eq!(norm_string!(nfkc, orig), "\u{AC00}\u{11EB}"); + assert_eq!( + norm_string!(nfc_ks_x_1026_1, orig), + "\u{1100}\u{1161}\u{11EB}" + ); + assert_eq!( + norm_string!(nfkd_ks_x_1026_1, orig), + "\u{1100}\u{1161}\u{11EB}" + ); + assert_eq!( + norm_string!(nfkc_ks_x_1026_1, orig), + "\u{1100}\u{1161}\u{11EB}" + ); +} + +#[test] +fn non_hangul() { + let orig = "ab\u{010D}de\u{0301}"; + assert_eq!(norm_string!(nfd, orig), "abc\u{030C}de\u{0301}"); + assert_eq!(norm_string!(nfc, orig), "ab\u{010D}d\u{00E9}"); + assert_eq!(norm_string!(nfkd, orig), "abc\u{030C}de\u{0301}"); + assert_eq!(norm_string!(nfkc, orig), "ab\u{010D}d\u{00E9}"); + assert_eq!(norm_string!(nfc_ks_x_1026_1, orig), "ab\u{010D}d\u{00E9}"); + assert_eq!( + norm_string!(nfkd_ks_x_1026_1, orig), + "abc\u{030C}de\u{0301}" + ); + assert_eq!(norm_string!(nfkc_ks_x_1026_1, orig), "ab\u{010D}d\u{00E9}"); +} diff --git a/tests/standard_korean_syllables.rs b/tests/standard_korean_syllables.rs new file mode 100644 index 0000000..de5c410 --- /dev/null +++ b/tests/standard_korean_syllables.rs @@ -0,0 +1,74 @@ +use unicode_normalization::UnicodeNormalization; + +const L: char = '\u{1100}'; +const L_F: char = '\u{115F}'; +const V: char = '\u{1161}'; +const V_F: char = '\u{1160}'; +const T: char = '\u{11AE}'; +const LV: char = '\u{AC00}'; +const LVT: char = '\u{AC01}'; + +macro_rules! 
standardize { + ($input: expr) => { + IntoIterator::into_iter($input) + .standard_korean_syllables() + .collect::>() + }; +} + +/// +#[test] +fn korean_syllable_break_examples() { + // LVT LV LV LVf LfV LfVfT + let orig = [LVT, L, V, LV, L, V_F, L_F, V, L_F, V_F, T]; + assert_eq!(standardize!(orig), orig); + + // LL TT VV TT VV LLVV + let orig = [L, L, T, T, V, V, T, T, V, V, L, LV, V]; + assert_eq!( + standardize!(orig), + [L, L, V_F, L_F, V_F, T, T, L_F, V, V, T, T, L_F, V, V, L, LV, V] + ); +} + +#[cfg(feature = "ks_x_1026-1")] +mod ks_x_1026_1 { + use super::*; + macro_rules! standardize_ks_x_1026_1 { + ($input: expr) => { + IntoIterator::into_iter($input) + .standard_korean_syllables_ks_x_1026_1() + .collect::>() + }; + } + + /// § 7.8 + #[test] + fn korean_syllable_break_examples_ks_x_1026_1() { + // LVT LV LV LVf LfV LfVfT + let orig = [LVT, L, V, LV, L, V_F, L_F, V, L_F, V_F, T]; + assert_eq!(standardize_ks_x_1026_1!(orig), orig); + + // L L T T V VT T V V L LV V + let orig = [L, L, T, T, V, V, T, T, V, V, L, LV, V]; + assert_eq!( + standardize_ks_x_1026_1!(orig), + [ + L, V_F, L, V_F, L_F, V_F, T, L_F, V_F, T, L_F, V, L_F, V, T, L_F, V_F, T, L_F, V, + L_F, V, L, V_F, LV, L_F, V + ] + ); + + //L LVf LfVfT T LfV VT T LfV V L LV V + let orig = [ + L, L, V_F, L_F, V_F, T, T, L_F, V, V, T, T, L_F, V, V, L, LV, V, + ]; + assert_eq!( + standardize_ks_x_1026_1!(orig), + [ + L, V_F, L, V_F, L_F, V_F, T, L_F, V_F, T, L_F, V, L_F, V, T, L_F, V_F, T, L_F, V, + L_F, V, L, V_F, LV, L_F, V + ] + ); + } +}