Skip to content

Commit c24ac7f

Browse files
Add API to transform into standard Korean syllables
See https://www.unicode.org/reports/tr29/#Transforming_Into_SKS
1 parent 6b86cc2 commit c24ac7f

File tree

3 files changed

+189
-0
lines changed

3 files changed

+189
-0
lines changed

src/lib.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ pub use crate::quick_check::{
7373
};
7474
pub use crate::recompose::Recompositions;
7575
pub use crate::replace::Replacements;
76+
pub use crate::standardize_korean_syllables::StandardKoreanSyllables;
7677
pub use crate::stream_safe::StreamSafe;
7778
pub use crate::tables::UNICODE_VERSION;
7879
use core::{option, str::Chars};
@@ -86,6 +87,7 @@ mod perfect_hash;
8687
mod quick_check;
8788
mod recompose;
8889
mod replace;
90+
mod standardize_korean_syllables;
8991
mod stream_safe;
9092

9193
#[rustfmt::skip]
@@ -146,6 +148,10 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
146148
/// inserted according to the Stream-Safe Text Process ([UAX15-D4](https://unicode.org/reports/tr15/#UAX15-D4))
147149
fn stream_safe(self) -> StreamSafe<I>;
148150

151+
/// An iterator over the string with Hangul choseong and jungseong filler characters inserted
152+
/// to ensure that all Korean syllable blocks are in standard form according to [UAX29](https://www.unicode.org/reports/tr29/#Transforming_Into_SKS).
153+
fn standard_korean_syllables(self) -> StandardKoreanSyllables<I>;
154+
149155
/// An iterator over the string in the variant of Unicode Normalization Form KD
150156
/// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode
151157
/// in that it will not produce nonstandard Korean jamo sequences if none were present in the input.
@@ -210,6 +216,11 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
210216
StreamSafe::new(self.chars())
211217
}
212218

219+
// `&str` implementation: feeds the string's `chars()` iterator to `StandardKoreanSyllables::new`.
#[inline]
220+
fn standard_korean_syllables(self) -> StandardKoreanSyllables<Chars<'a>> {
221+
StandardKoreanSyllables::new(self.chars())
222+
}
223+
213224
#[cfg(feature = "ks_x_1026-1")]
214225
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
215226
#[inline]
@@ -265,6 +276,11 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
265276
StreamSafe::new(Some(self).into_iter())
266277
}
267278

279+
// `char` implementation: wraps the single character in a one-item iterator
// (`Some(self).into_iter()`) and feeds it to `StandardKoreanSyllables::new`.
#[inline]
280+
fn standard_korean_syllables(self) -> StandardKoreanSyllables<option::IntoIter<char>> {
281+
StandardKoreanSyllables::new(Some(self).into_iter())
282+
}
283+
268284
#[cfg(feature = "ks_x_1026-1")]
269285
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
270286
#[inline]
@@ -322,6 +338,11 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
322338
StreamSafe::new(self)
323339
}
324340

341+
// Blanket implementation for any `char` iterator: the iterator itself is handed
// to `StandardKoreanSyllables::new` unchanged.
#[inline]
342+
fn standard_korean_syllables(self) -> StandardKoreanSyllables<I> {
343+
StandardKoreanSyllables::new(self)
344+
}
345+
325346
#[cfg(feature = "ks_x_1026-1")]
326347
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
327348
#[inline]

src/standardize_korean_syllables.rs

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
use core::iter::FusedIterator;
2+
3+
use tinyvec::ArrayVec;
4+
5+
use crate::normalize::hangul_constants::{N_COUNT, S_BASE, T_COUNT};
6+
7+
/// Class of conjoining Hangul jamo that a character starts or ends with,
/// as assigned by `JamoKind::of`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
8+
enum JamoKind {
9+
/// Leading jamo; this class includes the choseong filler U+115F.
L,
10+
/// Vowel jamo; this class includes the jungseong filler U+1160.
V,
11+
/// Trailing jamo.
T,
12+
}
13+
14+
impl JamoKind {
15+
fn of(c: char) -> (Option<Self>, Option<Self>) {
16+
match c {
17+
// L
18+
'\u{1100}'..='\u{115F}' | '\u{A960}'..='\u{A97C}' => {
19+
(Some(JamoKind::L), Some(JamoKind::L))
20+
}
21+
// V
22+
'\u{1160}'..='\u{11A7}' | '\u{D7B0}'..='\u{D7C6}' => {
23+
(Some(JamoKind::V), Some(JamoKind::V))
24+
}
25+
// T
26+
'\u{11A8}'..='\u{11FF}' | '\u{D7CB}'..='\u{D7FB}' => {
27+
(Some(JamoKind::T), Some(JamoKind::T))
28+
}
29+
// LV or LVT
30+
'\u{AC00}'..='\u{D7A3}' => (
31+
Some(JamoKind::L),
32+
Some(if ((u32::from(c) - S_BASE) % N_COUNT) % T_COUNT == 0 {
33+
// LV
34+
JamoKind::V
35+
} else {
36+
// LVT
37+
JamoKind::T
38+
}),
39+
),
40+
_ => (None, None),
41+
}
42+
}
43+
}
44+
45+
/// Iterator over a string's characters, with '\u{115F}' and '\u{1160}' inserted
46+
/// where needed to ensure all Korean syllable blocks are in standard form
47+
/// by [UAX29 rules](https://www.unicode.org/reports/tr29/#Standard_Korean_Syllables).
48+
#[derive(Clone, Debug)]
49+
pub struct StandardKoreanSyllables<I> {
50+
// Jamo kind that the most recently read inner character ended with;
// `None` before any input, after a non-jamo character, and at end of input.
prev_end_jamo_kind: Option<JamoKind>,
51+
// Output queued for later calls to `next`, popped from the back. Each entry
// is yielded as-is, so a queued `None` ends the iteration.
buf: ArrayVec<[Option<char>; 3]>,
52+
// The wrapped character iterator.
inner: I,
53+
}
54+
55+
impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
56+
type Item = char;
57+
58+
fn next(&mut self) -> Option<Self::Item> {
59+
if let Some(c) = self.buf.pop() {
60+
c
61+
} else {
62+
let next_c = self.inner.next();
63+
let prev_end_jamo_kind = self.prev_end_jamo_kind;
64+
let (next_start_jamo_kind, next_end_jamo_kind) =
65+
next_c.map_or((None, None), JamoKind::of);
66+
self.prev_end_jamo_kind = next_end_jamo_kind;
67+
68+
insert_fillers(
69+
next_c,
70+
prev_end_jamo_kind,
71+
next_start_jamo_kind,
72+
&mut self.buf,
73+
)
74+
}
75+
}
76+
77+
#[inline]
78+
fn size_hint(&self) -> (usize, Option<usize>) {
79+
let (inner_lo, inner_hi) = self.inner.size_hint();
80+
let add_factor: usize = self.buf.len();
81+
(
82+
inner_lo.saturating_add(add_factor),
83+
inner_hi
84+
.and_then(|h| h.checked_mul(3)) // T → Lf Vf T
85+
.and_then(|h| h.checked_add(add_factor)),
86+
)
87+
}
88+
}
89+
90+
// Once a fused inner iterator is exhausted, `next` finds an empty buffer, reads `None`
// again with no previous jamo kind recorded, and so keeps returning `None`.
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StandardKoreanSyllables<I> {}
91+
92+
#[inline]
93+
fn insert_fillers(
94+
next_c: Option<char>,
95+
prev_end_jamo_kind: Option<JamoKind>,
96+
next_start_jamo_kind: Option<JamoKind>,
97+
buf: &mut ArrayVec<[Option<char>; 3]>,
98+
) -> Option<char> {
99+
match (prev_end_jamo_kind, next_start_jamo_kind) {
100+
// Insert choseong filler before V not preceded by L or V
101+
(None, Some(JamoKind::V)) | (Some(JamoKind::T), Some(JamoKind::V)) => {
102+
buf.push(next_c);
103+
Some('\u{115F}')
104+
}
105+
// Insert choseong and jungseong fillers before T preceded non-jamo
106+
(None, Some(JamoKind::T)) => {
107+
buf.push(next_c);
108+
buf.push(Some('\u{1160}'));
109+
Some('\u{115F}')
110+
}
111+
// Insert V filler between L and non-jamo
112+
(Some(JamoKind::L), None) => {
113+
buf.push(next_c);
114+
Some('\u{1160}')
115+
}
116+
// For L followed by T, insert V filler, L filler, then another V filler
117+
(Some(JamoKind::L), Some(JamoKind::T)) => {
118+
buf.push(next_c);
119+
buf.push(Some('\u{1160}'));
120+
buf.push(Some('\u{115F}'));
121+
Some('\u{1160}')
122+
}
123+
_ => next_c,
124+
}
125+
}
126+
127+
impl<I> StandardKoreanSyllables<I> {
128+
#[inline]
129+
pub(crate) fn new(iter: I) -> Self {
130+
Self {
131+
prev_end_jamo_kind: None,
132+
buf: ArrayVec::new(),
133+
inner: iter,
134+
}
135+
}
136+
}

tests/standard_korean_syllables.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
use unicode_normalization::UnicodeNormalization;
2+
3+
/// Runs `standard_korean_syllables()` over anything iterable as `char`s and
/// collects the output into a `Vec<char>`.
macro_rules! standardize {
    ($chars:expr) => {{
        let standardized: Vec<char> = IntoIterator::into_iter($chars)
            .standard_korean_syllables()
            .collect();
        standardized
    }};
}
10+
11+
/// Examples from
/// <https://www.unicode.org/reports/tr29/#Korean_Syllable_Break_Examples>
#[test]
fn korean_syllable_break_examples() {
    const L: char = '\u{1100}';
    const L_F: char = '\u{115F}';
    const V: char = '\u{1161}';
    const V_F: char = '\u{1160}';
    const T: char = '\u{11AE}';
    const LV: char = '\u{AC00}';
    const LVT: char = '\u{AC01}';

    // Already standard, so it must pass through unchanged:
    // LVT LV LV LVf LfV LfVfT
    let already_standard = [LVT, L, V, LV, L, V_F, L_F, V, L_F, V_F, T];
    assert_eq!(standardize!(already_standard), already_standard);

    // Non-standard input that needs fillers:
    // LL TT VV TT VV LLVV
    let nonstandard = [L, L, T, T, V, V, T, T, V, V, L, LV, V];
    let expected = [
        L, L, V_F, L_F, V_F, T, T, L_F, V, V, T, T, L_F, V, V, L, LV, V,
    ];
    assert_eq!(standardize!(nonstandard), expected);
}

0 commit comments

Comments
 (0)