diff --git a/src/etc/unicode.py b/src/etc/unicode.py index d5c74e367340e..0ec4116c52880 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -169,7 +169,7 @@ def emit_bsearch_range_table(f): else if hi < c { Less } else { Greater } }) != None -}\n\n +}\n """); def emit_property_module(f, mod, tbl): @@ -193,11 +193,11 @@ def emit_property_module(f, mod, tbl): f.write(" pub fn %s(c: char) -> bool {\n" % cat) f.write(" super::bsearch_range_table(c, %s_table)\n" % cat) f.write(" }\n\n") - f.write("}\n") + f.write("}\n\n") def emit_conversions_module(f, lowerupper, upperlower): - f.write("pub mod conversions {\n") + f.write("pub mod conversions {") f.write(""" use cmp::{Equal, Less, Greater}; use slice::ImmutableVector; @@ -225,13 +225,14 @@ def emit_conversions_module(f, lowerupper, upperlower): else { Greater } }) } + """); emit_caseconversion_table(f, "LuLl", upperlower) emit_caseconversion_table(f, "LlLu", lowerupper) f.write("}\n") def emit_caseconversion_table(f, name, table): - f.write(" static %s_table : &'static [(char, char)] = &[\n" % name) + f.write(" static %s_table : &'static [(char, char)] = &[\n" % name) sorted_table = sorted(table.iteritems(), key=operator.itemgetter(0)) ix = 0 for key, value in sorted_table: @@ -261,7 +262,7 @@ def emit_decomp_module(f, canon, compat, combine): compat_keys = compat.keys() compat_keys.sort() - f.write("pub mod decompose {\n"); + f.write("pub mod normalization {\n"); f.write(" use option::Option;\n"); f.write(" use option::{Some, None};\n"); f.write(" use slice::ImmutableVector;\n"); @@ -345,20 +346,28 @@ def emit_decomp_module(f, canon, compat, combine): ix += 1 f.write("\n ];\n") - f.write(" pub fn canonical(c: char, i: |char|) " - + "{ d(c, i, false); }\n\n") - f.write(" pub fn compatibility(c: char, i: |char|) " - +"{ d(c, i, true); }\n\n") - f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n" - + " bsearch_range_value_table(c, combining_class_table)\n" - + " }\n\n") - f.write(" fn d(c: char, i: |char|, k: bool) {\n") - f.write(" use iter::Iterator;\n"); + f.write(""" + pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); } - f.write(" if c <= '\\x7f' { i(c); return; }\n") + pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); } - # First check the canonical decompositions - f.write(""" + pub fn canonical_combining_class(c: char) -> u8 { + bsearch_range_value_table(c, combining_class_table) + } + + fn d(c: char, i: |char|, k: bool) { + use iter::Iterator; + + // 7-bit ASCII never decomposes + if c <= '\\x7f' { i(c); return; } + + // Perform decomposition for Hangul + if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) { + decompose_hangul(c, i); + return; + } + + // First check the canonical decompositions match bsearch_table(c, canonical_table) { Some(canon) => { for x in canon.iter() { @@ -367,13 +376,12 @@ def emit_decomp_module(f, canon, compat, combine): return; } None => () - }\n\n""") + } - # Bottom out if we're not doing compat. - f.write(" if !k { i(c); return; }\n") + // Bottom out if we're not doing compat. + if !k { i(c); return; } - # Then check the compatibility decompositions - f.write(""" + // Then check the compatibility decompositions match bsearch_table(c, compatibility_table) { Some(compat) => { for x in compat.iter() { @@ -382,12 +390,45 @@ def emit_decomp_module(f, canon, compat, combine): return; } None => () - }\n\n""") + } - # Finally bottom out. - f.write(" i(c);\n") - f.write(" }\n") - f.write("}\n\n") + // Finally bottom out. + i(c); + } + + // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior + static S_BASE: u32 = 0xAC00; + static L_BASE: u32 = 0x1100; + static V_BASE: u32 = 0x1161; + static T_BASE: u32 = 0x11A7; + static L_COUNT: u32 = 19; + static V_COUNT: u32 = 21; + static T_COUNT: u32 = 28; + static N_COUNT: u32 = (V_COUNT * T_COUNT); + static S_COUNT: u32 = (L_COUNT * N_COUNT); + + // Decompose a precomposed Hangul syllable + fn decompose_hangul(s: char, f: |char|) { + use cast::transmute; + + let si = s as u32 - S_BASE; + + let li = si / N_COUNT; + unsafe { + f(transmute(L_BASE + li)); + + let vi = (si % N_COUNT) / T_COUNT; + f(transmute(V_BASE + vi)); + + let ti = si % T_COUNT; + if ti > 0 { + f(transmute(T_BASE + ti)); + } + } + } +} + +""") r = "unicode.rs" for i in [r]: @@ -413,7 +454,6 @@ def emit_decomp_module(f, canon, compat, combine): #![allow(missing_doc)] #![allow(non_uppercase_statics)] - ''') emit_bsearch_range_table(rf); diff --git a/src/libstd/char.rs b/src/libstd/char.rs index 228db221cfc61..76d673e3c8445 100644 --- a/src/libstd/char.rs +++ b/src/libstd/char.rs @@ -28,7 +28,12 @@ use cast::transmute; use option::{None, Option, Some}; use iter::{Iterator, range_step}; use str::StrSlice; -use unicode::{derived_property, property, general_category, decompose, conversions}; +use unicode::{derived_property, property, general_category, conversions}; + +/// Returns the canonical decomposition of a character. +pub use unicode::normalization::decompose_canonical; +/// Returns the compatibility decomposition of a character. +pub use unicode::normalization::decompose_compatible; #[cfg(test)] use str::Str; #[cfg(test)] use strbuf::StrBuf; @@ -38,13 +43,14 @@ use unicode::{derived_property, property, general_category, decompose, conversio #[cfg(not(test))] use default::Default; // UTF-8 ranges and tags for encoding characters -static TAG_CONT: uint = 128u; -static MAX_ONE_B: uint = 128u; -static TAG_TWO_B: uint = 192u; -static MAX_TWO_B: uint = 2048u; -static TAG_THREE_B: uint = 224u; -static MAX_THREE_B: uint = 65536u; -static TAG_FOUR_B: uint = 240u; +static TAG_CONT: u8 = 0b1000_0000u8; +static TAG_TWO_B: u8 = 0b1100_0000u8; +static TAG_THREE_B: u8 = 0b1110_0000u8; +static TAG_FOUR_B: u8 = 0b1111_0000u8; +static MAX_ONE_B: u32 = 0x80u32; +static MAX_TWO_B: u32 = 0x800u32; +static MAX_THREE_B: u32 = 0x10000u32; +static MAX_FOUR_B: u32 = 0x200000u32; /* Lu Uppercase_Letter an uppercase letter @@ -289,53 +295,6 @@ pub fn from_digit(num: uint, radix: uint) -> Option { } } -// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior -static S_BASE: uint = 0xAC00; -static L_BASE: uint = 0x1100; -static V_BASE: uint = 0x1161; -static T_BASE: uint = 0x11A7; -static L_COUNT: uint = 19; -static V_COUNT: uint = 21; -static T_COUNT: uint = 28; -static N_COUNT: uint = (V_COUNT * T_COUNT); -static S_COUNT: uint = (L_COUNT * N_COUNT); - -// Decompose a precomposed Hangul syllable -fn decompose_hangul(s: char, f: |char|) { - let si = s as uint - S_BASE; - - let li = si / N_COUNT; - unsafe { - f(transmute((L_BASE + li) as u32)); - - let vi = (si % N_COUNT) / T_COUNT; - f(transmute((V_BASE + vi) as u32)); - - let ti = si % T_COUNT; - if ti > 0 { - f(transmute((T_BASE + ti) as u32)); - } - } -} - -/// Returns the canonical decomposition of a character -pub fn decompose_canonical(c: char, f: |char|) { - if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) { - decompose::canonical(c, f); - } else { - decompose_hangul(c, f); - } -} - -/// Returns the compatibility decomposition of a character -pub fn decompose_compatible(c: char, f: |char|) { - if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) { - decompose::compatibility(c, f); - } else { - decompose_hangul(c, f); - } -} - /// /// Returns the hexadecimal Unicode escape of a `char` /// @@ -391,12 +350,7 @@ pub fn escape_default(c: char, f: |char|) { /// Returns the amount of bytes this `char` would need if encoded in UTF-8 pub fn len_utf8_bytes(c: char) -> uint { - static MAX_ONE_B: uint = 128u; - static MAX_TWO_B: uint = 2048u; - static MAX_THREE_B: uint = 65536u; - static MAX_FOUR_B: uint = 2097152u; - - let code = c as uint; + let code = c as u32; match () { _ if code < MAX_ONE_B => 1u, _ if code < MAX_TWO_B => 2u, @@ -611,41 +565,40 @@ impl Char for char { fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) } - fn encode_utf8(&self, dst: &mut [u8]) -> uint { - let code = *self as uint; + fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint { + let code = *self as u32; if code < MAX_ONE_B { dst[0] = code as u8; - return 1; + 1 } else if code < MAX_TWO_B { - dst[0] = (code >> 6u & 31u | TAG_TWO_B) as u8; - dst[1] = (code & 63u | TAG_CONT) as u8; - return 2; + dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B; + dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT; + 2 } else if code < MAX_THREE_B { - dst[0] = (code >> 12u & 15u | TAG_THREE_B) as u8; - dst[1] = (code >> 6u & 63u | TAG_CONT) as u8; - dst[2] = (code & 63u | TAG_CONT) as u8; - return 3; + dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B; + dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT; + dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT; + 3 } else { - dst[0] = (code >> 18u & 7u | TAG_FOUR_B) as u8; - dst[1] = (code >> 12u & 63u | TAG_CONT) as u8; - dst[2] = (code >> 6u & 63u | TAG_CONT) as u8; - dst[3] = (code & 63u | TAG_CONT) as u8; - return 4; + dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B; + dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT; + dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT; + dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT; + 4 } } fn encode_utf16(&self, dst: &mut [u16]) -> uint { - let mut ch = *self as uint; - if (ch & 0xFFFF_u) == ch { - // The BMP falls through (assuming non-surrogate, as it - // should) - assert!(ch <= 0xD7FF_u || ch >= 0xE000_u); + let mut ch = *self as u32; + if (ch & 0xFFFF_u32) == ch { + // The BMP falls through (assuming non-surrogate, as it should) + assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32); dst[0] = ch as u16; 1 } else { // Supplementary planes break into surrogates. - assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u); - ch -= 0x1_0000_u; + assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32); + ch -= 0x1_0000_u32; dst[0] = 0xD800_u16 | ((ch >> 10) as u16); dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16); 2 diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 99f1c66e70272..abb6b3180c96f 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -581,25 +581,25 @@ fn canonical_sort(comb: &mut [(char, u8)]) { } #[deriving(Clone)] -enum NormalizationForm { - NFD, - NFKD +enum DecompositionType { + Canonical, + Compatible } -/// External iterator for a string's normalization's characters. +/// External iterator for a string's decomposition's characters. /// Use with the `std::iter` module. #[deriving(Clone)] -pub struct Normalizations<'a> { - kind: NormalizationForm, +pub struct Decompositions<'a> { + kind: DecompositionType, iter: Chars<'a>, buffer: Vec<(char, u8)>, sorted: bool } -impl<'a> Iterator for Normalizations<'a> { +impl<'a> Iterator for Decompositions<'a> { #[inline] fn next(&mut self) -> Option { - use unicode::decompose::canonical_combining_class; + use unicode::normalization::canonical_combining_class; match self.buffer.as_slice().head() { Some(&(c, 0)) => { @@ -615,8 +615,8 @@ impl<'a> Iterator for Normalizations<'a> { } let decomposer = match self.kind { - NFD => char::decompose_canonical, - NFKD => char::decompose_compatible + Canonical => char::decompose_canonical, + Compatible => char::decompose_compatible }; if !self.sorted { @@ -1805,11 +1805,11 @@ pub trait StrSlice<'a> { /// An Iterator over the string in Unicode Normalization Form D /// (canonical decomposition). - fn nfd_chars(&self) -> Normalizations<'a>; + fn nfd_chars(&self) -> Decompositions<'a>; /// An Iterator over the string in Unicode Normalization Form KD /// (compatibility decomposition). - fn nfkd_chars(&self) -> Normalizations<'a>; + fn nfkd_chars(&self) -> Decompositions<'a>; /// Returns true if the string contains only whitespace. /// @@ -2388,22 +2388,22 @@ impl<'a> StrSlice<'a> for &'a str { } #[inline] - fn nfd_chars(&self) -> Normalizations<'a> { - Normalizations { + fn nfd_chars(&self) -> Decompositions<'a> { + Decompositions { iter: self.chars(), buffer: Vec::new(), sorted: false, - kind: NFD + kind: Canonical } } #[inline] - fn nfkd_chars(&self) -> Normalizations<'a> { - Normalizations { + fn nfkd_chars(&self) -> Decompositions<'a> { + Decompositions { iter: self.chars(), buffer: Vec::new(), sorted: false, - kind: NFKD + kind: Compatible } } diff --git a/src/libstd/unicode.rs b/src/libstd/unicode.rs index c98861a0fe76b..8751a29fbbc4f 100644 --- a/src/libstd/unicode.rs +++ b/src/libstd/unicode.rs @@ -103,7 +103,8 @@ pub mod general_category { } } -pub mod decompose { + +pub mod normalization { use option::Option; use option::{Some, None}; use slice::ImmutableVector; @@ -2282,9 +2283,10 @@ pub mod decompose { ('\U0001d185', '\U0001d189', 230), ('\U0001d18a', '\U0001d18b', 220), ('\U0001d1aa', '\U0001d1ad', 230), ('\U0001d242', '\U0001d244', 230) ]; - pub fn canonical(c: char, i: |char|) { d(c, i, false); } - pub fn compatibility(c: char, i: |char|) { d(c, i, true); } + pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); } + + pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); } pub fn canonical_combining_class(c: char) -> u8 { bsearch_range_value_table(c, combining_class_table) @@ -2292,8 +2294,17 @@ pub mod decompose { fn d(c: char, i: |char|, k: bool) { use iter::Iterator; + + // 7-bit ASCII never decomposes if c <= '\x7f' { i(c); return; } + // Perform decomposition for Hangul + if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) { + decompose_hangul(c, i); + return; + } + + // First check the canonical decompositions match bsearch_table(c, canonical_table) { Some(canon) => { for x in canon.iter() { @@ -2304,8 +2315,10 @@ pub mod decompose { None => () } + // Bottom out if we're not doing compat. if !k { i(c); return; } + // Then check the compatibility decompositions match bsearch_table(c, compatibility_table) { Some(compat) => { for x in compat.iter() { @@ -2316,8 +2329,40 @@ pub mod decompose { None => () } + // Finally bottom out. i(c); } + + // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior + static S_BASE: u32 = 0xAC00; + static L_BASE: u32 = 0x1100; + static V_BASE: u32 = 0x1161; + static T_BASE: u32 = 0x11A7; + static L_COUNT: u32 = 19; + static V_COUNT: u32 = 21; + static T_COUNT: u32 = 28; + static N_COUNT: u32 = (V_COUNT * T_COUNT); + static S_COUNT: u32 = (L_COUNT * N_COUNT); + + // Decompose a precomposed Hangul syllable + fn decompose_hangul(s: char, f: |char|) { + use cast::transmute; + + let si = s as u32 - S_BASE; + + let li = si / N_COUNT; + unsafe { + f(transmute(L_BASE + li)); + + let vi = (si % N_COUNT) / T_COUNT; + f(transmute(V_BASE + vi)); + + let ti = si % T_COUNT; + if ti > 0 { + f(transmute(T_BASE + ti)); + } + } + } } pub mod derived_property { @@ -4134,6 +4179,7 @@ pub mod derived_property { pub fn XID_Start(c: char) -> bool { super::bsearch_range_table(c, XID_Start_table) } + } pub mod property { @@ -4149,6 +4195,7 @@ pub mod property { pub fn White_Space(c: char) -> bool { super::bsearch_range_table(c, White_Space_table) } + } pub mod conversions { @@ -4667,7 +4714,7 @@ pub mod conversions { ('\U00010426', '\U0001044e'), ('\U00010427', '\U0001044f') ]; - static LlLu_table : &'static [(char, char)] = &[ + static LlLu_table : &'static [(char, char)] = &[ ('\x61', '\x41'), ('\x62', '\x42'), ('\x63', '\x43'), ('\x64', '\x44'), ('\x65', '\x45'), ('\x66', '\x46'),