Skip to content

Clean up unicode code in libstd #13770

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 68 additions & 28 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def emit_bsearch_range_table(f):
else if hi < c { Less }
else { Greater }
}) != None
}\n\n
}\n
""");

def emit_property_module(f, mod, tbl):
Expand All @@ -193,11 +193,11 @@ def emit_property_module(f, mod, tbl):
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
f.write("}\n")
f.write("}\n\n")


def emit_conversions_module(f, lowerupper, upperlower):
f.write("pub mod conversions {\n")
f.write("pub mod conversions {")
f.write("""
use cmp::{Equal, Less, Greater};
use slice::ImmutableVector;
Expand Down Expand Up @@ -225,13 +225,14 @@ def emit_conversions_module(f, lowerupper, upperlower):
else { Greater }
})
}

""");
emit_caseconversion_table(f, "LuLl", upperlower)
emit_caseconversion_table(f, "LlLu", lowerupper)
f.write("}\n")

def emit_caseconversion_table(f, name, table):
f.write(" static %s_table : &'static [(char, char)] = &[\n" % name)
f.write(" static %s_table : &'static [(char, char)] = &[\n" % name)
sorted_table = sorted(table.iteritems(), key=operator.itemgetter(0))
ix = 0
for key, value in sorted_table:
Expand Down Expand Up @@ -261,7 +262,7 @@ def emit_decomp_module(f, canon, compat, combine):

compat_keys = compat.keys()
compat_keys.sort()
f.write("pub mod decompose {\n");
f.write("pub mod normalization {\n");
f.write(" use option::Option;\n");
f.write(" use option::{Some, None};\n");
f.write(" use slice::ImmutableVector;\n");
Expand Down Expand Up @@ -345,20 +346,28 @@ def emit_decomp_module(f, canon, compat, combine):
ix += 1
f.write("\n ];\n")

f.write(" pub fn canonical(c: char, i: |char|) "
+ "{ d(c, i, false); }\n\n")
f.write(" pub fn compatibility(c: char, i: |char|) "
+"{ d(c, i, true); }\n\n")
f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n"
+ " bsearch_range_value_table(c, combining_class_table)\n"
+ " }\n\n")
f.write(" fn d(c: char, i: |char|, k: bool) {\n")
f.write(" use iter::Iterator;\n");
f.write("""
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }

f.write(" if c <= '\\x7f' { i(c); return; }\n")
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }

# First check the canonical decompositions
f.write("""
pub fn canonical_combining_class(c: char) -> u8 {
bsearch_range_value_table(c, combining_class_table)
}

fn d(c: char, i: |char|, k: bool) {
use iter::Iterator;

// 7-bit ASCII never decomposes
if c <= '\\x7f' { i(c); return; }

// Perform decomposition for Hangul
if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
decompose_hangul(c, i);
return;
}

// First check the canonical decompositions
match bsearch_table(c, canonical_table) {
Some(canon) => {
for x in canon.iter() {
Expand All @@ -367,13 +376,12 @@ def emit_decomp_module(f, canon, compat, combine):
return;
}
None => ()
}\n\n""")
}

# Bottom out if we're not doing compat.
f.write(" if !k { i(c); return; }\n")
// Bottom out if we're not doing compat.
if !k { i(c); return; }

# Then check the compatibility decompositions
f.write("""
// Then check the compatibility decompositions
match bsearch_table(c, compatibility_table) {
Some(compat) => {
for x in compat.iter() {
Expand All @@ -382,12 +390,45 @@ def emit_decomp_module(f, canon, compat, combine):
return;
}
None => ()
}\n\n""")
}

# Finally bottom out.
f.write(" i(c);\n")
f.write(" }\n")
f.write("}\n\n")
// Finally bottom out.
i(c);
}

// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: u32 = 0xAC00;
static L_BASE: u32 = 0x1100;
static V_BASE: u32 = 0x1161;
static T_BASE: u32 = 0x11A7;
static L_COUNT: u32 = 19;
static V_COUNT: u32 = 21;
static T_COUNT: u32 = 28;
static N_COUNT: u32 = (V_COUNT * T_COUNT);
static S_COUNT: u32 = (L_COUNT * N_COUNT);

// Decompose a precomposed Hangul syllable
fn decompose_hangul(s: char, f: |char|) {
use cast::transmute;

let si = s as u32 - S_BASE;

let li = si / N_COUNT;
unsafe {
f(transmute(L_BASE + li));

let vi = (si % N_COUNT) / T_COUNT;
f(transmute(V_BASE + vi));

let ti = si % T_COUNT;
if ti > 0 {
f(transmute(T_BASE + ti));
}
}
}
}

""")

r = "unicode.rs"
for i in [r]:
Expand All @@ -413,7 +454,6 @@ def emit_decomp_module(f, canon, compat, combine):

#![allow(missing_doc)]
#![allow(non_uppercase_statics)]

''')

emit_bsearch_range_table(rf);
Expand Down
119 changes: 36 additions & 83 deletions src/libstd/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,12 @@ use cast::transmute;
use option::{None, Option, Some};
use iter::{Iterator, range_step};
use str::StrSlice;
use unicode::{derived_property, property, general_category, decompose, conversions};
use unicode::{derived_property, property, general_category, conversions};

/// Returns the canonical decomposition of a character.
pub use unicode::normalization::decompose_canonical;
/// Returns the compatibility decomposition of a character.
pub use unicode::normalization::decompose_compatible;

#[cfg(test)] use str::Str;
#[cfg(test)] use strbuf::StrBuf;
Expand All @@ -38,13 +43,14 @@ use unicode::{derived_property, property, general_category, decompose, conversio
#[cfg(not(test))] use default::Default;

// UTF-8 ranges and tags for encoding characters
static TAG_CONT: uint = 128u;
static MAX_ONE_B: uint = 128u;
static TAG_TWO_B: uint = 192u;
static MAX_TWO_B: uint = 2048u;
static TAG_THREE_B: uint = 224u;
static MAX_THREE_B: uint = 65536u;
static TAG_FOUR_B: uint = 240u;
static TAG_CONT: u8 = 0b1000_0000u8;
static TAG_TWO_B: u8 = 0b1100_0000u8;
static TAG_THREE_B: u8 = 0b1110_0000u8;
static TAG_FOUR_B: u8 = 0b1111_0000u8;
static MAX_ONE_B: u32 = 0x80u32;
static MAX_TWO_B: u32 = 0x800u32;
static MAX_THREE_B: u32 = 0x10000u32;
static MAX_FOUR_B: u32 = 0x200000u32;

/*
Lu Uppercase_Letter an uppercase letter
Expand Down Expand Up @@ -289,53 +295,6 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
}
}

// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: uint = 0xAC00;
static L_BASE: uint = 0x1100;
static V_BASE: uint = 0x1161;
static T_BASE: uint = 0x11A7;
static L_COUNT: uint = 19;
static V_COUNT: uint = 21;
static T_COUNT: uint = 28;
static N_COUNT: uint = (V_COUNT * T_COUNT);
static S_COUNT: uint = (L_COUNT * N_COUNT);

// Decompose a precomposed Hangul syllable
//
// Splits `s` arithmetically into its conjoining jamo and passes each one to
// the callback `f` in order: the L jamo, then the V jamo, then the T jamo if
// the syllable has one. Nothing is returned; all output goes through `f`.
//
// Precondition: `s` must lie in [S_BASE, S_BASE + S_COUNT). The callers in
// this file check that range before calling; outside it the subtraction
// below and the `transmute`s are not meaningful.
fn decompose_hangul(s: char, f: |char|) {
// Zero-based index of the syllable within the precomposed Hangul block.
let si = s as uint - S_BASE;

// Each L jamo covers N_COUNT (= V_COUNT * T_COUNT) consecutive syllables.
let li = si / N_COUNT;
// `transmute` reinterprets a u32 as a `char` without validation. Given the
// precondition above, li < L_COUNT, vi < V_COUNT and ti < T_COUNT, so every
// value produced here stays within the 0x1100..0x11C2 jamo range.
unsafe {
f(transmute((L_BASE + li) as u32));

// Position inside the current L group, divided into runs of T_COUNT.
let vi = (si % N_COUNT) / T_COUNT;
f(transmute((V_BASE + vi) as u32));

// Index 0 emits nothing, so T_BASE itself is never passed to `f`.
let ti = si % T_COUNT;
if ti > 0 {
f(transmute((T_BASE + ti) as u32));
}
}
}

/// Returns the canonical decomposition of a character
///
/// The result is not returned as a value: it is delivered through the
/// callback `f`.
///
/// Code points in `[S_BASE, S_BASE + S_COUNT)` (precomposed Hangul syllables)
/// are decomposed arithmetically by `decompose_hangul`; every other character
/// is forwarded to `decompose::canonical`.
pub fn decompose_canonical(c: char, f: |char|) {
// Outside the Hangul syllable block: defer to `decompose::canonical`.
if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
decompose::canonical(c, f);
} else {
// Inside the block: computed directly by `decompose_hangul`.
decompose_hangul(c, f);
}
}

/// Returns the compatibility decomposition of a character
///
/// The result is not returned as a value: it is delivered through the
/// callback `f`.
///
/// Code points in `[S_BASE, S_BASE + S_COUNT)` (precomposed Hangul syllables)
/// take the same arithmetic path as in `decompose_canonical`, via
/// `decompose_hangul`; every other character is forwarded to
/// `decompose::compatibility`.
pub fn decompose_compatible(c: char, f: |char|) {
// Outside the Hangul syllable block: defer to `decompose::compatibility`.
if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
decompose::compatibility(c, f);
} else {
// Inside the block: computed directly by `decompose_hangul`.
decompose_hangul(c, f);
}
}

///
/// Returns the hexadecimal Unicode escape of a `char`
///
Expand Down Expand Up @@ -391,12 +350,7 @@ pub fn escape_default(c: char, f: |char|) {

/// Returns the amount of bytes this `char` would need if encoded in UTF-8
pub fn len_utf8_bytes(c: char) -> uint {
static MAX_ONE_B: uint = 128u;
static MAX_TWO_B: uint = 2048u;
static MAX_THREE_B: uint = 65536u;
static MAX_FOUR_B: uint = 2097152u;

let code = c as uint;
let code = c as u32;
match () {
_ if code < MAX_ONE_B => 1u,
_ if code < MAX_TWO_B => 2u,
Expand Down Expand Up @@ -611,41 +565,40 @@ impl Char for char {

fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }

fn encode_utf8(&self, dst: &mut [u8]) -> uint {
let code = *self as uint;
fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
let code = *self as u32;
if code < MAX_ONE_B {
dst[0] = code as u8;
return 1;
1
} else if code < MAX_TWO_B {
dst[0] = (code >> 6u & 31u | TAG_TWO_B) as u8;
dst[1] = (code & 63u | TAG_CONT) as u8;
return 2;
dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
2
} else if code < MAX_THREE_B {
dst[0] = (code >> 12u & 15u | TAG_THREE_B) as u8;
dst[1] = (code >> 6u & 63u | TAG_CONT) as u8;
dst[2] = (code & 63u | TAG_CONT) as u8;
return 3;
dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
3
} else {
dst[0] = (code >> 18u & 7u | TAG_FOUR_B) as u8;
dst[1] = (code >> 12u & 63u | TAG_CONT) as u8;
dst[2] = (code >> 6u & 63u | TAG_CONT) as u8;
dst[3] = (code & 63u | TAG_CONT) as u8;
return 4;
dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT;
dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
4
}
}

fn encode_utf16(&self, dst: &mut [u16]) -> uint {
let mut ch = *self as uint;
if (ch & 0xFFFF_u) == ch {
// The BMP falls through (assuming non-surrogate, as it
// should)
assert!(ch <= 0xD7FF_u || ch >= 0xE000_u);
let mut ch = *self as u32;
if (ch & 0xFFFF_u32) == ch {
// The BMP falls through (assuming non-surrogate, as it should)
assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
dst[0] = ch as u16;
1
} else {
// Supplementary planes break into surrogates.
assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u);
ch -= 0x1_0000_u;
assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
ch -= 0x1_0000_u32;
dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
2
Expand Down
Loading