Skip to content

Commit dfe38db

Browse files
committed
Fix handling of upper/lowercase, and whitespace
1 parent c234614 commit dfe38db

File tree

5 files changed

+689
-29
lines changed

5 files changed

+689
-29
lines changed

src/etc/unicode.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def load_unicode_data(f):
4141
continue
4242
[code, name, gencat, combine, bidi,
4343
decomp, deci, digit, num, mirror,
44-
old, iso, upcase, lowcsae, titlecase ] = fields
44+
old, iso, upcase, lowcase, titlecase ] = fields
4545

4646
code = int(code, 16)
4747

@@ -89,11 +89,9 @@ def load_unicode_data(f):
8989

9090
return (canon_decomp, compat_decomp, gencats, combines)
9191

92-
93-
def load_derived_core_properties(f):
92+
def load_properties(f, interestingprops):
9493
fetch(f)
95-
derivedprops = {}
96-
interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"]
94+
props = {}
9795
re1 = re.compile("^([0-9A-F]+) +; (\w+)")
9896
re2 = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")
9997

@@ -118,10 +116,10 @@ def load_derived_core_properties(f):
118116
continue
119117
d_lo = int(d_lo, 16)
120118
d_hi = int(d_hi, 16)
121-
if prop not in derivedprops:
122-
derivedprops[prop] = []
123-
derivedprops[prop].append((d_lo, d_hi))
124-
return derivedprops
119+
if prop not in props:
120+
props[prop] = []
121+
props[prop].append((d_lo, d_hi))
122+
return props
125123

126124
def escape_char(c):
127125
if c <= 0xff:
@@ -376,5 +374,9 @@ def emit_decomp_module(f, canon, compat, combine):
376374

377375
emit_decomp_module(rf, canon_decomp, compat_decomp, combines)
378376

379-
derived = load_derived_core_properties("DerivedCoreProperties.txt")
377+
derived = load_properties("DerivedCoreProperties.txt",
378+
["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
380379
emit_property_module(rf, "derived_property", derived)
380+
381+
props = load_properties("PropList.txt", ["White_Space"])
382+
emit_property_module(rf, "property", props)

src/libstd/char.rs

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use cast::transmute;
1414
use option::{None, Option, Some};
1515
use iter::{Iterator, range_step};
1616
use str::StrSlice;
17-
use unicode::{derived_property, general_category, decompose};
17+
use unicode::{derived_property, property, general_category, decompose};
1818
use to_str::ToStr;
1919
use str;
2020

@@ -89,30 +89,28 @@ pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
8989

9090
///
9191
/// Indicates whether a character is in lower case, defined
92-
/// in terms of the Unicode General Category 'Ll'
92+
/// in terms of the Unicode Derived Core Property 'Lowercase'.
9393
///
9494
#[inline]
95-
pub fn is_lowercase(c: char) -> bool { general_category::Ll(c) }
95+
pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) }
9696

9797
///
9898
/// Indicates whether a character is in upper case, defined
99-
/// in terms of the Unicode General Category 'Lu'.
99+
/// in terms of the Unicode Derived Core Property 'Uppercase'.
100100
///
101101
#[inline]
102-
pub fn is_uppercase(c: char) -> bool { general_category::Lu(c) }
102+
pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) }
103103

104104
///
105105
/// Indicates whether a character is whitespace. Whitespace is defined in
106-
/// terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
107-
/// additional 'Cc'-category control codes in the range [0x09, 0x0d]
106+
/// terms of the Unicode Property 'White_Space'.
108107
///
109108
#[inline]
110109
pub fn is_whitespace(c: char) -> bool {
110+
// As an optimization, ASCII whitespace characters are checked separately
111111
c == ' '
112112
|| ('\x09' <= c && c <= '\x0d')
113-
|| general_category::Zs(c)
114-
|| general_category::Zl(c)
115-
|| general_category::Zp(c)
113+
|| property::White_Space(c)
116114
}
117115

118116
///

0 commit comments

Comments
 (0)