Skip to content

Added Ascii encoding, some cleanups #5980

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 23, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions src/libcore/char.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2012 The Rust Project Developers. See the COPYRIGHT
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
Expand Down Expand Up @@ -234,6 +234,21 @@ pub fn escape_default(c: char) -> ~str {
}
}

/// Returns the number of bytes this character would need if encoded in UTF-8.
///
/// The result is 1, 2, 3 or 4, chosen by comparing the character's numeric
/// value against the exclusive upper bounds 128, 2048, 65536 and 2097152.
/// Fails with "invalid character!" if the value is 2097152 or greater.
pub fn len_utf8_bytes(c: char) -> uint {
// Exclusive upper bounds on the code value for each encoded length
// (2^7, 2^11, 2^16 and 2^21 respectively).
static max_one_b: uint = 128u;
static max_two_b: uint = 2048u;
static max_three_b: uint = 65536u;
static max_four_b: uint = 2097152u;

// Compare the numeric value of the character against each bound in
// ascending order; the first bound it falls under decides the length.
let code = c as uint;
if code < max_one_b { 1u }
else if code < max_two_b { 2u }
else if code < max_three_b { 3u }
else if code < max_four_b { 4u }
// Nothing at or above the four-byte bound is accepted.
else { fail!(~"invalid character!") }
}

/**
* Compare two chars
*
Expand Down Expand Up @@ -334,7 +349,6 @@ fn test_escape_default() {
assert_eq!(escape_default('\U0001d4b6'), ~"\\U0001d4b6");
}


#[test]
fn test_escape_unicode() {
assert_eq!(escape_unicode('\x00'), ~"\\x00");
Expand Down
3 changes: 3 additions & 0 deletions src/libcore/core.rc
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,9 @@ pub mod vec;
pub mod at_vec;
pub mod str;

#[path = "str/ascii.rs"]
pub mod ascii;

pub mod ptr;
pub mod owned;
pub mod managed;
Expand Down
3 changes: 2 additions & 1 deletion src/libcore/prelude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ pub use path::Path;
pub use path::PosixPath;
pub use path::WindowsPath;
pub use ptr::Ptr;
pub use ascii::{Ascii, AsciiCast, OwnedAsciiCast, AsciiStr};
pub use str::{StrSlice, OwnedStr};
pub use to_bytes::IterBytes;
pub use to_str::ToStr;
pub use to_str::{ToStr, ToStrConsume};
pub use tuple::{CopyableTuple, ImmutableTuple, ExtendedTupleOps};
pub use vec::{CopyableVector, ImmutableVector};
pub use vec::{ImmutableEqVector, ImmutableCopyableVector};
Expand Down
53 changes: 25 additions & 28 deletions src/libcore/str.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2012 The Rust Project Developers. See the COPYRIGHT
// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
Expand Down Expand Up @@ -789,16 +789,18 @@ pub fn each_split_within<'a>(ss: &'a str,

/// Convert a string to lowercase. ASCII only
pub fn to_lower(s: &str) -> ~str {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should just be removed since they're possible already by converting to [Ascii] - libc isn't actually ASCII-only, it's platform-dependent (the ones from glibc are definitely locale-aware).

Functions named to_lower and to_upper in str definitely should have full Unicode support.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, wanted to make sure the PR gets in first before starting to remove those functions, because that will surely touch some more code.

map(s,
|c| unsafe{(libc::tolower(c as libc::c_char)) as char}
)
do map(s) |c| {
assert!(char::is_ascii(c));
(unsafe{libc::tolower(c as libc::c_char)}) as char
}
}

/// Convert a string to uppercase. ASCII only
pub fn to_upper(s: &str) -> ~str {
map(s,
|c| unsafe{(libc::toupper(c as libc::c_char)) as char}
)
do map(s) |c| {
assert!(char::is_ascii(c));
(unsafe{libc::toupper(c as libc::c_char)}) as char
}
}

/**
Expand Down Expand Up @@ -2317,20 +2319,20 @@ pub mod raw {
}

/// Removes the last byte from a string and returns it. (Not UTF-8 safe).
pub fn pop_byte(s: &mut ~str) -> u8 {
pub unsafe fn pop_byte(s: &mut ~str) -> u8 {
let len = len(*s);
assert!((len > 0u));
let b = s[len - 1u];
unsafe { set_len(s, len - 1u) };
set_len(s, len - 1u);
return b;
}

/// Removes the first byte from a string and returns it. (Not UTF-8 safe).
pub fn shift_byte(s: &mut ~str) -> u8 {
pub unsafe fn shift_byte(s: &mut ~str) -> u8 {
let len = len(*s);
assert!((len > 0u));
let b = s[0];
*s = unsafe { raw::slice_bytes_owned(*s, 1u, len) };
*s = raw::slice_bytes_owned(*s, 1u, len);
return b;
}

Expand Down Expand Up @@ -3096,12 +3098,11 @@ mod tests {

#[test]
fn test_to_lower() {
unsafe {
assert!(~"" == map(~"",
|c| libc::tolower(c as c_char) as char));
assert!(~"ymca" == map(~"YMCA",
|c| libc::tolower(c as c_char) as char));
}
// libc::tolower, and hence str::to_lower
// are culturally insensitive: they only work for ASCII
// (see Issue #1347)
assert!(~"" == to_lower(""));
assert!(~"ymca" == to_lower("YMCA"));
}

#[test]
Expand Down Expand Up @@ -3346,15 +3347,15 @@ mod tests {
#[test]
fn test_shift_byte() {
let mut s = ~"ABC";
let b = raw::shift_byte(&mut s);
let b = unsafe{raw::shift_byte(&mut s)};
assert!((s == ~"BC"));
assert!((b == 65u8));
}

#[test]
fn test_pop_byte() {
let mut s = ~"ABC";
let b = raw::pop_byte(&mut s);
let b = unsafe{raw::pop_byte(&mut s)};
assert!((s == ~"AB"));
assert!((b == 67u8));
}
Expand Down Expand Up @@ -3666,12 +3667,8 @@ mod tests {

#[test]
fn test_map() {
unsafe {
assert!(~"" == map(~"", |c|
libc::toupper(c as c_char) as char));
assert!(~"YMCA" == map(~"ymca",
|c| libc::toupper(c as c_char) as char));
}
assert!(~"" == map(~"", |c| unsafe {libc::toupper(c as c_char)} as char));
assert!(~"YMCA" == map(~"ymca", |c| unsafe {libc::toupper(c as c_char)} as char));
}

#[test]
Expand All @@ -3685,11 +3682,11 @@ mod tests {

#[test]
fn test_any() {
assert!(false == any(~"", char::is_uppercase));
assert!(false == any(~"", char::is_uppercase));
assert!(false == any(~"ymca", char::is_uppercase));
assert!(true == any(~"YMCA", char::is_uppercase));
assert!(true == any(~"yMCA", char::is_uppercase));
assert!(true == any(~"Ymcy", char::is_uppercase));
assert!(true == any(~"yMCA", char::is_uppercase));
assert!(true == any(~"Ymcy", char::is_uppercase));
}

#[test]
Expand Down
Loading