diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 6ca33540ceef6..c07a31490c346 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -1,4 +1,4 @@ -// Copyright 2012 The Rust Project Developers. See the COPYRIGHT +// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -234,6 +234,21 @@ pub fn escape_default(c: char) -> ~str { } } +/// Returns the number of bytes (1 to 4) this character would need if encoded in UTF-8; fails with "invalid character!" for code points of 2097152 or above +pub fn len_utf8_bytes(c: char) -> uint { + static max_one_b: uint = 128u; + static max_two_b: uint = 2048u; + static max_three_b: uint = 65536u; + static max_four_b: uint = 2097152u; + + let code = c as uint; + if code < max_one_b { 1u } + else if code < max_two_b { 2u } + else if code < max_three_b { 3u } + else if code < max_four_b { 4u } + else { fail!(~"invalid character!") } +} + /** * Compare two chars * @@ -334,7 +349,6 @@ fn test_escape_default() { assert_eq!(escape_default('\U0001d4b6'), ~"\\U0001d4b6"); } - #[test] fn test_escape_unicode() { assert_eq!(escape_unicode('\x00'), ~"\\x00"); diff --git a/src/libcore/core.rc b/src/libcore/core.rc index 81190ea8fc62e..6d38d72e3f649 100644 --- a/src/libcore/core.rc +++ b/src/libcore/core.rc @@ -159,6 +159,9 @@ pub mod vec; pub mod at_vec; pub mod str; +#[path = "str/ascii.rs"] +pub mod ascii; + pub mod ptr; pub mod owned; pub mod managed; diff --git a/src/libcore/prelude.rs b/src/libcore/prelude.rs index 822fb2e476beb..fb4f9188b3bd0 100644 --- a/src/libcore/prelude.rs +++ b/src/libcore/prelude.rs @@ -40,9 +40,10 @@ pub use path::Path; pub use path::PosixPath; pub use path::WindowsPath; pub use ptr::Ptr; +pub use ascii::{Ascii, AsciiCast, OwnedAsciiCast, AsciiStr}; pub use str::{StrSlice, OwnedStr}; pub use to_bytes::IterBytes; -pub use to_str::ToStr; +pub use to_str::{ToStr, ToStrConsume}; pub use tuple::{CopyableTuple, ImmutableTuple, ExtendedTupleOps}; pub use 
vec::{CopyableVector, ImmutableVector}; pub use vec::{ImmutableEqVector, ImmutableCopyableVector}; diff --git a/src/libcore/str.rs b/src/libcore/str.rs index d72b4a71e2a6e..dc97af22c470f 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -1,4 +1,4 @@ -// Copyright 2012 The Rust Project Developers. See the COPYRIGHT +// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -789,16 +789,18 @@ pub fn each_split_within<'a>(ss: &'a str, /// Convert a string to lowercase. ASCII only pub fn to_lower(s: &str) -> ~str { - map(s, - |c| unsafe{(libc::tolower(c as libc::c_char)) as char} - ) + do map(s) |c| { + assert!(char::is_ascii(c)); /* fail on non-ASCII input instead of passing it to libc::tolower */ + (unsafe{libc::tolower(c as libc::c_char)}) as char + } } /// Convert a string to uppercase. ASCII only pub fn to_upper(s: &str) -> ~str { - map(s, - |c| unsafe{(libc::toupper(c as libc::c_char)) as char} - ) + do map(s) |c| { + assert!(char::is_ascii(c)); /* fail on non-ASCII input instead of passing it to libc::toupper */ + (unsafe{libc::toupper(c as libc::c_char)}) as char + } } /** @@ -2317,20 +2319,20 @@ pub mod raw { } /// Removes the last byte from a string and returns it. (Not UTF-8 safe). - pub fn pop_byte(s: &mut ~str) -> u8 { + pub unsafe fn pop_byte(s: &mut ~str) -> u8 { let len = len(*s); assert!((len > 0u)); let b = s[len - 1u]; - unsafe { set_len(s, len - 1u) }; + set_len(s, len - 1u); /* shortens by exactly one byte, whatever character that byte belongs to */ return b; } /// Removes the first byte from a string and returns it. (Not UTF-8 safe). 
- pub fn shift_byte(s: &mut ~str) -> u8 { + pub unsafe fn shift_byte(s: &mut ~str) -> u8 { let len = len(*s); assert!((len > 0u)); let b = s[0]; - *s = unsafe { raw::slice_bytes_owned(*s, 1u, len) }; + *s = raw::slice_bytes_owned(*s, 1u, len); /* keeps bytes 1..len, whatever character byte 0 belonged to */ return b; } @@ -3096,12 +3098,11 @@ mod tests { #[test] fn test_to_lower() { - unsafe { - assert!(~"" == map(~"", - |c| libc::tolower(c as c_char) as char)); - assert!(~"ymca" == map(~"YMCA", - |c| libc::tolower(c as c_char) as char)); - } + // libc::tolower, and hence str::to_lower + // are culturally insensitive: they only work for ASCII + // (see Issue #1347) + assert!(~"" == to_lower("")); + assert!(~"ymca" == to_lower("YMCA")); } #[test] @@ -3346,7 +3347,7 @@ mod tests { #[test] fn test_shift_byte() { let mut s = ~"ABC"; - let b = raw::shift_byte(&mut s); + let b = unsafe{raw::shift_byte(&mut s)}; assert!((s == ~"BC")); assert!((b == 65u8)); } @@ -3354,7 +3355,7 @@ mod tests { #[test] fn test_pop_byte() { let mut s = ~"ABC"; - let b = raw::pop_byte(&mut s); + let b = unsafe{raw::pop_byte(&mut s)}; assert!((s == ~"AB")); assert!((b == 67u8)); } @@ -3666,12 +3667,8 @@ mod tests { #[test] fn test_map() { - unsafe { - assert!(~"" == map(~"", |c| - libc::toupper(c as c_char) as char)); - assert!(~"YMCA" == map(~"ymca", - |c| libc::toupper(c as c_char) as char)); - } + assert!(~"" == map(~"", |c| unsafe {libc::toupper(c as c_char)} as char)); + assert!(~"YMCA" == map(~"ymca", |c| unsafe {libc::toupper(c as c_char)} as char)); } #[test] @@ -3685,11 +3682,11 @@ mod tests { #[test] fn test_any() { - assert!(false == any(~"", char::is_uppercase)); + assert!(false == any(~"", char::is_uppercase)); assert!(false == any(~"ymca", char::is_uppercase)); assert!(true == any(~"YMCA", char::is_uppercase)); - assert!(true == any(~"yMCA", char::is_uppercase)); - assert!(true == any(~"Ymcy", char::is_uppercase)); + assert!(true == any(~"yMCA", char::is_uppercase)); + assert!(true == any(~"Ymcy", char::is_uppercase)); } #[test] diff --git 
a/src/libcore/str/ascii.rs b/src/libcore/str/ascii.rs new file mode 100644 index 0000000000000..339274ab47e4f --- /dev/null +++ b/src/libcore/str/ascii.rs @@ -0,0 +1,268 @@ +// Copyright 2013 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use to_str::{ToStr,ToStrConsume}; +use str; +use cast; + +/// Datatype to hold one ascii character. It is 8 bits long. +#[deriving(Clone, Eq)] +pub struct Ascii { priv chr: u8 } + +pub impl Ascii { + /// Converts an ascii character into a `u8`. + #[inline(always)] + fn to_byte(self) -> u8 { + self.chr + } + + /// Converts an ascii character into a `char`. + #[inline(always)] + fn to_char(self) -> char { + self.chr as char + } + + /// Convert to lowercase. Values outside 65..90 ('A'..'Z') are returned unchanged. + #[inline(always)] + fn to_lower(self) -> Ascii { + if self.chr >= 65 && self.chr <= 90 { + Ascii{chr: self.chr | 0x20 } + } else { + self + } + } + + /// Convert to uppercase. Values outside 97..122 ('a'..'z') are returned unchanged. + #[inline(always)] + fn to_upper(self) -> Ascii { + if self.chr >= 97 && self.chr <= 122 { + Ascii{chr: self.chr & !0x20 } + } else { + self + } + } + + /// Compares two ascii characters for equality, ignoring case. + #[inline(always)] + fn eq_ignore_case(self, other: Ascii) -> bool { + self.to_lower().chr == other.to_lower().chr + } +} + +impl ToStr for Ascii { + #[inline(always)] + fn to_str(&self) -> ~str { str::from_bytes(['\'' as u8, self.chr, '\'' as u8]) } +} + +/// Trait for converting into an ascii type. 
+pub trait AsciiCast<T> { + /// Convert to an ascii type. The implementations in this file fail (assert) when `is_ascii()` is false. + fn to_ascii(&self) -> T; + + /// Check if convertible to ascii + fn is_ascii(&self) -> bool; +} + +impl<'self> AsciiCast<&'self[Ascii]> for &'self [u8] { + #[inline(always)] + fn to_ascii(&self) -> &'self[Ascii] { + assert!(self.is_ascii()); + unsafe{ cast::transmute(*self) } + } + + #[inline(always)] + fn is_ascii(&self) -> bool { + for self.each |b| { + if !b.is_ascii() { return false; } + } + true + } +} + +impl<'self> AsciiCast<&'self[Ascii]> for &'self str { + #[inline(always)] + fn to_ascii(&self) -> &'self[Ascii] { + assert!(self.is_ascii()); + let (p,len): (*u8, uint) = unsafe{ cast::transmute(*self) }; + unsafe{ cast::transmute((p, len - 1))} /* len - 1 leaves out the last byte of the str representation (presumably its trailing zero - confirm) */ + } + + #[inline(always)] + fn is_ascii(&self) -> bool { + for self.each |b| { + if !b.is_ascii() { return false; } + } + true + } +} + +impl AsciiCast<Ascii> for u8 { + #[inline(always)] + fn to_ascii(&self) -> Ascii { + assert!(self.is_ascii()); + Ascii{ chr: *self } + } + + #[inline(always)] + fn is_ascii(&self) -> bool { + *self & 128 == 0u8 + } +} + +impl AsciiCast<Ascii> for char { + #[inline(always)] + fn to_ascii(&self) -> Ascii { + assert!(self.is_ascii()); + Ascii{ chr: *self as u8 } + } + + #[inline(always)] + fn is_ascii(&self) -> bool { + *self - ('\x7F' & *self) == '\x00' + } +} + +/// Trait for copyless casting to an ascii vector. +pub trait OwnedAsciiCast { + /// Take ownership and cast to an ascii vector without trailing zero element. + fn to_ascii_consume(self) -> ~[Ascii]; +} + +impl OwnedAsciiCast for ~[u8] { + #[inline(always)] + fn to_ascii_consume(self) -> ~[Ascii] { + assert!(self.is_ascii()); + unsafe {cast::transmute(self)} + } +} + +impl OwnedAsciiCast for ~str { + #[inline(always)] + fn to_ascii_consume(self) -> ~[Ascii] { + assert!(self.is_ascii()); + let mut s = self; + unsafe { + str::raw::pop_byte(&mut s); /* drop the last byte so the resulting vector has no trailing zero element */ + cast::transmute(s) + } + } +} + +/// Trait for converting an ascii type to a string. 
Needed to convert `&[Ascii]` to `~str` +pub trait AsciiStr { + /// Convert to a string. + fn to_str_ascii(&self) -> ~str; + + /// Convert to vector representing a lower cased ascii string. + fn to_lower(&self) -> ~[Ascii]; + + /// Convert to vector representing an upper cased ascii string. + fn to_upper(&self) -> ~[Ascii]; + +} + +impl<'self> AsciiStr for &'self [Ascii] { + #[inline(always)] + fn to_str_ascii(&self) -> ~str { + let mut cpy = self.to_owned(); + cpy.push(0u8.to_ascii()); /* append a zero element before reinterpreting the vector as a string */ + unsafe {cast::transmute(cpy)} + } + + #[inline(always)] + fn to_lower(&self) -> ~[Ascii] { + self.map(|a| a.to_lower()) + } + + #[inline(always)] + fn to_upper(&self) -> ~[Ascii] { + self.map(|a| a.to_upper()) + } +} + +impl ToStrConsume for ~[Ascii] { + #[inline(always)] + fn to_str_consume(self) -> ~str { + let mut cpy = self; + cpy.push(0u8.to_ascii()); /* append a zero element before reinterpreting the vector as a string */ + unsafe {cast::transmute(cpy)} + } +} + +#[cfg(test)] mod tests { + use super::*; + + macro_rules! v2ascii ( + ( [$($e:expr),*]) => ( [$(Ascii{chr:$e}),*]); + (~[$($e:expr),*]) => (~[$(Ascii{chr:$e}),*]); + ) + + #[test] + fn test_ascii() { + assert_eq!(65u8.to_ascii().to_byte(), 65u8); + assert_eq!(65u8.to_ascii().to_char(), 'A'); + assert_eq!('A'.to_ascii().to_char(), 'A'); + assert_eq!('A'.to_ascii().to_byte(), 65u8); + + assert_eq!('A'.to_ascii().to_lower().to_char(), 'a'); + assert_eq!('Z'.to_ascii().to_lower().to_char(), 'z'); + assert_eq!('a'.to_ascii().to_upper().to_char(), 'A'); + assert_eq!('z'.to_ascii().to_upper().to_char(), 'Z'); + + assert_eq!('@'.to_ascii().to_lower().to_char(), '@'); + assert_eq!('['.to_ascii().to_lower().to_char(), '['); + assert_eq!('`'.to_ascii().to_upper().to_char(), '`'); + assert_eq!('{'.to_ascii().to_upper().to_char(), '{'); + } + + #[test] + fn test_ascii_vec() { + assert_eq!((&[40u8, 32u8, 59u8]).to_ascii(), v2ascii!([40, 32, 59])); + assert_eq!("( ;".to_ascii(), v2ascii!([40, 32, 59])); + // FIXME: #5475 borrowchk error, owned vectors do not live long enough + // if chained-from directly + let v = 
~[40u8, 32u8, 59u8]; assert_eq!(v.to_ascii(), v2ascii!([40, 32, 59])); + let v = ~"( ;"; assert_eq!(v.to_ascii(), v2ascii!([40, 32, 59])); + + assert_eq!("abCDef&?#".to_ascii().to_lower().to_str_ascii(), ~"abcdef&?#"); + assert_eq!("abCDef&?#".to_ascii().to_upper().to_str_ascii(), ~"ABCDEF&?#"); + } + + #[test] + fn test_owned_ascii_vec() { + // FIXME: #4318 Compiler crashes on moving self + //assert_eq!(~"( ;".to_ascii_consume(), v2ascii!(~[40, 32, 59])); + //assert_eq!(~[40u8, 32u8, 59u8].to_ascii_consume(), v2ascii!(~[40, 32, 59])); + //assert_eq!(~"( ;".to_ascii_consume_with_null(), v2ascii!(~[40, 32, 59, 0])); + //assert_eq!(~[40u8, 32u8, 59u8].to_ascii_consume_with_null(), + // v2ascii!(~[40, 32, 59, 0])); + } + + #[test] + fn test_ascii_to_str() { assert_eq!(v2ascii!([40, 32, 59]).to_str_ascii(), ~"( ;"); } + + #[test] + fn test_ascii_to_str_consume() { + // FIXME: #4318 Compiler crashes on moving self + //assert_eq!(v2ascii!(~[40, 32, 59]).to_str_consume(), ~"( ;"); + } + + #[test] #[should_fail] + fn test_ascii_vec_fail_u8_slice() { (&[127u8, 128u8, 255u8]).to_ascii(); } + + #[test] #[should_fail] + fn test_ascii_vec_fail_str_slice() { "zoä华".to_ascii(); } + + #[test] #[should_fail] + fn test_ascii_fail_u8_slice() { 255u8.to_ascii(); } + + #[test] #[should_fail] + fn test_ascii_fail_char_slice() { 'λ'.to_ascii(); } +} diff --git a/src/libcore/to_str.rs b/src/libcore/to_str.rs index 980d4b445d04d..7f8e6915add16 100644 --- a/src/libcore/to_str.rs +++ b/src/libcore/to_str.rs @@ -20,6 +20,12 @@ pub trait ToStr { fn to_str(&self) -> ~str; } +/// Trait for converting a type to a string, consuming it in the process. +pub trait ToStrConsume { + /// Consume and convert to a string. + fn to_str_consume(self) -> ~str; +} + impl ToStr for bool { #[inline(always)] fn to_str(&self) -> ~str { ::bool::to_str(*self) }