Skip to content

Two character encoding fixes #13469

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 13, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 65 additions & 25 deletions src/libserialize/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ use std::io::MemWriter;
use std::io;
use std::num;
use std::str;
use std::str::ScalarValue;
use std::strbuf::StrBuf;

use Encodable;
Expand Down Expand Up @@ -1129,6 +1130,35 @@ impl<T : Iterator<char>> Parser<T> {
Ok(res)
}

/// Reads the four hexadecimal digits of a `\u` escape and returns the
/// value they spell as a `u16`.
///
/// The parser is advanced with `bump()` before each digit is inspected,
/// so on entry it is expected to be positioned just before the first
/// digit.
///
/// Fails with "invalid \u escape (unrecognized hex)" on any character
/// that is not a hex digit, and with "invalid \u escape (not four
/// digits)" when end of input is reached before four digits were read.
fn decode_hex_escape(&mut self) -> DecodeResult<u16> {
    let mut seen = 0u;
    let mut value = 0u16;
    while seen < 4u && !self.eof() {
        self.bump();
        // Numeric value (0-15) of the hex digit under the cursor.
        let digit = match self.ch_or_null() {
            c @ '0' .. '9' => (c as u16) - ('0' as u16),
            c @ 'a' .. 'f' => (c as u16) - ('a' as u16) + 10_u16,
            c @ 'A' .. 'F' => (c as u16) - ('A' as u16) + 10_u16,
            _ => return self.error(
                    ~"invalid \\u escape (unrecognized hex)")
        };

        // Four digits of four bits each fit a u16 exactly, so this
        // cannot overflow.
        value = value * 16_u16 + digit;
        seen += 1u;
    }

    // The loop stops early only when the input ran out.
    if seen != 4u {
        return self.error(
                ~"invalid \\u escape (not four digits)");
    }

    Ok(value)
}

fn parse_str(&mut self) -> DecodeResult<~str> {
let mut escape = false;
let mut res = StrBuf::new();
Expand All @@ -1149,35 +1179,35 @@ impl<T : Iterator<char>> Parser<T> {
'n' => res.push_char('\n'),
'r' => res.push_char('\r'),
't' => res.push_char('\t'),
'u' => {
// Parse \u1234.
let mut i = 0u;
let mut n = 0u;
while i < 4u && !self.eof() {
self.bump();
n = match self.ch_or_null() {
c @ '0' .. '9' => n * 16u + (c as uint) - ('0' as uint),
'a' | 'A' => n * 16u + 10u,
'b' | 'B' => n * 16u + 11u,
'c' | 'C' => n * 16u + 12u,
'd' | 'D' => n * 16u + 13u,
'e' | 'E' => n * 16u + 14u,
'f' | 'F' => n * 16u + 15u,
'u' => match try!(self.decode_hex_escape()) {
0xDC00 .. 0xDFFF => return self.error(
~"lone trailing surrogate in hex escape"),

// Non-BMP characters are encoded as a sequence of
// two hex escapes, representing UTF-16 surrogates.
n1 @ 0xD800 .. 0xDBFF => {
let c1 = self.next_char();
let c2 = self.next_char();
match (c1, c2) {
(Some('\\'), Some('u')) => (),
_ => return self.error(
~"invalid \\u escape (unrecognized hex)")
};

i += 1u;
}
~"unexpected end of non-BMP hex escape"),
}

// Error out if we didn't parse 4 digits.
if i != 4u {
return self.error(
~"invalid \\u escape (not four digits)");
let buf = [n1, try!(self.decode_hex_escape())];
match str::utf16_items(buf.as_slice()).next() {
Some(ScalarValue(c)) => res.push_char(c),
_ => return self.error(
~"lone leading surrogate in hex escape"),
}
}

res.push_char(char::from_u32(n as u32).unwrap());
}
n => match char::from_u32(n as u32) {
Some(c) => res.push_char(c),
None => return self.error(
format!("invalid Unicode codepoint {:u}", n)),
},
},
_ => return self.error(~"invalid escape"),
}
escape = false;
Expand Down Expand Up @@ -2139,6 +2169,16 @@ mod tests {
assert_eq!(from_str(" \"foo\" "), Ok(String(~"foo")));
assert_eq!(from_str("\"\\u12ab\""), Ok(String(~"\u12ab")));
assert_eq!(from_str("\"\\uAB12\""), Ok(String(~"\uAB12")));

// Non-BMP escapes. The exact error messages and positions are kind of
// arbitrary.
assert_eq!(from_str("\"\\ud83d\\udca9\""), Ok(String(~"\U0001F4A9")));
assert!(from_str("\"\\ud83d\"").is_err());
assert!(from_str("\"\\udca9\"").is_err());
assert!(from_str("\"\\ud83d\\ud83d\"").is_err());
assert!(from_str("\"\\ud83dx\"").is_err());
assert!(from_str("\"\\udca9\\udca9\"").is_err());
assert!(from_str("\"\\udca9x\"").is_err());
}

#[test]
Expand Down
61 changes: 58 additions & 3 deletions src/libstd/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ use unicode::{derived_property, property, general_category, decompose, conversio

#[cfg(test)] use str::Str;
#[cfg(test)] use strbuf::StrBuf;
#[cfg(test)] use slice::ImmutableVector;

#[cfg(not(test))] use cmp::{Eq, Ord};
#[cfg(not(test))] use default::Default;
Expand Down Expand Up @@ -560,11 +561,19 @@ pub trait Char {

/// Encodes this character as UTF-8 into the provided byte buffer.
///
/// The buffer must be at least 4 bytes long or a runtime failure will
/// The buffer must be at least 4 bytes long or a runtime failure may
/// occur.
///
/// This will then return the number of characters written to the slice.
/// This will then return the number of bytes written to the slice.
fn encode_utf8(&self, dst: &mut [u8]) -> uint;

/// Encodes this character as UTF-16 into the provided `u16` buffer.
///
/// A character whose code point fits in 16 bits is written as a single
/// code unit in `dst[0]`. Any other character is written as a surrogate
/// pair: the leading surrogate in `dst[0]` and the trailing surrogate in
/// `dst[1]`.
///
/// The buffer must be at least 2 elements long or a runtime failure may
/// occur.
///
/// This will then return the number of `u16`s written to the slice
/// (1 or 2).
fn encode_utf16(&self, dst: &mut [u16]) -> uint;
}

impl Char for char {
Expand Down Expand Up @@ -602,7 +611,7 @@ impl Char for char {

fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }

fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
fn encode_utf8(&self, dst: &mut [u8]) -> uint {
let code = *self as uint;
if code < MAX_ONE_B {
dst[0] = code as u8;
Expand All @@ -624,6 +633,24 @@ impl Char for char {
return 4;
}
}

/// Writes the UTF-16 encoding of this character into `dst` and returns
/// the number of `u16` code units written: 1 for a character that fits
/// in 16 bits, 2 (a surrogate pair) for anything above `0xFFFF`.
///
/// `dst` is indexed directly, so it needs room for every unit written.
/// The assertions fail if the value is a surrogate code point or lies
/// above `0x10FFFF`.
fn encode_utf16(&self, dst: &mut [u16]) -> uint {
    let code = *self as uint;

    if code > 0xFFFF_u {
        // Supplementary planes: subtract 0x10000 and split the result
        // into a leading surrogate (upper bits) and a trailing
        // surrogate (low ten bits).
        assert!(code >= 0x1_0000_u && code <= 0x10_FFFF_u);
        let offset = code - 0x1_0000_u;
        dst[0] = 0xD800_u16 | ((offset >> 10) as u16);
        dst[1] = 0xDC00_u16 | ((offset as u16) & 0x3FF_u16);
        return 2;
    }

    // BMP characters are stored as-is; a `char` should never hold a
    // surrogate value, which the assertion double-checks.
    assert!(code <= 0xD7FF_u || code >= 0xE000_u);
    dst[0] = code as u16;
    1
}
}

#[cfg(not(test))]
Expand Down Expand Up @@ -788,3 +815,31 @@ fn test_to_str() {
let s = 't'.to_str();
assert_eq!(s, ~"t");
}

// Exercises `encode_utf8` with one character for each encoded length,
// from one byte up to four.
#[test]
fn test_encode_utf8() {
// Encodes `input` into a scratch buffer and compares the bytes that were
// written (the first `n`) against `expect`.
fn check(input: char, expect: &[u8]) {
// Four bytes is the minimum buffer size `encode_utf8` documents.
let mut buf = [0u8, ..4];
let n = input.encode_utf8(buf /* as mut slice! */);
assert_eq!(buf.slice_to(n), expect);
}

// Expected encodings are 1, 2, 3 and 4 bytes long respectively.
check('x', [0x78]);
check('\u00e9', [0xc3, 0xa9]);
check('\ua66e', [0xea, 0x99, 0xae]);
check('\U0001f4a9', [0xf0, 0x9f, 0x92, 0xa9]);
}

// Exercises `encode_utf16` with characters that encode to a single code
// unit and with one that needs a surrogate pair.
#[test]
fn test_encode_utf16() {
// Encodes `input` into a scratch buffer and compares the code units that
// were written (the first `n`) against `expect`.
fn check(input: char, expect: &[u16]) {
// Two elements is the minimum buffer size `encode_utf16` documents.
let mut buf = [0u16, ..2];
let n = input.encode_utf16(buf /* as mut slice! */);
assert_eq!(buf.slice_to(n), expect);
}

// The first three inputs expect one code unit each; the last expects
// two (0xd83d followed by 0xdca9).
check('x', [0x0078]);
check('\u00e9', [0x00e9]);
check('\ua66e', [0xa66e]);
check('\U0001f4a9', [0xd83d, 0xdca9]);
}
19 changes: 3 additions & 16 deletions src/libstd/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2555,22 +2555,9 @@ impl<'a> StrSlice<'a> for &'a str {
/// Converts this string to UTF-16, returning the code units in a newly
/// allocated vector.
///
/// Every character is encoded with `encode_utf16`, so the surrogate-pair
/// arithmetic lives in one place instead of being repeated here.
fn to_utf16(&self) -> ~[u16] {
    let mut u = ~[];
    for ch in self.chars() {
        // `encode_utf16` writes at most two code units per character.
        let mut buf = [0u16, ..2];
        let n = ch.encode_utf16(buf /* as mut slice! */);
        // Append only the units that were actually written.
        u.push_all(buf.slice_to(n));
    }
    u
}
Expand Down