Skip to content

Commit cee9a83

Browse files
author
Keegan McAllister
committed
Decode non-BMP hex escapes in JSON
Fixes #13064.
1 parent 58fc85d commit cee9a83

File tree

1 file changed

+65
-25
lines changed

1 file changed

+65
-25
lines changed

src/libserialize/json.rs

Lines changed: 65 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -239,6 +239,7 @@ use std::io::MemWriter;
239239
use std::io;
240240
use std::num;
241241
use std::str;
242+
use std::str::ScalarValue;
242243
use std::strbuf::StrBuf;
243244

244245
use Encodable;
@@ -1129,6 +1130,35 @@ impl<T : Iterator<char>> Parser<T> {
11291130
Ok(res)
11301131
}
11311132

1133+
/// Reads up to four hexadecimal digits from the input and returns their
/// combined value as a `u16` (the payload of a JSON `\uXXXX` escape).
///
/// Returns an error if a character that is not a hex digit is encountered,
/// or if the input ends before four digits have been read.
///
/// NOTE(review): the loop calls `bump()` before inspecting each character,
/// so this presumably expects the parser to be positioned on the `u` of the
/// escape when called — confirm against the callers in `parse_str`.
fn decode_hex_escape(&mut self) -> DecodeResult<u16> {
1134+
// `i` counts the digits consumed so far; `n` accumulates their value.
let mut i = 0u;
1135+
let mut n = 0u16;
1136+
while i < 4u && !self.eof() {
1137+
// Step forward first, then look at the character now under the cursor.
self.bump();
1138+
// Shift the accumulated value by one hex digit (x16) and add the new digit.
n = match self.ch_or_null() {
1139+
c @ '0' .. '9' => n * 16_u16 + ((c as u16) - ('0' as u16)),
1140+
// Letter digits are accepted in either case, one arm per value 10 through 15.
'a' | 'A' => n * 16_u16 + 10_u16,
1141+
'b' | 'B' => n * 16_u16 + 11_u16,
1142+
'c' | 'C' => n * 16_u16 + 12_u16,
1143+
'd' | 'D' => n * 16_u16 + 13_u16,
1144+
'e' | 'E' => n * 16_u16 + 14_u16,
1145+
'f' | 'F' => n * 16_u16 + 15_u16,
1146+
// Anything else is not a hex digit: abort the whole escape.
_ => return self.error(
1147+
~"invalid \\u escape (unrecognized hex)")
1148+
};
1149+
1150+
i += 1u;
1151+
}
1152+
1153+
// Error out if we didn't parse 4 digits.
// (The loop can only stop early here when `eof()` became true.)
1154+
if i != 4u {
1155+
return self.error(
1156+
~"invalid \\u escape (not four digits)");
1157+
}
1158+
1159+
Ok(n)
1160+
}
1161+
11321162
fn parse_str(&mut self) -> DecodeResult<~str> {
11331163
let mut escape = false;
11341164
let mut res = StrBuf::new();
@@ -1149,35 +1179,35 @@ impl<T : Iterator<char>> Parser<T> {
11491179
'n' => res.push_char('\n'),
11501180
'r' => res.push_char('\r'),
11511181
't' => res.push_char('\t'),
1152-
'u' => {
1153-
// Parse \u1234.
1154-
let mut i = 0u;
1155-
let mut n = 0u;
1156-
while i < 4u && !self.eof() {
1157-
self.bump();
1158-
n = match self.ch_or_null() {
1159-
c @ '0' .. '9' => n * 16u + (c as uint) - ('0' as uint),
1160-
'a' | 'A' => n * 16u + 10u,
1161-
'b' | 'B' => n * 16u + 11u,
1162-
'c' | 'C' => n * 16u + 12u,
1163-
'd' | 'D' => n * 16u + 13u,
1164-
'e' | 'E' => n * 16u + 14u,
1165-
'f' | 'F' => n * 16u + 15u,
1182+
'u' => match try!(self.decode_hex_escape()) {
1183+
0xDC00 .. 0xDFFF => return self.error(
1184+
~"lone trailing surrogate in hex escape"),
1185+
1186+
// Non-BMP characters are encoded as a sequence of
1187+
// two hex escapes, representing UTF-16 surrogates.
1188+
n1 @ 0xD800 .. 0xDBFF => {
1189+
let c1 = self.next_char();
1190+
let c2 = self.next_char();
1191+
match (c1, c2) {
1192+
(Some('\\'), Some('u')) => (),
11661193
_ => return self.error(
1167-
~"invalid \\u escape (unrecognized hex)")
1168-
};
1169-
1170-
i += 1u;
1171-
}
1194+
~"unexpected end of non-BMP hex escape"),
1195+
}
11721196

1173-
// Error out if we didn't parse 4 digits.
1174-
if i != 4u {
1175-
return self.error(
1176-
~"invalid \\u escape (not four digits)");
1197+
let buf = [n1, try!(self.decode_hex_escape())];
1198+
match str::utf16_items(buf.as_slice()).next() {
1199+
Some(ScalarValue(c)) => res.push_char(c),
1200+
_ => return self.error(
1201+
~"lone leading surrogate in hex escape"),
1202+
}
11771203
}
11781204

1179-
res.push_char(char::from_u32(n as u32).unwrap());
1180-
}
1205+
n => match char::from_u32(n as u32) {
1206+
Some(c) => res.push_char(c),
1207+
None => return self.error(
1208+
format!("invalid Unicode codepoint {:u}", n)),
1209+
},
1210+
},
11811211
_ => return self.error(~"invalid escape"),
11821212
}
11831213
escape = false;
@@ -2139,6 +2169,16 @@ mod tests {
21392169
assert_eq!(from_str(" \"foo\" "), Ok(String(~"foo")));
21402170
assert_eq!(from_str("\"\\u12ab\""), Ok(String(~"\u12ab")));
21412171
assert_eq!(from_str("\"\\uAB12\""), Ok(String(~"\uAB12")));
2172+
2173+
// Non-BMP escapes. The exact error messages and positions are kind of
2174+
// arbitrary.
2175+
assert_eq!(from_str("\"\\ud83d\\udca9\""), Ok(String(~"\U0001F4A9")));
2176+
assert!(from_str("\"\\ud83d\"").is_err());
2177+
assert!(from_str("\"\\udca9\"").is_err());
2178+
assert!(from_str("\"\\ud83d\\ud83d\"").is_err());
2179+
assert!(from_str("\"\\ud83dx\"").is_err());
2180+
assert!(from_str("\"\\udca9\\udca9\"").is_err());
2181+
assert!(from_str("\"\\udca9x\"").is_err());
21422182
}
21432183

21442184
#[test]

0 commit comments

Comments
 (0)