@@ -239,6 +239,7 @@ use std::io::MemWriter;
239
239
use std:: io;
240
240
use std:: num;
241
241
use std:: str;
242
+ use std:: str:: ScalarValue ;
242
243
use std:: strbuf:: StrBuf ;
243
244
244
245
use Encodable ;
@@ -1129,6 +1130,35 @@ impl<T : Iterator<char>> Parser<T> {
1129
1130
Ok ( res)
1130
1131
}
1131
1132
1133
+ fn decode_hex_escape ( & mut self ) -> DecodeResult < u16 > {
1134
+ let mut i = 0 u;
1135
+ let mut n = 0u16 ;
1136
+ while i < 4 u && !self . eof ( ) {
1137
+ self . bump ( ) ;
1138
+ n = match self . ch_or_null ( ) {
1139
+ c @ '0' .. '9' => n * 16_u16 + ( ( c as u16 ) - ( '0' as u16 ) ) ,
1140
+ 'a' | 'A' => n * 16_u16 + 10_u16 ,
1141
+ 'b' | 'B' => n * 16_u16 + 11_u16 ,
1142
+ 'c' | 'C' => n * 16_u16 + 12_u16 ,
1143
+ 'd' | 'D' => n * 16_u16 + 13_u16 ,
1144
+ 'e' | 'E' => n * 16_u16 + 14_u16 ,
1145
+ 'f' | 'F' => n * 16_u16 + 15_u16 ,
1146
+ _ => return self . error (
1147
+ ~"invalid \\u escape ( unrecognized hex) ")
1148
+ } ;
1149
+
1150
+ i += 1 u;
1151
+ }
1152
+
1153
+ // Error out if we didn't parse 4 digits.
1154
+ if i != 4 u {
1155
+ return self . error (
1156
+ ~"invalid \\u escape ( not four digits) ") ;
1157
+ }
1158
+
1159
+ Ok ( n)
1160
+ }
1161
+
1132
1162
fn parse_str( & mut self ) -> DecodeResult < ~str > {
1133
1163
let mut escape = false ;
1134
1164
let mut res = StrBuf :: new ( ) ;
@@ -1149,35 +1179,35 @@ impl<T : Iterator<char>> Parser<T> {
1149
1179
'n' => res. push_char ( '\n' ) ,
1150
1180
'r' => res. push_char ( '\r' ) ,
1151
1181
't' => res. push_char ( '\t' ) ,
1152
- 'u' => {
1153
- // Parse \u1234.
1154
- let mut i = 0 u;
1155
- let mut n = 0 u;
1156
- while i < 4 u && !self . eof ( ) {
1157
- self . bump ( ) ;
1158
- n = match self . ch_or_null ( ) {
1159
- c @ '0' .. '9' => n * 16 u + ( c as uint ) - ( '0' as uint ) ,
1160
- 'a' | 'A' => n * 16 u + 10 u,
1161
- 'b' | 'B' => n * 16 u + 11 u,
1162
- 'c' | 'C' => n * 16 u + 12 u,
1163
- 'd' | 'D' => n * 16 u + 13 u,
1164
- 'e' | 'E' => n * 16 u + 14 u,
1165
- 'f' | 'F' => n * 16 u + 15 u,
1182
+ 'u' => match try!( self . decode_hex_escape ( ) ) {
1183
+ 0xDC00 .. 0xDFFF => return self . error (
1184
+ ~"lone trailing surrogate in hex escape") ,
1185
+
1186
+ // Non-BMP characters are encoded as a sequence of
1187
+ // two hex escapes, representing UTF-16 surrogates.
1188
+ n1 @ 0xD800 .. 0xDBFF => {
1189
+ let c1 = self . next_char ( ) ;
1190
+ let c2 = self . next_char ( ) ;
1191
+ match ( c1, c2) {
1192
+ ( Some ( '\\' ) , Some ( 'u' ) ) => ( ) ,
1166
1193
_ => return self . error (
1167
- ~"invalid \\u escape ( unrecognized hex) ")
1168
- } ;
1169
-
1170
- i += 1 u;
1171
- }
1194
+ ~"unexpected end of non-BMP hex escape") ,
1195
+ }
1172
1196
1173
- // Error out if we didn't parse 4 digits.
1174
- if i != 4 u {
1175
- return self . error (
1176
- ~"invalid \\u escape ( not four digits) ") ;
1197
+ let buf = [ n1, try!( self . decode_hex_escape ( ) ) ] ;
1198
+ match str:: utf16_items ( buf. as_slice ( ) ) . next ( ) {
1199
+ Some ( ScalarValue ( c) ) => res. push_char ( c) ,
1200
+ _ => return self . error (
1201
+ ~"lone leading surrogate in hex escape") ,
1202
+ }
1177
1203
}
1178
1204
1179
- res. push_char ( char:: from_u32 ( n as u32 ) . unwrap ( ) ) ;
1180
- }
1205
+ n => match char:: from_u32 ( n as u32 ) {
1206
+ Some ( c) => res. push_char ( c) ,
1207
+ None => return self . error (
1208
+ format ! ( "invalid Unicode codepoint {:u}" , n) ) ,
1209
+ } ,
1210
+ } ,
1181
1211
_ => return self . error ( ~"invalid escape") ,
1182
1212
}
1183
1213
escape = false ;
@@ -2139,6 +2169,16 @@ mod tests {
2139
2169
assert_eq!( from_str( " \" foo\" " ) , Ok ( String ( ~"foo")));
2140
2170
assert_eq!(from_str("\" \\ u12ab\" " ) , Ok ( String ( ~"\u12ab ") ) ) ;
2141
2171
assert_eq!( from_str( "\" \\ uAB12\" " ) , Ok ( String ( ~"\uAB12 ") ) ) ;
2172
+
2173
+ // Non-BMP escapes. The exact error messages and positions are kind of
2174
+ // arbitrary.
2175
+ assert_eq!( from_str( "\" \\ ud83d\\ udca9\" " ) , Ok ( String ( ~"\U 0001 F4A9 ")));
2176
+ assert!(from_str("\" \\ ud83d\" " ) . is_err( ) ) ;
2177
+ assert!( from_str( "\" \\ udca9\" " ) . is_err( ) ) ;
2178
+ assert!( from_str( "\" \\ ud83d\\ ud83d\" " ) . is_err( ) ) ;
2179
+ assert!( from_str( "\" \\ ud83dx\" " ) . is_err( ) ) ;
2180
+ assert!( from_str( "\" \\ udca9\\ udca9\" " ) . is_err( ) ) ;
2181
+ assert!( from_str( "\" \\ udca9x\" " ) . is_err( ) ) ;
2142
2182
}
2143
2183
2144
2184
#[ test]
0 commit comments