Merge pull request #830 from lucacasonato/support_lone_surrogates_in_raw_value

dtolnay · web-flow · commit 7e56a406e5e3 · 2022-02-11T20:43:06.000-08:00
Allow lone surrogates in raw values
diff --git a/src/read.rs b/src/read.rs
@@ -954,34 +954,15 @@ where
 
     match ch {
         b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {}
-        b'u' => match tri!(read.decode_hex_escape()) {
-            0xDC00..=0xDFFF => {
-                return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
-            }
-
-            // Non-BMP characters are encoded as a sequence of
-            // two hex escapes, representing UTF-16 surrogates.
-            n1 @ 0xD800..=0xDBFF => {
-                if tri!(next_or_eof(read)) != b'\\' {
-                    return error(read, ErrorCode::UnexpectedEndOfHexEscape);
-                }
-                if tri!(next_or_eof(read)) != b'u' {
-                    return error(read, ErrorCode::UnexpectedEndOfHexEscape);
-                }
-
-                let n2 = tri!(read.decode_hex_escape());
-                if n2 < 0xDC00 || n2 > 0xDFFF {
-                    return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
-                }
-
-                let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
-                if char::from_u32(n).is_none() {
-                    return error(read, ErrorCode::InvalidUnicodeCodePoint);
-                }
-            }
+        b'u' => {
+            // At this point we don't care whether the code point is valid. We
+            // just want to consume it. We can't actually know what is valid
+            // yet, because that depends on whether this string will
+            // ultimately be parsed into a string or a byte buffer in the
+            // "real" parse.
 
-            _ => {}
-        },
+            tri!(read.decode_hex_escape());
+        }
         _ => {
             return error(read, ErrorCode::InvalidEscape);
         }
diff --git a/tests/test.rs b/tests/test.rs
@@ -1740,6 +1740,20 @@ fn test_byte_buf_de_lone_surrogate() {
     assert!(res.is_err());
 }
 
+#[cfg(feature = "raw_value")]
+#[test]
+fn test_raw_de_lone_surrogate() {
+    use serde_json::value::RawValue;
+
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c""#).is_ok()); // lone leading surrogate, then end of string
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\n""#).is_ok()); // leading surrogate followed by a non-`\u` escape
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c ""#).is_ok()); // leading surrogate followed by a plain character
+    assert!(from_str::<Box<RawValue>>(r#""\udc01 ""#).is_ok()); // trailing surrogate with no leading one
+    assert!(from_str::<Box<RawValue>>(r#""\udc01\!""#).is_err()); // `\!` is not a valid escape, so this must still fail
+    assert!(from_str::<Box<RawValue>>(r#""\udc01\u""#).is_err()); // `\u` with no hex digits must still fail
+    assert!(from_str::<Box<RawValue>>(r#""\ud83c\ud83c""#).is_ok()); // two leading surrogates in a row
+}
+
 #[test]
 fn test_byte_buf_de_multiple() {
     let s: Vec<ByteBuf> = from_str(r#"["ab\nc", "cd\ne"]"#).unwrap();