@@ -34,6 +34,105 @@ use core::ptr::addr_of;
34
34
35
35
use super :: c;
36
36
37
/// Builds a null-terminated UTF-16 string from a string literal.
///
/// The literal is checked in a `const` item: if it contains a NUL byte the
/// check panics with an explanatory message. The UTF-16 conversion of the
/// literal plus a trailing `'\0'` is delegated to the `utf16!` macro.
macro_rules! wide_str {
    ($str:literal) => {{
        // Reject literals with an embedded NUL before appending the terminator.
        const _: () = {
            let has_interior_nul = core::slice::memchr::memchr(0, $str.as_bytes()).is_some();
            if has_interior_nul {
                panic!("null terminated strings cannot contain interior nulls");
            }
        };
        utf16!(concat!($str, '\0'))
    }};
}
48
+
49
/// Converts a constant string expression to UTF-16, without a null terminator.
///
/// Expands to a reference to a `[u16; N]` constant, where `N` is computed by
/// `utf16_len` and the contents are produced by `to_utf16`.
macro_rules! utf16 {
    // Note: the constants below deliberately use triple-underscore names to
    // avoid const cycles; do not rename them to something a caller might use.
    ($str:expr) => {{
        // The input, pinned as a constant so it can be used twice below.
        const ___UTF8: &str = $str;
        // Number of UTF-16 code units required, which fixes the array type.
        const ___UTF16_LEN: usize = crate::sys::pal::windows::api::utf16_len(___UTF8);
        // The re-encoded string itself.
        const ___UTF16: [u16; ___UTF16_LEN] = crate::sys::pal::windows::api::to_utf16(___UTF8);
        &___UTF16
    }};
}
59
+
60
+ #[ cfg( test) ]
61
+ mod tests;
62
+
63
/// Counts how many UTF-16 code units are needed to encode `s`.
///
/// Used by the `utf16!` macro to size its output array, so it has to be
/// callable in const contexts (hence the manual `while` loop).
pub const fn utf16_len(s: &str) -> usize {
    let bytes = s.as_bytes();
    let mut units = 0;
    let mut offset = 0;
    while offset < bytes.len() {
        // The number of leading one bits in the first byte of a UTF-8
        // sequence is the sequence length in bytes; ASCII bytes have none
        // and occupy a single byte.
        let ones = bytes[offset].leading_ones() as usize;
        let seq_len = if ones == 0 { 1 } else { ones };
        // Sequences of four bytes take two UTF-16 units, anything shorter
        // takes one.
        units += if seq_len >= 4 { 2 } else { 1 };
        offset += seq_len;
    }
    units
}
80
+
81
/// Re-encodes the UTF-8 string `s` as UTF-16 into a `[u16; UTF16_LEN]` array.
///
/// Used by the `utf16!` macro. The array starts zero-filled, so any slots
/// beyond the encoded string stay `0`; an array that is too small makes the
/// indexing below panic.
///
/// This is written for const evaluation and makes no attempt to be fast.
pub const fn to_utf16<const UTF16_LEN: usize>(s: &str) -> [u16; UTF16_LEN] {
    // Extracts the six payload bits of a `10xxxxxx` continuation byte.
    const fn payload(byte: u8) -> u32 {
        (byte & 0b0011_1111) as u32
    }

    let bytes = s.as_bytes();
    let mut units = [0_u16; UTF16_LEN];
    let mut written = 0;
    let mut read = 0;
    while read < bytes.len() {
        let lead = bytes[read];
        // The count of leading ones in the first byte selects the UTF-8
        // sequence layout. See https://en.wikipedia.org/wiki/UTF-8
        match lead.leading_ones() {
            0 => {
                // 0xxxxxxx: ASCII has the same value in both encodings.
                units[written] = lead as u16;
                read += 1;
                written += 1;
            }
            2 => {
                // 110xxxxx 10xxxxxx
                let scalar = (((lead & 0b0001_1111) as u32) << 6) | payload(bytes[read + 1]);
                units[written] = scalar as u16;
                read += 2;
                written += 1;
            }
            3 => {
                // 1110xxxx 10xxxxxx 10xxxxxx
                let scalar = (((lead & 0b0000_1111) as u32) << 12)
                    | (payload(bytes[read + 1]) << 6)
                    | payload(bytes[read + 2]);
                units[written] = scalar as u16;
                read += 3;
                written += 1;
            }
            4 => {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                let scalar = (((lead & 0b0000_0111) as u32) << 18)
                    | (payload(bytes[read + 1]) << 12)
                    | (payload(bytes[read + 2]) << 6)
                    | payload(bytes[read + 3]);
                // Encode as a surrogate pair (see https://en.wikipedia.org/wiki/UTF-16):
                // subtract 0x10000, then the upper bits (shifted right by 10)
                // go in the high surrogate on top of 0xD800 and the low
                // 10 bits go in the low surrogate on top of 0xDC00.
                let offset = scalar - 0x10000;
                units[written] = (0xD800 + (offset >> 10)) as u16;
                units[written + 1] = (0xDC00 + (offset & 0x3FF)) as u16;
                read += 4;
                written += 2;
            }
            // A `&str` is valid UTF-8, so no other lead byte can occur.
            _ => unreachable!(),
        }
    }
    units
}
135
+
37
136
/// Helper method for getting the size of `T` as a u32.
38
137
/// Errors at compile time if the size would overflow.
39
138
///
0 commit comments