Skip to content

Commit a2ce517

Browse files
committed
Add const UTF-8 to UTF-16 conversion macros
`wide_str!` creates a null-terminated UTF-16 string, whereas `utf16!` creates a UTF-16 string without appending a null terminator.
1 parent 30840c5 commit a2ce517

File tree

3 files changed

+116
-2
lines changed

3 files changed

+116
-2
lines changed

library/std/src/sys/pal/windows/api.rs

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,105 @@ use core::ptr::addr_of;
3434

3535
use super::c;
3636

37+
/// Builds a null-terminated UTF-16 string from a string literal.
///
/// The literal is checked at compile time: if it already contains a null,
/// const evaluation panics and the build fails.
macro_rules! wide_str {
    ($str:literal) => {{
        // An anonymous const forces the interior-null check to be evaluated
        // during compilation rather than at run time.
        const _: () = assert!(
            core::slice::memchr::memchr(0, $str.as_bytes()).is_none(),
            "null terminated strings cannot contain interior nulls"
        );
        // Append the terminator to the literal, then convert the whole thing.
        utf16!(concat!($str, '\0'))
    }};
}
48+
49+
/// Converts a constant `&str` expression to UTF-16 at compile time.
///
/// Evaluates to a reference to a `[u16; N]` array. No null terminator is
/// appended.
macro_rules! utf16 {
    // Note: the helper constants keep a triple-underscore prefix to avoid
    // const cycles with names used in the caller's expression.
    ($str:expr) => {{
        const ___SRC: &str = $str;
        // The array length has to be known before the array itself can be built.
        const ___WIDE_LEN: usize = crate::sys::pal::windows::api::utf16_len(___SRC);
        const ___WIDE: [u16; ___WIDE_LEN] = crate::sys::pal::windows::api::to_utf16(___SRC);
        &___WIDE
    }};
}
59+
60+
#[cfg(test)]
61+
mod tests;
62+
63+
/// Returns the number of UTF-16 code units needed to encode `s`.
///
/// Used by the `utf16!` macro (and, through it, `wide_str!`) to size the
/// output array before the conversion itself runs.
///
/// Usable in const contexts; not optimized for run-time use.
pub const fn utf16_len(s: &str) -> usize {
    let bytes = s.as_bytes();
    let mut i = 0;
    let mut len = 0;
    while i < bytes.len() {
        // The byte length of a UTF-8 encoded code point is given by the number
        // of leading ones in its first byte, except for ASCII which has none.
        // Only 4-byte sequences (code points above U+FFFF) need a surrogate
        // pair, i.e. two UTF-16 code units.
        match bytes[i].leading_ones() {
            0 => {
                i += 1;
                len += 1;
            }
            2 => {
                i += 2;
                len += 1;
            }
            3 => {
                i += 3;
                len += 1;
            }
            4 => {
                i += 4;
                len += 2;
            }
            // valid UTF-8 cannot have any other values
            _ => unreachable!(),
        }
    }
    len
}
80+
81+
/// Const convert UTF-8 to UTF-16, for use in the `utf16!` and `wide_str!` macros.
///
/// `UTF16_LEN` must be exactly the number of UTF-16 code units needed to
/// encode `s`, i.e. the value returned by `utf16_len(s)`.
///
/// Note that this is designed for use in const contexts so is not optimized.
///
/// # Panics
///
/// Panics if `UTF16_LEN` does not match the encoded length of `s`. When
/// evaluated in a const context this surfaces as a compile-time error.
pub const fn to_utf16<const UTF16_LEN: usize>(s: &str) -> [u16; UTF16_LEN] {
    let mut output = [0_u16; UTF16_LEN];
    let mut pos = 0;
    let s = s.as_bytes();
    let mut i = 0;
    while i < s.len() {
        match s[i].leading_ones() {
            // Decode UTF-8 based on its length.
            // See https://en.wikipedia.org/wiki/UTF-8
            0 => {
                // ASCII is the same in both encodings
                output[pos] = s[i] as u16;
                i += 1;
                pos += 1;
            }
            2 => {
                // Bits: 110xxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b11111) << 6) | (s[i + 1] as u16 & 0b111111);
                i += 2;
                pos += 1;
            }
            3 => {
                // Bits: 1110xxxx 10xxxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b1111) << 12)
                    | ((s[i + 1] as u16 & 0b111111) << 6)
                    | (s[i + 2] as u16 & 0b111111);
                i += 3;
                pos += 1;
            }
            4 => {
                // Bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                let mut c = ((s[i] as u32 & 0b111) << 18)
                    | ((s[i + 1] as u32 & 0b111111) << 12)
                    | ((s[i + 2] as u32 & 0b111111) << 6)
                    | (s[i + 3] as u32 & 0b111111);
                // re-encode as UTF-16 (see https://en.wikipedia.org/wiki/UTF-16)
                // - Subtract 0x10000 from the code point
                // - For the high surrogate, shift right by 10 then add 0xD800
                // - For the low surrogate, take the low 10 bits then add 0xDC00
                c -= 0x10000;
                output[pos] = ((c >> 10) + 0xD800) as u16;
                output[pos + 1] = ((c & 0b1111111111) + 0xDC00) as u16;
                i += 4;
                pos += 2;
            }
            // valid UTF-8 cannot have any other values
            _ => unreachable!(),
        }
    }
    // A buffer that is too small has already panicked on an out-of-bounds
    // write above. One that is too large would otherwise be returned with
    // trailing zeros that are not part of the string, so reject it as well.
    assert!(pos == UTF16_LEN, "UTF16_LEN does not match the UTF-16 length of the string");
    output
}
135+
37136
/// Helper method for getting the size of `T` as a u32.
38137
/// Errors at compile time if the size would overflow.
39138
///
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
/// Asserts that `wide_str!` and `utf16!` produce the same code units as
/// `str::encode_utf16` for the given string literal (plus a trailing null in
/// the `wide_str!` case).
macro_rules! check_utf16 {
    ($str:literal) => {{
        // Collect the reference encodings so that a failure reports both the
        // expected and the actual code units, along with the offending input.
        let expected: Vec<u16> = $str.encode_utf16().collect();
        let expected_wide: Vec<u16> = $str.encode_utf16().chain([0]).collect();
        assert_eq!(wide_str!($str)[..], expected_wide[..], "`wide_str!` mismatch for {:?}", $str);
        assert_eq!(utf16!($str)[..], expected[..], "`utf16!` mismatch for {:?}", $str);
    }};
}
7+
8+
// Compares the const UTF-16 macros against `str::encode_utf16` on ASCII,
// multi-byte and supplementary-plane text, plus encoding boundary cases.
#[test]
fn test_utf16_macros() {
    // Empty input: `utf16!` yields no code units, `wide_str!` only the null.
    check_utf16!("");
    check_utf16!("hello world");
    check_utf16!("€4.50");
    check_utf16!("𨉟呐㗂越");
    check_utf16!("Pchnąć w tę łódź jeża lub ośm skrzyń fig");
    // First and last code points of each UTF-8 encoded length (1 to 4 bytes).
    check_utf16!("\u{1}\u{7F}\u{80}\u{7FF}\u{800}\u{FFFF}\u{10000}\u{10FFFF}");
    // Code points on either side of the surrogate gap.
    check_utf16!("\u{D7FF}\u{E000}");
}

library/std/src/sys/pal/windows/mod.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ pub use self::rand::hashmap_random_keys;
1212
#[macro_use]
1313
pub mod compat;
1414

15+
#[macro_use]
16+
mod api;
17+
1518
pub mod alloc;
1619
pub mod args;
1720
pub mod c;
@@ -41,8 +44,6 @@ cfg_if::cfg_if! {
4144
}
4245
}
4346

44-
mod api;
45-
4647
/// Map a Result<T, WinError> to io::Result<T>.
4748
trait IoResult<T> {
4849
fn io_result(self) -> crate::io::Result<T>;

0 commit comments

Comments
 (0)