Skip to content

Commit 580a634

Browse files
Generate tests for Unicode property data
Currently the test file takes a while to compile -- 30 seconds or so -- but since it's not going to be committed, and is just for local testing, that seems fine.
1 parent 7c4baed commit 580a634

File tree

1 file changed

+101
-0
lines changed
  • src/tools/unicode-table-generator/src

1 file changed

+101
-0
lines changed

src/tools/unicode-table-generator/src/main.rs

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,17 @@ fn main() {
152152
std::process::exit(1);
153153
});
154154

155+
// Optional test path, which is a Rust source file testing that the unicode
156+
// property lookups are correct.
157+
let test_path = std::env::args().nth(2);
158+
155159
let unicode_data = load_data();
156160
let ranges_by_property = &unicode_data.ranges;
157161

162+
if let Some(path) = test_path {
163+
std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
164+
}
165+
158166
let mut total_bytes = 0;
159167
let mut modules = Vec::new();
160168
for (property, ranges) in ranges_by_property {
@@ -236,6 +244,99 @@ fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
236244
out
237245
}
238246

247+
fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String {
248+
let mut s = String::new();
249+
s.push_str("#![allow(incomplete_features, unused)]\n");
250+
s.push_str("#![feature(const_generics)]\n\n");
251+
s.push_str(&format!("#[path = \"{}\"]\n", data_path));
252+
s.push_str("mod unicode_data;\n\n");
253+
254+
s.push_str(
255+
"
256+
#[inline(always)]
257+
fn range_search<const N: usize, const CHUNK_SIZE: usize, const N1: usize, const N2: usize>(
258+
needle: u32,
259+
chunk_idx_map: &[u8; N],
260+
(last_chunk_idx, last_chunk_mapping): (u16, u8),
261+
bitset_chunk_idx: &[[u8; CHUNK_SIZE]; N1],
262+
bitset: &[u64; N2],
263+
) -> bool {
264+
let bucket_idx = (needle / 64) as usize;
265+
let chunk_map_idx = bucket_idx / CHUNK_SIZE;
266+
let chunk_piece = bucket_idx % CHUNK_SIZE;
267+
let chunk_idx = if chunk_map_idx >= N {
268+
if chunk_map_idx == last_chunk_idx as usize {
269+
last_chunk_mapping
270+
} else {
271+
return false;
272+
}
273+
} else {
274+
chunk_idx_map[chunk_map_idx]
275+
};
276+
let idx = bitset_chunk_idx[(chunk_idx as usize)][chunk_piece];
277+
let word = bitset[(idx as usize)];
278+
(word & (1 << (needle % 64) as u64)) != 0
279+
}
280+
",
281+
);
282+
283+
s.push_str("\nfn main() {\n");
284+
285+
for (property, ranges) in ranges {
286+
s.push_str(&format!(r#" println!("Testing {}");"#, property));
287+
s.push('\n');
288+
s.push_str(&format!(" {}();\n", property.to_lowercase()));
289+
let mut is_true = Vec::new();
290+
let mut is_false = Vec::new();
291+
for ch_num in 0..(std::char::MAX as u32) {
292+
if std::char::from_u32(ch_num).is_none() {
293+
continue;
294+
}
295+
if ranges.iter().any(|r| r.contains(&ch_num)) {
296+
is_true.push(ch_num);
297+
} else {
298+
is_false.push(ch_num);
299+
}
300+
}
301+
302+
s.push_str(&format!(" fn {}() {{\n", property.to_lowercase()));
303+
generate_asserts(&mut s, property, &is_true, true);
304+
generate_asserts(&mut s, property, &is_false, false);
305+
s.push_str(" }\n\n");
306+
}
307+
308+
s.push_str("}");
309+
s
310+
}
311+
312+
fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) {
313+
for range in ranges_from_set(points) {
314+
if range.end == range.start + 1 {
315+
s.push_str(&format!(
316+
" assert!({}unicode_data::{}::lookup(std::char::from_u32({}).unwrap()), \"{}\");\n",
317+
if truthy { "" } else { "!" },
318+
property.to_lowercase(),
319+
range.start,
320+
std::char::from_u32(range.start).unwrap(),
321+
));
322+
} else {
323+
s.push_str(&format!(" for chn in {:?}u32 {{\n", range));
324+
s.push_str(&format!(
325+
" assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
326+
if truthy { "" } else { "!" },
327+
property.to_lowercase(),
328+
));
329+
s.push_str(" }\n");
330+
}
331+
}
332+
}
333+
334+
fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
335+
let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::<Vec<Range<u32>>>();
336+
merge_ranges(&mut ranges);
337+
ranges
338+
}
339+
239340
fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
240341
loop {
241342
let mut new_ranges = Vec::new();

0 commit comments

Comments
 (0)