diff --git a/src/locale/mod.rs b/src/locale/mod.rs index 9f690fa..ace5b75 100644 --- a/src/locale/mod.rs +++ b/src/locale/mod.rs @@ -3,6 +3,9 @@ use std::fmt; mod options; mod parser; +mod tinystr; + +use self::tinystr::{TinyStr4, TinyStr8}; /// A Locale object. /// @@ -76,10 +79,10 @@ mod parser; /// ``` #[derive(Debug, Default, PartialEq, Clone)] pub struct Locale { - language: Option, + language: Option, extlangs: Option>, - script: Option, - region: Option, + script: Option, + region: Option, variants: Option>, extensions: Option>>, privateuse: Vec, diff --git a/src/locale/options.rs b/src/locale/options.rs index e291eab..e1552bc 100644 --- a/src/locale/options.rs +++ b/src/locale/options.rs @@ -1,4 +1,4 @@ -use super::Locale; +use super::{Locale, TinyStr4}; use std::collections::BTreeMap; pub fn option_name_for_key(key: &str) -> &'static str { @@ -20,9 +20,11 @@ pub fn option_key_for_name(key: &str) -> &'static str { pub fn apply_options(loc: &mut Locale, opts: BTreeMap<&str, &str>) { for (key, value) in opts { match key { - "language" => loc.language = Some(value.to_owned()), - "script" => loc.script = Some(value.to_owned()), - "region" => loc.region = Some(value.to_owned()), + // TODO: should we do something other than store None on strings + // that fail representation? + "language" => loc.language = TinyStr4::new(value).ok(), + "script" => loc.script = TinyStr4::new(value).ok(), + "region" => loc.region = TinyStr4::new(value).ok(), _ => if let Some(ref mut exts) = loc.extensions { let uext = exts diff --git a/src/locale/parser.rs b/src/locale/parser.rs index 52a9381..74f83fb 100644 --- a/src/locale/parser.rs +++ b/src/locale/parser.rs @@ -1,4 +1,5 @@ use super::options; +use super::TinyStr4; use super::Locale; use std::collections::BTreeMap; use std::error::Error as ErrorTrait; @@ -61,32 +62,42 @@ pub fn ext_key_for_name(key: &str) -> &str { } } -pub fn parse_language_subtag(t: &str) -> Result { - if t.len() < 2 || t.len() > 3 || t.chars().any(|c| !c.is_ascii_alphabetic()) { +pub fn parse_language_subtag(t: &str) -> Result { + let s = TinyStr4::new(t).map_err(|_| Error::InvalidLanguage)?; + if t.len() < 2 || t.len() > 3 || !s.is_all_ascii_alpha() { return Err(Error::InvalidLanguage); } - Ok(t.to_ascii_lowercase()) + Ok(s.to_ascii_lowercase()) } -pub fn parse_script_subtag(t: &str) -> Result { - if t.len() != 4 || t.chars().any(|c| !c.is_ascii_alphabetic()) { +pub fn parse_script_subtag(t: &str) -> Result { + let s = TinyStr4::new(t).map_err(|_| Error::InvalidSubtag)?; + if t.len() != 4 || !s.is_all_ascii_alpha() { return Err(Error::InvalidSubtag); } - let mut s = t.to_ascii_lowercase(); - s[0..1].make_ascii_uppercase(); - - Ok(s) + Ok(s.to_ascii_titlecase()) } -pub fn parse_region_subtag(t: &str) -> Result { - if (t.len() == 2 && t.chars().all(|c| c.is_ascii_alphabetic())) - || (t.len() == 3 && t.chars().all(|c| c.is_ascii_digit())) - { - return Ok(t.to_ascii_uppercase()); +pub fn parse_region_subtag(t: &str) -> Result { + match t.len() { + 2 => { + let s = TinyStr4::new(t).map_err(|_| Error::InvalidSubtag)?; + if !s.is_all_ascii_alpha() { + return Err(Error::InvalidSubtag); + } + Ok(s.to_ascii_uppercase()) + } + 3 => { + if !t.chars().all(|c| c.is_ascii_digit()) { + return Err(Error::InvalidSubtag); + } + // This actually can't fail. + TinyStr4::new(t).map_err(|_| Error::InvalidSubtag) + } + _ => Err(Error::InvalidSubtag), } - Err(Error::InvalidSubtag) } pub fn parse_language_tag(t: &str) -> Result { diff --git a/src/locale/tinystr.rs b/src/locale/tinystr.rs new file mode 100644 index 0000000..b326d93 --- /dev/null +++ b/src/locale/tinystr.rs @@ -0,0 +1,330 @@ +//! A small ASCII-only bounded length string representation. + +use std::fmt; +use std::num::{NonZeroU32, NonZeroU64}; +use std::ops::Deref; +use std::ptr::copy_nonoverlapping; + +#[derive(PartialEq, Eq, Debug)] +pub enum Error { + InvalidSize, + InvalidNull, + NonAscii, +} + +/// A tiny string that is from 1 to 8 non-NUL ASCII characters. +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct TinyStr8(NonZeroU64); + +/// A tiny string that is from 1 to 4 non-NUL ASCII characters. +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct TinyStr4(NonZeroU32); + +impl TinyStr8 { + /// Create a new tiny string. + /// + /// Returns an error result if the string is not 1 to 8 characters in length, + /// contains non-ASCII, or contains an embedded NUL byte. + pub fn new(text: &str) -> Result { + let len = text.len(); + if len < 1 || len > 8 { + return Err(Error::InvalidSize); + } + unsafe { + let mut word: u64 = 0; + copy_nonoverlapping(text.as_ptr(), &mut word as *mut u64 as *mut u8, len); + let mask = 0x80808080_80808080u64 >> (8 * (8 - len)); + // TODO: could do this with #cfg(target_endian), but this is clearer and + // more confidence-inspiring. + let mask = mask.to_le(); + if (word & mask) != 0 { + return Err(Error::NonAscii); + } + if ((mask - word) & mask) != 0 { + return Err(Error::InvalidNull); + } + Ok(TinyStr8(NonZeroU64::new_unchecked(word))) + } + } + + /// Dereference to string slice. + #[inline] + pub fn as_str(&self) -> &str { + self.deref() + } + + pub fn to_ascii_uppercase(self) -> TinyStr8 { + let word = self.0.get(); + let result = word & + !( + ( + (word + 0x1f1f1f1f_1f1f1f1f) & + !(word + 0x05050505_05050505) & + 0x80808080_80808080 + ) >> 2 + ); + unsafe { TinyStr8(NonZeroU64::new_unchecked(result)) } + } + + pub fn to_ascii_lowercase(self) -> TinyStr8 { + let word = self.0.get(); + let result = word | + ( + ( + (word + 0x3f3f3f3f_3f3f3f3f) & + !(word + 0x25252525_25252525) & + 0x80808080_80808080 + ) >> 2 + ); + unsafe { TinyStr8(NonZeroU64::new_unchecked(result)) } + } + + /// Determine whether string is all ASCII alphabetical characters. + pub fn is_all_ascii_alpha(self) -> bool { + let word = self.0.get(); + let mask = (word + 0x7f7f7f7f_7f7f7f7f) & 0x80808080_80808080; + let lower = word | 0x20202020_20202020; + ( + ( + !(lower + 0x1f1f1f1f_1f1f1f1f) | + (lower + 0x05050505_05050505) + ) & mask + ) == 0 + } +} + +impl Deref for TinyStr8 { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + // Again, could use #cfg to hand-roll a big-endian implementation. + let word = self.0.get().to_le(); + let len = (8 - word.leading_zeros() / 8) as usize; + unsafe { + let slice = core::slice::from_raw_parts(&self.0 as *const _ as *const u8, len); + std::str::from_utf8_unchecked(slice) + } + } +} + +impl fmt::Display for TinyStr8 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.deref()) + } +} + +impl fmt::Debug for TinyStr8 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self.deref()) + } +} + +unsafe fn make_4byte_str(text: &str, len: usize, mask: u32) -> Result { + // Mask is always supplied as little-endian. + let mask = mask.to_le(); + let mut word: u32 = 0; + copy_nonoverlapping(text.as_ptr(), &mut word as *mut u32 as *mut u8, len); + if (word & mask) != 0 { + return Err(Error::NonAscii); + } + if ((mask - word) & mask) != 0 { + return Err(Error::InvalidNull); + } + Ok(NonZeroU32::new_unchecked(word)) +} + +impl TinyStr4 { + /// Create a new tiny string. + /// + /// Returns an error result if the string is not 1 to 4 characters in length, + /// contains non-ASCII, or contains an embedded NUL byte. + pub fn new(text: &str) -> Result { + unsafe { + match text.len() { + 1 => make_4byte_str(text, 1, 0x80).map(TinyStr4), + 2 => make_4byte_str(text, 2, 0x8080).map(TinyStr4), + 3 => make_4byte_str(text, 3, 0x808080).map(TinyStr4), + 4 => make_4byte_str(text, 4, 0x80808080).map(TinyStr4), + _ => Err(Error::InvalidSize), + } + } + } + + /// Dereference to string slice. + #[inline] + pub fn as_str(&self) -> &str { + self.deref() + } + + pub fn to_ascii_uppercase(self) -> TinyStr4 { + let word = self.0.get(); + let result = word & + !( + ( + (word + 0x1f1f1f1f) & + !(word + 0x05050505) & + 0x80808080 + ) >> 2 + ); + unsafe { TinyStr4(NonZeroU32::new_unchecked(result)) } + } + + pub fn to_ascii_lowercase(self) -> TinyStr4 { + let word = self.0.get(); + let result = word | + ( + ( + (word + 0x3f3f3f3f) & + !(word + 0x25252525) & + 0x80808080 + ) >> 2 + ); + unsafe { TinyStr4(NonZeroU32::new_unchecked(result)) } + } + + /// Makes the string all lowercase except for the first character, + /// which is made uppercase. + pub fn to_ascii_titlecase(self) -> TinyStr4 { + let word = self.0.get().to_le(); + let mask = ( + (word + 0x3f3f3f1f) & + !(word + 0x25252505) & + 0x80808080 + ) >> 2; + let result = (word | mask) & !(0x20 & mask); + unsafe { TinyStr4(NonZeroU32::new_unchecked(result.to_le())) } + } + + /// Determine whether string is all ASCII alphabetical characters. + pub fn is_all_ascii_alpha(self) -> bool { + let word = self.0.get(); + let mask = (word + 0x7f7f7f7f) & 0x80808080; + let lower = word | 0x20202020; + ( + ( + !(lower + 0x1f1f1f1f) | + (lower + 0x05050505) + ) & mask + ) == 0 + } +} + +impl Deref for TinyStr4 { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + // Again, could use #cfg to hand-roll a big-endian implementation. + let word = self.0.get().to_le(); + let len = (4 - word.leading_zeros() / 8) as usize; + unsafe { + let slice = core::slice::from_raw_parts(&self.0 as *const _ as *const u8, len); + std::str::from_utf8_unchecked(slice) + } + } +} + +impl fmt::Display for TinyStr4 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.deref()) + } +} + +impl fmt::Debug for TinyStr4 { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", self.deref()) + } +} + +#[cfg(test)] +mod tests { + use super::{Error, TinyStr4, TinyStr8}; + use std::ops::Deref; + + #[test] + fn tiny4_basic() { + let s = TinyStr4::new("abc").unwrap(); + assert_eq!(s.deref(), "abc"); + } + + #[test] + fn tiny4_size() { + assert_eq!(TinyStr4::new(""), Err(Error::InvalidSize)); + assert!(TinyStr4::new("1").is_ok()); + assert!(TinyStr4::new("12").is_ok()); + assert!(TinyStr4::new("123").is_ok()); + assert!(TinyStr4::new("1234").is_ok()); + assert_eq!(TinyStr4::new("12345"), Err(Error::InvalidSize)); + assert_eq!(TinyStr4::new("123456789"), Err(Error::InvalidSize)); + } + + #[test] + fn tiny4_null() { + assert_eq!(TinyStr4::new("a\u{0}b"), Err(Error::InvalidNull)); + } + + #[test] + fn tiny4_nonascii() { + assert_eq!(TinyStr4::new("\u{4000}"), Err(Error::NonAscii)); + } + + #[test] + fn tiny4_alpha() { + let s = TinyStr4::new("@aZ[").unwrap(); + assert!(!s.is_all_ascii_alpha()); + assert_eq!(s.to_ascii_uppercase().as_str(), "@AZ["); + assert_eq!(s.to_ascii_lowercase().as_str(), "@az["); + + assert!(TinyStr4::new("abYZ").unwrap().is_all_ascii_alpha()); + } + + #[test] + fn tiny4_titlecase() { + assert_eq!(TinyStr4::new("abcd").unwrap().to_ascii_titlecase().as_str(), "Abcd"); + assert_eq!(TinyStr4::new("ABCD").unwrap().to_ascii_titlecase().as_str(), "Abcd"); + assert_eq!(TinyStr4::new("aBCD").unwrap().to_ascii_titlecase().as_str(), "Abcd"); + assert_eq!(TinyStr4::new("A123").unwrap().to_ascii_titlecase().as_str(), "A123"); + assert_eq!(TinyStr4::new("123a").unwrap().to_ascii_titlecase().as_str(), "123a"); + } + + #[test] + fn tiny8_basic() { + let s = TinyStr8::new("abcde").unwrap(); + assert_eq!(s.deref(), "abcde"); + } + + #[test] + fn tiny8_size() { + assert_eq!(TinyStr8::new(""), Err(Error::InvalidSize)); + assert!(TinyStr8::new("1").is_ok()); + assert!(TinyStr8::new("12").is_ok()); + assert!(TinyStr8::new("123").is_ok()); + assert!(TinyStr8::new("1234").is_ok()); + assert!(TinyStr8::new("12345").is_ok()); + assert!(TinyStr8::new("123456").is_ok()); + assert!(TinyStr8::new("1234567").is_ok()); + assert!(TinyStr8::new("12345678").is_ok()); + assert_eq!(TinyStr8::new("123456789"), Err(Error::InvalidSize)); + } + + #[test] + fn tiny8_null() { + assert_eq!(TinyStr8::new("a\u{0}b"), Err(Error::InvalidNull)); + } + + #[test] + fn tiny8_nonascii() { + assert_eq!(TinyStr8::new("\u{4000}"), Err(Error::NonAscii)); + } + + #[test] + fn tiny8_alpha() { + let s = TinyStr8::new("@abcXYZ[").unwrap(); + assert!(!s.is_all_ascii_alpha()); + assert_eq!(s.to_ascii_uppercase().as_str(), "@ABCXYZ["); + assert_eq!(s.to_ascii_lowercase().as_str(), "@abcxyz["); + + assert!(TinyStr8::new("abcXYZ").unwrap().is_all_ascii_alpha()); + } +}