@@ -324,21 +324,20 @@ pub struct Chars<'a> {
324
324
iter : slice:: Iter < ' a , u8 >
325
325
}
326
326
327
- // Return the initial codepoint accumulator for the first byte.
328
- // The first byte is special, only want bottom 5 bits for width 2, 4 bits
329
- // for width 3, and 3 bits for width 4
330
- macro_rules! utf8_first_byte {
331
- ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 )
332
- }
327
+ /// Returns the initial code point accumulator for the first byte.
328
+ /// The first byte is special: only its bottom 5 bits are kept for width 2, 4 bits
329
+ /// for width 3, and 3 bits for width 4 (the mask applied is `0x7F >> width`).
330
+ #[ inline]
331
+ fn utf8_first_byte ( byte : u8 , width : u32 ) -> u32 { ( byte & ( 0x7F >> width) ) as u32 }
333
332
334
- // return the value of $ch updated with continuation byte $byte
335
- macro_rules! utf8_acc_cont_byte {
336
- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & CONT_MASK ) as u32 )
337
- }
333
+ /// Returns the value of `ch` updated with continuation byte `byte`: `ch` is shifted left by 6 bits and OR-ed with `byte & CONT_MASK`.
334
+ #[ inline]
335
+ fn utf8_acc_cont_byte ( ch : u32 , byte : u8 ) -> u32 { ( ch << 6 ) | ( byte & CONT_MASK ) as u32 }
338
336
339
- macro_rules! utf8_is_cont_byte {
340
- ( $byte: expr) => ( ( $byte & !CONT_MASK ) == TAG_CONT_U8 )
341
- }
337
+ /// Checks whether `byte` is a UTF-8 continuation byte (i.e. starts with the
338
+ /// bits `10`), by comparing `byte & !CONT_MASK` against `TAG_CONT_U8`.
339
+ #[ inline]
340
+ fn utf8_is_cont_byte ( byte : u8 ) -> bool { ( byte & !CONT_MASK ) == TAG_CONT_U8 }
342
341
343
342
#[ inline]
344
343
fn unwrap_or_0 ( opt : Option < & u8 > ) -> u8 {
@@ -363,20 +362,20 @@ pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
363
362
// Multibyte case follows
364
363
// Decode from a byte combination out of: [[[x y] z] w]
365
364
// NOTE: Performance is sensitive to the exact formulation here
366
- let init = utf8_first_byte ! ( x, 2 ) ;
365
+ let init = utf8_first_byte ( x, 2 ) ;
367
366
let y = unwrap_or_0 ( bytes. next ( ) ) ;
368
- let mut ch = utf8_acc_cont_byte ! ( init, y) ;
367
+ let mut ch = utf8_acc_cont_byte ( init, y) ;
369
368
if x >= 0xE0 {
370
369
// [[x y z] w] case
371
370
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
372
371
let z = unwrap_or_0 ( bytes. next ( ) ) ;
373
- let y_z = utf8_acc_cont_byte ! ( ( y & CONT_MASK ) as u32 , z) ;
372
+ let y_z = utf8_acc_cont_byte ( ( y & CONT_MASK ) as u32 , z) ;
374
373
ch = init << 12 | y_z;
375
374
if x >= 0xF0 {
376
375
// [x y z w] case
377
376
// use only the lower 3 bits of `init`
378
377
let w = unwrap_or_0 ( bytes. next ( ) ) ;
379
- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ! ( y_z, w) ;
378
+ ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ( y_z, w) ;
380
379
}
381
380
}
382
381
@@ -399,18 +398,18 @@ pub fn next_code_point_reverse(bytes: &mut slice::Iter<u8>) -> Option<u32> {
399
398
// Decode from a byte combination out of: [x [y [z w]]]
400
399
let mut ch;
401
400
let z = unwrap_or_0 ( bytes. next_back ( ) ) ;
402
- ch = utf8_first_byte ! ( z, 2 ) ;
403
- if utf8_is_cont_byte ! ( z) {
401
+ ch = utf8_first_byte ( z, 2 ) ;
402
+ if utf8_is_cont_byte ( z) {
404
403
let y = unwrap_or_0 ( bytes. next_back ( ) ) ;
405
- ch = utf8_first_byte ! ( y, 3 ) ;
406
- if utf8_is_cont_byte ! ( y) {
404
+ ch = utf8_first_byte ( y, 3 ) ;
405
+ if utf8_is_cont_byte ( y) {
407
406
let x = unwrap_or_0 ( bytes. next_back ( ) ) ;
408
- ch = utf8_first_byte ! ( x, 4 ) ;
409
- ch = utf8_acc_cont_byte ! ( ch, y) ;
407
+ ch = utf8_first_byte ( x, 4 ) ;
408
+ ch = utf8_acc_cont_byte ( ch, y) ;
410
409
}
411
- ch = utf8_acc_cont_byte ! ( ch, z) ;
410
+ ch = utf8_acc_cont_byte ( ch, z) ;
412
411
}
413
- ch = utf8_acc_cont_byte ! ( ch, w) ;
412
+ ch = utf8_acc_cont_byte ( ch, w) ;
414
413
415
414
Some ( ch)
416
415
}
@@ -1027,7 +1026,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
1027
1026
// ASCII characters are always valid, so only large
1028
1027
// bytes need more examination.
1029
1028
if first >= 128 {
1030
- let w = UTF8_CHAR_WIDTH [ first as usize ] as usize ;
1029
+ let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1031
1030
let second = next ! ( ) ;
1032
1031
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
1033
1032
// first C2 80 last DF BF
@@ -1580,14 +1579,14 @@ impl StrExt for str {
1580
1579
i -= 1 ;
1581
1580
}
1582
1581
1583
- let mut val = s. as_bytes ( ) [ i] as u32 ;
1584
- let w = UTF8_CHAR_WIDTH [ val as usize ] as usize ;
1585
- assert ! ( ( w != 0 ) ) ;
1582
+ let first = s. as_bytes ( ) [ i] ;
1583
+ let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1584
+ assert ! ( w != 0 ) ;
1586
1585
1587
- val = utf8_first_byte ! ( val , w) ;
1588
- val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 1 ] ) ;
1589
- if w > 2 { val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 2 ] ) ; }
1590
- if w > 3 { val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 3 ] ) ; }
1586
+ let mut val = utf8_first_byte ( first , w as u32 ) ;
1587
+ val = utf8_acc_cont_byte ( val, s. as_bytes ( ) [ i + 1 ] ) ;
1588
+ if w > 2 { val = utf8_acc_cont_byte ( val, s. as_bytes ( ) [ i + 2 ] ) ; }
1589
+ if w > 3 { val = utf8_acc_cont_byte ( val, s. as_bytes ( ) [ i + 3 ] ) ; }
1591
1590
1592
1591
return CharRange { ch : unsafe { mem:: transmute ( val) } , next : i} ;
1593
1592
}
@@ -1672,16 +1671,16 @@ pub fn char_range_at_raw(bytes: &[u8], i: usize) -> (u32, usize) {
1672
1671
1673
1672
// The multibyte case is a separate fn so that the enclosing char_range_at_raw can inline cleanly
1674
1673
fn multibyte_char_range_at ( bytes : & [ u8 ] , i : usize ) -> ( u32 , usize ) {
1675
- let mut val = bytes[ i] as u32 ;
1676
- let w = UTF8_CHAR_WIDTH [ val as usize ] as usize ;
1677
- assert ! ( ( w != 0 ) ) ;
1674
+ let first = bytes[ i] ;
1675
+ let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1676
+ assert ! ( w != 0 ) ;
1678
1677
1679
- val = utf8_first_byte ! ( val , w) ;
1680
- val = utf8_acc_cont_byte ! ( val, bytes[ i + 1 ] ) ;
1681
- if w > 2 { val = utf8_acc_cont_byte ! ( val, bytes[ i + 2 ] ) ; }
1682
- if w > 3 { val = utf8_acc_cont_byte ! ( val, bytes[ i + 3 ] ) ; }
1678
+ let mut val = utf8_first_byte ( first , w as u32 ) ;
1679
+ val = utf8_acc_cont_byte ( val, bytes[ i + 1 ] ) ;
1680
+ if w > 2 { val = utf8_acc_cont_byte ( val, bytes[ i + 2 ] ) ; }
1681
+ if w > 3 { val = utf8_acc_cont_byte ( val, bytes[ i + 3 ] ) ; }
1683
1682
1684
- return ( val, i + w) ;
1683
+ return ( val, i + w as usize ) ;
1685
1684
}
1686
1685
1687
1686
multibyte_char_range_at ( bytes, i)
0 commit comments