@@ -335,21 +335,20 @@ pub struct Chars<'a> {
335
335
iter : slice:: Iter < ' a , u8 >
336
336
}
337
337
338
- // Return the initial codepoint accumulator for the first byte.
339
- // The first byte is special, only want bottom 5 bits for width 2, 4 bits
340
- // for width 3, and 3 bits for width 4
341
- macro_rules! utf8_first_byte {
342
- ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 )
343
- }
338
/// Builds the initial code point accumulator from the first byte of a
/// multibyte UTF-8 sequence.
///
/// The first byte mixes length-marker bits with payload bits, so only its
/// low bits are kept: `0x7F >> width` leaves the bottom 5 bits for width 2,
/// 4 bits for width 3, and 3 bits for width 4.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    // The mask is computed in `u8`, the same type as `byte`.
    let payload_mask: u8 = 0x7F >> width;
    (byte & payload_mask) as u32
}
344
343
345
- // return the value of $ch updated with continuation byte $byte
346
- macro_rules! utf8_acc_cont_byte {
347
- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & CONT_MASK ) as u32 )
348
- }
344
+ /// Return the value of `ch` updated with continuation byte `byte`.
345
+ #[ inline]
346
+ fn utf8_acc_cont_byte ( ch : u32 , byte : u8 ) -> u32 { ( ch << 6 ) | ( byte & CONT_MASK ) as u32 }
349
347
350
- macro_rules! utf8_is_cont_byte {
351
- ( $byte: expr) => ( ( $byte & !CONT_MASK ) == TAG_CONT_U8 )
352
- }
348
+ /// Checks whether the byte is a UTF-8 continuation byte (i.e. starts with the
349
+ /// bits `10`).
350
+ #[ inline]
351
+ fn utf8_is_cont_byte ( byte : u8 ) -> bool { ( byte & !CONT_MASK ) == TAG_CONT_U8 }
353
352
354
353
#[ inline]
355
354
fn unwrap_or_0 ( opt : Option < & u8 > ) -> u8 {
@@ -374,20 +373,20 @@ pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
374
373
// Multibyte case follows
375
374
// Decode from a byte combination out of: [[[x y] z] w]
376
375
// NOTE: Performance is sensitive to the exact formulation here
377
- let init = utf8_first_byte ! ( x, 2 ) ;
376
+ let init = utf8_first_byte ( x, 2 ) ;
378
377
let y = unwrap_or_0 ( bytes. next ( ) ) ;
379
- let mut ch = utf8_acc_cont_byte ! ( init, y) ;
378
+ let mut ch = utf8_acc_cont_byte ( init, y) ;
380
379
if x >= 0xE0 {
381
380
// [[x y z] w] case
382
381
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
383
382
let z = unwrap_or_0 ( bytes. next ( ) ) ;
384
- let y_z = utf8_acc_cont_byte ! ( ( y & CONT_MASK ) as u32 , z) ;
383
+ let y_z = utf8_acc_cont_byte ( ( y & CONT_MASK ) as u32 , z) ;
385
384
ch = init << 12 | y_z;
386
385
if x >= 0xF0 {
387
386
// [x y z w] case
388
387
// use only the lower 3 bits of `init`
389
388
let w = unwrap_or_0 ( bytes. next ( ) ) ;
390
- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ! ( y_z, w) ;
389
+ ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ( y_z, w) ;
391
390
}
392
391
}
393
392
@@ -410,18 +409,18 @@ pub fn next_code_point_reverse(bytes: &mut slice::Iter<u8>) -> Option<u32> {
410
409
// Decode from a byte combination out of: [x [y [z w]]]
411
410
let mut ch;
412
411
let z = unwrap_or_0 ( bytes. next_back ( ) ) ;
413
- ch = utf8_first_byte ! ( z, 2 ) ;
414
- if utf8_is_cont_byte ! ( z) {
412
+ ch = utf8_first_byte ( z, 2 ) ;
413
+ if utf8_is_cont_byte ( z) {
415
414
let y = unwrap_or_0 ( bytes. next_back ( ) ) ;
416
- ch = utf8_first_byte ! ( y, 3 ) ;
417
- if utf8_is_cont_byte ! ( y) {
415
+ ch = utf8_first_byte ( y, 3 ) ;
416
+ if utf8_is_cont_byte ( y) {
418
417
let x = unwrap_or_0 ( bytes. next_back ( ) ) ;
419
- ch = utf8_first_byte ! ( x, 4 ) ;
420
- ch = utf8_acc_cont_byte ! ( ch, y) ;
418
+ ch = utf8_first_byte ( x, 4 ) ;
419
+ ch = utf8_acc_cont_byte ( ch, y) ;
421
420
}
422
- ch = utf8_acc_cont_byte ! ( ch, z) ;
421
+ ch = utf8_acc_cont_byte ( ch, z) ;
423
422
}
424
- ch = utf8_acc_cont_byte ! ( ch, w) ;
423
+ ch = utf8_acc_cont_byte ( ch, w) ;
425
424
426
425
Some ( ch)
427
426
}
@@ -1040,7 +1039,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
1040
1039
// ASCII characters are always valid, so only large
1041
1040
// bytes need more examination.
1042
1041
if first >= 128 {
1043
- let w = UTF8_CHAR_WIDTH [ first as usize ] as usize ;
1042
+ let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1044
1043
let second = next ! ( ) ;
1045
1044
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
1046
1045
// first C2 80 last DF BF
@@ -1594,14 +1593,14 @@ impl StrExt for str {
1594
1593
i -= 1 ;
1595
1594
}
1596
1595
1597
- let mut val = s. as_bytes ( ) [ i] as u32 ;
1598
- let w = UTF8_CHAR_WIDTH [ val as usize ] as usize ;
1599
- assert ! ( ( w != 0 ) ) ;
1596
+ let first = s. as_bytes ( ) [ i] ;
1597
+ let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1598
+ assert ! ( w != 0 ) ;
1600
1599
1601
- val = utf8_first_byte ! ( val , w) ;
1602
- val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 1 ] ) ;
1603
- if w > 2 { val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 2 ] ) ; }
1604
- if w > 3 { val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 3 ] ) ; }
1600
+ let mut val = utf8_first_byte ( first , w as u32 ) ;
1601
+ val = utf8_acc_cont_byte ( val, s. as_bytes ( ) [ i + 1 ] ) ;
1602
+ if w > 2 { val = utf8_acc_cont_byte ( val, s. as_bytes ( ) [ i + 2 ] ) ; }
1603
+ if w > 3 { val = utf8_acc_cont_byte ( val, s. as_bytes ( ) [ i + 3 ] ) ; }
1605
1604
1606
1605
return CharRange { ch : unsafe { mem:: transmute ( val) } , next : i} ;
1607
1606
}
@@ -1686,16 +1685,16 @@ pub fn char_range_at_raw(bytes: &[u8], i: usize) -> (u32, usize) {
1686
1685
1687
1686
// Multibyte case is a fn to allow char_range_at to inline cleanly
1688
1687
fn multibyte_char_range_at ( bytes : & [ u8 ] , i : usize ) -> ( u32 , usize ) {
1689
- let mut val = bytes[ i] as u32 ;
1690
- let w = UTF8_CHAR_WIDTH [ val as usize ] as usize ;
1691
- assert ! ( ( w != 0 ) ) ;
1688
+ let first = bytes[ i] ;
1689
+ let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1690
+ assert ! ( w != 0 ) ;
1692
1691
1693
- val = utf8_first_byte ! ( val , w) ;
1694
- val = utf8_acc_cont_byte ! ( val, bytes[ i + 1 ] ) ;
1695
- if w > 2 { val = utf8_acc_cont_byte ! ( val, bytes[ i + 2 ] ) ; }
1696
- if w > 3 { val = utf8_acc_cont_byte ! ( val, bytes[ i + 3 ] ) ; }
1692
+ let mut val = utf8_first_byte ( first , w as u32 ) ;
1693
+ val = utf8_acc_cont_byte ( val, bytes[ i + 1 ] ) ;
1694
+ if w > 2 { val = utf8_acc_cont_byte ( val, bytes[ i + 2 ] ) ; }
1695
+ if w > 3 { val = utf8_acc_cont_byte ( val, bytes[ i + 3 ] ) ; }
1697
1696
1698
- return ( val, i + w) ;
1697
+ return ( val, i + w as usize ) ;
1699
1698
}
1700
1699
1701
1700
multibyte_char_range_at ( bytes, i)
0 commit comments