From 37d9ea745b358a5b9f48560600841fc6619e545d Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 24 Feb 2022 16:10:36 +1100 Subject: [PATCH 1/2] Improve `scan_escape`. `scan_escape` currently has a fast path (for when the first char isn't '\\') and a slow path. This commit changes `scan_escape` so it only handles the slow path, i.e. the actual escaping code. The fast path is inlined into the two call sites. This change makes the code faster, because there is no function call overhead on the fast path. (`scan_escape` is a big function and doesn't get inlined.) This change also improves readability, because it removes a bunch of mode checks on the the fast paths. --- compiler/rustc_lexer/src/unescape.rs | 45 ++++++++++++++-------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index d789237e692d2..97f9588ae1ef5 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -159,26 +159,8 @@ impl Mode { } } -fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result { - if first_char != '\\' { - // Previous character was not a slash, and we don't expect it to be - // an escape-only character. - return match first_char { - '\t' | '\n' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), - '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), - '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), - _ => { - if mode.is_bytes() && !first_char.is_ascii() { - // Byte literal can't be a non-ascii character. - return Err(EscapeError::NonAsciiCharInByte); - } - Ok(first_char) - } - }; - } - - // Previous character is '\\', try to unescape it. +fn scan_escape(chars: &mut Chars<'_>, mode: Mode) -> Result { + // Previous character was '\\', unescape what follows. let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; @@ -270,9 +252,24 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result Result { + if mode.is_bytes() && !first_char.is_ascii() { + // Byte literal can't be a non-ascii character. + Err(EscapeError::NonAsciiCharInByte) + } else { + Ok(first_char) + } +} + fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; - let res = scan_escape(first_char, chars, mode)?; + let res = match first_char { + '\\' => scan_escape(chars, mode), + '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + _ => ascii_check(first_char, mode), + }?; if chars.next().is_some() { return Err(EscapeError::MoreThanOneChar); } @@ -303,12 +300,14 @@ where skip_ascii_whitespace(&mut chars, start, callback); continue; } - _ => scan_escape(first_char, &mut chars, mode), + _ => scan_escape(&mut chars, mode), } } '\n' => Ok('\n'), '\t' => Ok('\t'), - _ => scan_escape(first_char, &mut chars, mode), + '"' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + _ => ascii_check(first_char, mode), }; let end = initial_len - chars.as_str().len(); callback(start..end, unescaped_char); From 44308dc3489e39958b2ce6dd297b895514b6f425 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 24 Feb 2022 16:49:37 +1100 Subject: [PATCH 2/2] Inline a hot closure in `from_lit_token`. The change looks big because `rustfmt` rearranges things, but the only real change is the inlining annotation. --- compiler/rustc_ast/src/lib.rs | 1 + compiler/rustc_ast/src/util/literal.rs | 39 +++++++++++++++----------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/compiler/rustc_ast/src/lib.rs b/compiler/rustc_ast/src/lib.rs index 84fe9ad26720e..21183121e15a0 100644 --- a/compiler/rustc_ast/src/lib.rs +++ b/compiler/rustc_ast/src/lib.rs @@ -16,6 +16,7 @@ #![feature(min_specialization)] #![recursion_limit = "256"] #![feature(slice_internals)] +#![feature(stmt_expr_attributes)] #[macro_use] extern crate rustc_macros; diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 1cc5ddfd8ee29..224afbd553fb8 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -56,25 +56,30 @@ impl LitKind { // new symbol because the string in the LitKind is different to the // string in the token. let s = symbol.as_str(); - let symbol = - if s.contains(&['\\', '\r']) { - let mut buf = String::with_capacity(s.len()); - let mut error = Ok(()); - unescape_literal(&s, Mode::Str, &mut |_, unescaped_char| { - match unescaped_char { - Ok(c) => buf.push(c), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + let symbol = if s.contains(&['\\', '\r']) { + let mut buf = String::with_capacity(s.len()); + let mut error = Ok(()); + // Force-inlining here is aggressive but the closure is + // called on every char in the string, so it can be + // hot in programs with many long strings. + unescape_literal( + &s, + Mode::Str, + &mut #[inline(always)] + |_, unescaped_char| match unescaped_char { + Ok(c) => buf.push(c), + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } } - }); - error?; - Symbol::intern(&buf) - } else { - symbol - }; + }, + ); + error?; + Symbol::intern(&buf) + } else { + symbol + }; LitKind::Str(symbol, ast::StrStyle::Cooked) } token::StrRaw(n) => {