From 4acc5e6480e6eada1ad8d867ea31de9b998216a8 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 11 Dec 2023 16:35:43 +1100 Subject: [PATCH 1/5] Don't rebuild raw strings when unescaping. Raw strings don't have escape sequences, so for them "unescaping" just means checking for invalid chars like bare CR. Which means there is no need to rebuild them one char or byte at a time while escaping, because the unescaped version will be the same. This commit removes that rebuilding. Also, the commit changes things so that "unescaping" is unconditional for raw strings and raw byte strings. That's simpler and they're rare enough that the perf effect is negligible. --- compiler/rustc_ast/src/util/literal.rs | 73 +++++++++++--------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 50eb92125b94..76ddb6a39389 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -77,6 +77,8 @@ pub fn from_token_lit(lit: token::Lit) -> Result { // new symbol because the string in the LitKind is different to the // string in the token. let s = symbol.as_str(); + // Vanilla strings are so common we optimize for the common case where no chars + // requiring special behaviour are present. let symbol = if s.contains(['\\', '\r']) { let mut buf = String::with_capacity(s.len()); let mut error = Ok(()); @@ -104,27 +106,20 @@ pub fn from_token_lit(lit: token::Lit) -> Result { LitKind::Str(symbol, ast::StrStyle::Cooked) } token::StrRaw(n) => { - // Ditto. - let s = symbol.as_str(); - let symbol = - if s.contains('\r') { - let mut buf = String::with_capacity(s.len()); - let mut error = Ok(()); - unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| { - match unescaped_char { - Ok(c) => buf.push(c), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } - } + // Raw strings have no escapes, so we only need to check for invalid chars, and we + // can reuse the symbol on success. + let mut error = Ok(()); + unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| { + match unescaped_char { + Ok(_) => {} + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } - }); - error?; - Symbol::intern(&buf) - } else { - symbol - }; + } + } + }); + error?; LitKind::Str(symbol, ast::StrStyle::Raw(n)) } token::ByteStr => { @@ -143,25 +138,19 @@ pub fn from_token_lit(lit: token::Lit) -> Result { LitKind::ByteStr(buf.into(), StrStyle::Cooked) } token::ByteStrRaw(n) => { + // Raw strings have no escapes, so we only need to check for invalid chars, and we + // can convert the symbol directly to a `Lrc` on success. let s = symbol.as_str(); - let bytes = if s.contains('\r') { - let mut buf = Vec::with_capacity(s.len()); - let mut error = Ok(()); - unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c { - Ok(c) => buf.push(byte_from_char(c)), - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + let mut error = Ok(()); + unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c { + Ok(_) => {} + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); } - }); - error?; - buf - } else { - symbol.to_string().into_bytes() - }; - - LitKind::ByteStr(bytes.into(), StrStyle::Raw(n)) + } + }); + LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n)) } token::CStr => { let s = symbol.as_str(); @@ -187,18 +176,15 @@ pub fn from_token_lit(lit: token::Lit) -> Result { LitKind::CStr(buf.into(), StrStyle::Cooked) } token::CStrRaw(n) => { + // Raw strings have no escapes, so we only need to check for invalid chars, and we + // can convert the symbol directly to a `Lrc` on success. let s = symbol.as_str(); - let mut buf = Vec::with_capacity(s.len()); let mut error = Ok(()); unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c { Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => { error = Err(LitError::NulInCStr(span)); } - Ok(CStrUnit::Byte(b)) => buf.push(b), - Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8), - Ok(CStrUnit::Char(c)) => { - buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) - } + Ok(_) => {} Err(err) => { if err.is_fatal() { error = Err(LitError::LexerError); @@ -206,6 +192,7 @@ pub fn from_token_lit(lit: token::Lit) -> Result { } }); error?; + let mut buf = s.to_owned().into_bytes(); buf.push(0); LitKind::CStr(buf.into(), StrStyle::Raw(n)) } From 423bf4233d92dfaa4c97f04c4fc3a2368d880d24 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 11 Dec 2023 16:56:52 +1100 Subject: [PATCH 2/5] Rename the `span` args to `emit_unescape_error`. The `span` arg is described in a comment as "interior span of the literal, without quotes", which is incorrect. It's actually the span of the error part of the literal, corresponding to `range`. This commit renames `span` and `span_without_quotes` to make things clearer, and fixes the erroneous comment. --- .../src/lexer/unescape_error_reporting.rs | 75 +++++++++++-------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index b659c40b2336..2b4c2e3c2501 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -11,12 +11,12 @@ pub(crate) fn emit_unescape_error( handler: &Handler, - // interior part of the literal, without quotes + // interior part of the literal, between quotes lit: &str, - // full span of the literal, including quotes - span_with_quotes: Span, - // interior span of the literal, without quotes - span: Span, + // full span of the literal, including quotes and any prefix + full_lit_span: Span, + // span of the error part of the literal + err_span: Span, mode: Mode, // range of the error inside `lit` range: Range, @@ -24,19 +24,21 @@ pub(crate) fn emit_unescape_error( ) { debug!( "emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}", - lit, span_with_quotes, mode, range, error + lit, full_lit_span, mode, range, error ); let last_char = || { let c = lit[range.clone()].chars().next_back().unwrap(); - let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32)); + let span = err_span.with_lo(err_span.hi() - BytePos(c.len_utf8() as u32)); (c, span) }; match error { EscapeError::LoneSurrogateUnicodeEscape => { - handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: true }); + handler + .emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: true }); } EscapeError::OutOfRangeUnicodeEscape => { - handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: false }); + handler + .emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: false }); } EscapeError::MoreThanOneChar => { use unicode_normalization::{char::is_combining_mark, UnicodeNormalization}; @@ -49,12 +51,16 @@ pub(crate) fn emit_unescape_error( let normalized = lit.nfc().to_string(); if normalized.chars().count() == 1 { let ch = normalized.chars().next().unwrap().escape_default().to_string(); - sugg = Some(MoreThanOneCharSugg::NormalizedForm { span, ch, normalized }); + sugg = Some(MoreThanOneCharSugg::NormalizedForm { + span: err_span, + ch, + normalized, + }); } let escaped_marks = rest.iter().map(|c| c.escape_default().to_string()).collect::>(); note = Some(MoreThanOneCharNote::AllCombining { - span, + span: err_span, chr: format!("{first}"), len: escaped_marks.len(), escaped_marks: escaped_marks.join(""), @@ -69,10 +75,12 @@ pub(crate) fn emit_unescape_error( .collect(); if let &[ch] = printable.as_slice() { - sugg = - Some(MoreThanOneCharSugg::RemoveNonPrinting { span, ch: ch.to_string() }); + sugg = Some(MoreThanOneCharSugg::RemoveNonPrinting { + span: err_span, + ch: ch.to_string(), + }); note = Some(MoreThanOneCharNote::NonPrinting { - span, + span: err_span, escaped: lit.escape_default().to_string(), }); } @@ -91,13 +99,13 @@ pub(crate) fn emit_unescape_error( } let sugg = format!("{prefix}\"{escaped}\""); MoreThanOneCharSugg::Quotes { - span: span_with_quotes, + span: full_lit_span, is_byte: mode == Mode::Byte, sugg, } }); handler.emit_err(UnescapeError::MoreThanOneChar { - span: span_with_quotes, + span: full_lit_span, note, suggestion: sugg, }); @@ -105,7 +113,7 @@ pub(crate) fn emit_unescape_error( EscapeError::EscapeOnlyChar => { let (c, char_span) = last_char(); handler.emit_err(UnescapeError::EscapeOnlyChar { - span, + span: err_span, char_span, escaped_sugg: c.escape_default().to_string(), escaped_msg: escaped_char(c), @@ -114,11 +122,11 @@ pub(crate) fn emit_unescape_error( } EscapeError::BareCarriageReturn => { let double_quotes = mode.in_double_quotes(); - handler.emit_err(UnescapeError::BareCr { span, double_quotes }); + handler.emit_err(UnescapeError::BareCr { span: err_span, double_quotes }); } EscapeError::BareCarriageReturnInRawString => { assert!(mode.in_double_quotes()); - handler.emit_err(UnescapeError::BareCrRawString(span)); + handler.emit_err(UnescapeError::BareCrRawString(err_span)); } EscapeError::InvalidEscape => { let (c, span) = last_char(); @@ -143,7 +151,7 @@ pub(crate) fn emit_unescape_error( } else { if mode == Mode::Str || mode == Mode::Char { diag.span_suggestion( - span_with_quotes, + full_lit_span, "if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal", format!("r\"{lit}\""), Applicability::MaybeIncorrect, @@ -158,7 +166,7 @@ pub(crate) fn emit_unescape_error( diag.emit(); } EscapeError::TooShortHexEscape => { - handler.emit_err(UnescapeError::TooShortHexEscape(span)); + handler.emit_err(UnescapeError::TooShortHexEscape(err_span)); } EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => { let (c, span) = last_char(); @@ -210,7 +218,7 @@ pub(crate) fn emit_unescape_error( err.emit(); } EscapeError::OutOfRangeHexEscape => { - handler.emit_err(UnescapeError::OutOfRangeHexEscape(span)); + handler.emit_err(UnescapeError::OutOfRangeHexEscape(err_span)); } EscapeError::LeadingUnderscoreUnicodeEscape => { let (c, span) = last_char(); @@ -220,10 +228,11 @@ pub(crate) fn emit_unescape_error( }); } EscapeError::OverlongUnicodeEscape => { - handler.emit_err(UnescapeError::OverlongUnicodeEscape(span)); + handler.emit_err(UnescapeError::OverlongUnicodeEscape(err_span)); } EscapeError::UnclosedUnicodeEscape => { - handler.emit_err(UnescapeError::UnclosedUnicodeEscape(span, span.shrink_to_hi())); + handler + .emit_err(UnescapeError::UnclosedUnicodeEscape(err_span, err_span.shrink_to_hi())); } EscapeError::NoBraceInUnicodeEscape => { let mut suggestion = "\\u{".to_owned(); @@ -238,34 +247,34 @@ pub(crate) fn emit_unescape_error( let (label, sub) = if suggestion_len > 0 { suggestion.push('}'); let hi = char_span.lo() + BytePos(suggestion_len as u32); - (None, NoBraceUnicodeSub::Suggestion { span: span.with_hi(hi), suggestion }) + (None, NoBraceUnicodeSub::Suggestion { span: err_span.with_hi(hi), suggestion }) } else { - (Some(span), NoBraceUnicodeSub::Help) + (Some(err_span), NoBraceUnicodeSub::Help) }; - handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span, label, sub }); + handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span: err_span, label, sub }); } EscapeError::UnicodeEscapeInByte => { - handler.emit_err(UnescapeError::UnicodeEscapeInByte(span)); + handler.emit_err(UnescapeError::UnicodeEscapeInByte(err_span)); } EscapeError::EmptyUnicodeEscape => { - handler.emit_err(UnescapeError::EmptyUnicodeEscape(span)); + handler.emit_err(UnescapeError::EmptyUnicodeEscape(err_span)); } EscapeError::ZeroChars => { - handler.emit_err(UnescapeError::ZeroChars(span)); + handler.emit_err(UnescapeError::ZeroChars(err_span)); } EscapeError::LoneSlash => { - handler.emit_err(UnescapeError::LoneSlash(span)); + handler.emit_err(UnescapeError::LoneSlash(err_span)); } EscapeError::UnskippedWhitespaceWarning => { let (c, char_span) = last_char(); handler.emit_warning(UnescapeError::UnskippedWhitespace { - span, + span: err_span, ch: escaped_char(c), char_span, }); } EscapeError::MultipleSkippedLinesWarning => { - handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(span)); + handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(err_span)); } } } From 29c5158ef5772351475d650ffbee417b36885766 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 13 Dec 2023 09:29:59 +1100 Subject: [PATCH 3/5] Adjust `Mode::is_unicode_escape_disallowed`. Some cases are unreachable. --- compiler/rustc_lexer/src/unescape.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 249126a269e2..f6216504e980 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -191,8 +191,9 @@ fn chars_should_be_ascii(self) -> bool { /// Byte literals do not allow unicode escape. fn is_unicode_escape_disallowed(self) -> bool { match self { - Byte | ByteStr | RawByteStr => true, - Char | Str | RawStr | CStr | RawCStr => false, + Byte | ByteStr => true, + Char | Str | CStr => false, + RawByteStr | RawStr | RawCStr => unreachable!(), } } From a50efe265309d0fee0661121508f77d23aecd82e Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 13 Dec 2023 09:46:21 +1100 Subject: [PATCH 4/5] Unify single-char and multi-char `CStrUnit::Char` handling. The two cases are equivalent. C string literals aren't common so there is no performance need here. --- compiler/rustc_ast/src/util/literal.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 76ddb6a39389..92b9adf1db75 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -161,7 +161,6 @@ pub fn from_token_lit(lit: token::Lit) -> Result { error = Err(LitError::NulInCStr(span)); } Ok(CStrUnit::Byte(b)) => buf.push(b), - Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8), Ok(CStrUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } From b900eb73173f416dba5619ebfc10e6e0438a4753 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 13 Dec 2023 12:50:39 +1100 Subject: [PATCH 5/5] Rename some unescaping functions. `unescape_raw_str_or_raw_byte_str` only does checking, no unescaping. And it also now handles C string literals. `unescape_raw_str` is used for all the non-raw strings. --- compiler/rustc_lexer/src/unescape.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index f6216504e980..abec12f52a6e 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -92,8 +92,8 @@ pub fn unescape_literal(src: &str, mode: Mode, callback: &mut F) let res = unescape_char_or_byte(&mut chars, mode); callback(0..(src.len() - chars.as_str().len()), res); } - Str | ByteStr => unescape_str_common(src, mode, callback), - RawStr | RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback), + Str | ByteStr => unescape_non_raw_common(src, mode, callback), + RawStr | RawByteStr => check_raw_common(src, mode, callback), CStr | RawCStr => unreachable!(), } } @@ -122,12 +122,10 @@ pub fn unescape_c_string(src: &str, mode: Mode, callback: &mut F) { match mode { CStr => { - unescape_str_common(src, mode, callback); + unescape_non_raw_common(src, mode, callback); } RawCStr => { - unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| { - callback(r, result.map(CStrUnit::Char)) - }); + check_raw_common(src, mode, &mut |r, result| callback(r, result.map(CStrUnit::Char))); } Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), } @@ -325,7 +323,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result + From>(src: &str, mode: Mode, callback: &mut F) +fn unescape_non_raw_common + From>(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -392,7 +390,7 @@ fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only produce errors on bare CR. -fn unescape_raw_str_or_raw_byte_str(src: &str, mode: Mode, callback: &mut F) +fn check_raw_common(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -400,7 +398,7 @@ fn unescape_raw_str_or_raw_byte_str(src: &str, mode: Mode, callback: &mut F) let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop // The `start` and `end` computation here matches the one in - // `unescape_str_common` for consistency, even though this function + // `unescape_non_raw_common` for consistency, even though this function // doesn't have to worry about skipping any chars. while let Some(c) = chars.next() { let start = src.len() - chars.as_str().len() - c.len_utf8();