From 5ac90c6def09b0c5b15fe8cc846d5be4b37685fc Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Fri, 3 Apr 2026 19:01:40 +0200 Subject: [PATCH 1/2] Clarify ascii whitespace exclusion of vertical tab in the doc This especially means that for c: char, c.is_ascii() && c.is_whitespace() does **not** imply c.is_ascii_whitespace(). --- library/core/src/char/methods.rs | 3 +++ library/core/src/num/mod.rs | 3 +++ library/core/src/slice/ascii.rs | 12 +++++++++--- library/core/src/str/mod.rs | 15 ++++++++++++--- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index e9c3b040dc50..f1a1ffe98b5f 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -1930,6 +1930,9 @@ pub const fn is_ascii_graphic(&self) -> bool { /// U+0020 SPACE, U+0009 HORIZONTAL TAB, U+000A LINE FEED, /// U+000C FORM FEED, or U+000D CARRIAGE RETURN. /// + /// **Warning:** Because the list above excludes U+000B VERTICAL TAB, + /// `c.is_ascii_whitespace()` is **not** equivalent to `c.is_ascii() && c.is_whitespace()`. + /// /// Rust uses the WhatWG Infra Standard's [definition of ASCII /// whitespace][infra-aw]. There are several other definitions in /// wide use. For instance, [the POSIX locale][pct] includes diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs index 333e44649d8f..fd2aa06a2898 100644 --- a/library/core/src/num/mod.rs +++ b/library/core/src/num/mod.rs @@ -1064,6 +1064,9 @@ pub const fn is_ascii_graphic(&self) -> bool { /// U+0020 SPACE, U+0009 HORIZONTAL TAB, U+000A LINE FEED, /// U+000C FORM FEED, or U+000D CARRIAGE RETURN. /// + /// **Warning:** Because the list above excludes U+000B VERTICAL TAB, + /// `b.is_ascii_whitespace()` is **not** equivalent to `char::from(b).is_whitespace()`. + /// /// Rust uses the WhatWG Infra Standard's [definition of ASCII /// whitespace][infra-aw]. There are several other definitions in /// wide use. 
For instance, [the POSIX locale][pct] includes diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index edf058c96a52..33ecc05c5695 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -222,7 +222,9 @@ pub fn escape_ascii(&self) -> EscapeAscii<'_> { /// Returns a byte slice with leading ASCII whitespace bytes removed. /// /// 'Whitespace' refers to the definition used by - /// [`u8::is_ascii_whitespace`]. + /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes + /// the `\0x0B` byte even though it has the unicode WhiteSpace property + /// and is removed by [`str::trim_start`]. /// /// # Examples /// @@ -251,7 +253,9 @@ pub const fn trim_ascii_start(&self) -> &[u8] { /// Returns a byte slice with trailing ASCII whitespace bytes removed. /// /// 'Whitespace' refers to the definition used by - /// [`u8::is_ascii_whitespace`]. + /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes + /// the `\0x0B` byte even though it has the unicode WhiteSpace property + /// and is removed by [`str::trim_end`]. /// /// # Examples /// @@ -281,7 +285,9 @@ pub const fn trim_ascii_end(&self) -> &[u8] { /// removed. /// /// 'Whitespace' refers to the definition used by - /// [`u8::is_ascii_whitespace`]. + /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes + /// the `\0x0B` byte even though it has the unicode WhiteSpace property + /// and is removed by [`str::trim`]. /// /// # Examples /// diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 0d52bfb8c9aa..c9ac34e091f9 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -1202,6 +1202,9 @@ pub fn split_whitespace(&self) -> SplitWhitespace<'_> { /// /// This uses the same definition as [`char::is_ascii_whitespace`]. /// To split by Unicode `Whitespace` instead, use [`split_whitespace`]. 
+ /// Note that because of this difference in definition, even if `s.is_ascii()` + /// is `true`, `s.split_ascii_whitespace()` behavior will differ from `s.split_whitespace()` + /// if `s` contains U+000B VERTICAL TAB. /// /// [`split_whitespace`]: str::split_whitespace /// @@ -2896,7 +2899,9 @@ pub const fn make_ascii_lowercase(&mut self) { /// Returns a string slice with leading ASCII whitespace removed. /// /// 'Whitespace' refers to the definition used by - /// [`u8::is_ascii_whitespace`]. + /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes + /// the U+000B code point even though it has the unicode WhiteSpace property + /// and is removed by [`str::trim_start`]. /// /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace /// @@ -2921,7 +2926,9 @@ pub const fn trim_ascii_start(&self) -> &str { /// Returns a string slice with trailing ASCII whitespace removed. /// /// 'Whitespace' refers to the definition used by - /// [`u8::is_ascii_whitespace`]. + /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes + /// the U+000B code point even though it has the unicode WhiteSpace property + /// and is removed by [`str::trim_end`]. /// /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace /// @@ -2947,7 +2954,9 @@ pub const fn trim_ascii_end(&self) -> &str { /// removed. /// /// 'Whitespace' refers to the definition used by - /// [`u8::is_ascii_whitespace`]. + /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes + /// the U+000B code point even though it has the unicode WhiteSpace property + /// and is removed by [`str::trim`]. 
/// /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace /// From b52a38dbf368a4db698c07808e5a6306bee48ac5 Mon Sep 17 00:00:00 2001 From: Arthur Carcano Date: Mon, 13 Apr 2026 11:47:54 +0200 Subject: [PATCH 2/2] Add link to Unicode White_Space property --- library/core/src/slice/ascii.rs | 12 +++++++++--- library/core/src/str/mod.rs | 9 ++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 33ecc05c5695..9db07d8abbbe 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -223,9 +223,11 @@ pub fn escape_ascii(&self) -> EscapeAscii<'_> { /// /// 'Whitespace' refers to the definition used by /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes - /// the `\0x0B` byte even though it has the unicode WhiteSpace property + /// the `0x0B` byte even though it has the Unicode [`White_Space`] property /// and is removed by [`str::trim_start`]. /// + /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space + /// /// # Examples /// /// ``` @@ -254,9 +256,11 @@ pub const fn trim_ascii_start(&self) -> &[u8] { /// /// 'Whitespace' refers to the definition used by /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes - /// the `\0x0B` byte even though it has the unicode WhiteSpace property + /// the `0x0B` byte even though it has the Unicode [`White_Space`] property /// and is removed by [`str::trim_end`]. /// + /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space + /// /// # Examples /// /// ``` @@ -286,9 +290,11 @@ pub const fn trim_ascii_end(&self) -> &[u8] { /// /// 'Whitespace' refers to the definition used by /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes - /// the `\0x0B` byte even though it has the unicode WhiteSpace property + /// the `0x0B` byte even though it has the Unicode [`White_Space`] property /// and is removed by [`str::trim`]. 
/// + /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space + /// /// # Examples /// /// ``` diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index c9ac34e091f9..d4b77b90c1d1 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -2900,10 +2900,11 @@ pub const fn make_ascii_lowercase(&mut self) { /// /// 'Whitespace' refers to the definition used by /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes - /// the U+000B code point even though it has the unicode WhiteSpace property + /// the U+000B code point even though it has the Unicode [`White_Space`] property /// and is removed by [`str::trim_start`]. /// /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace + /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space /// /// # Examples /// @@ -2927,10 +2928,11 @@ pub const fn trim_ascii_start(&self) -> &str { /// /// 'Whitespace' refers to the definition used by /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes - /// the U+000B code point even though it has the unicode WhiteSpace property + /// the U+000B code point even though it has the Unicode [`White_Space`] property /// and is removed by [`str::trim_end`]. /// /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace + /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space /// /// # Examples /// @@ -2955,10 +2957,11 @@ pub const fn trim_ascii_end(&self) -> &str { /// /// 'Whitespace' refers to the definition used by /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes - /// the U+000B code point even though it has the unicode WhiteSpace property + /// the U+000B code point even though it has the Unicode [`White_Space`] property /// and is removed by [`str::trim`]. /// /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace + /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space /// /// # Examples ///