Rollup merge of #154765 - krtab:doc_ascii_whitespace, r=Mark-Simulacrum,WaffleLapkin

Clarify ascii whitespace exclusion of vertical tab in the doc. This especially means that for `c: char`, `c.is_ascii() && c.is_whitespace()` does **not** imply `c.is_ascii_whitespace()`, which can cause bugs and is highly counterintuitive.
2026-04-27 18:57:42 +03:00 · 2026-04-14 23:02:33 -04:00
parent 3f19aa5672 b52a38dbf3
commit 78a1300310
4 changed files with 36 additions and 6 deletions
@@ -1932,6 +1932,9 @@ pub const fn is_ascii_graphic(&self) -> bool {
    /// U+0020 SPACE, U+0009 HORIZONTAL TAB, U+000A LINE FEED,
    /// U+000C FORM FEED, or U+000D CARRIAGE RETURN.
    ///
+    /// **Warning:** Because the list above excludes U+000B VERTICAL TAB,
+    /// `c.is_ascii_whitespace()` is **not** equivalent to `c.is_ascii() && c.is_whitespace()`.
+    ///
    /// Rust uses the WhatWG Infra Standard's [definition of ASCII
    /// whitespace][infra-aw]. There are several other definitions in
    /// wide use. For instance, [the POSIX locale][pct] includes
@@ -1065,6 +1065,9 @@ pub const fn is_ascii_graphic(&self) -> bool {
    /// U+0020 SPACE, U+0009 HORIZONTAL TAB, U+000A LINE FEED,
    /// U+000C FORM FEED, or U+000D CARRIAGE RETURN.
    ///
+    /// **Warning:** Because the list above excludes U+000B VERTICAL TAB,
+    /// `b.is_ascii_whitespace()` is **not** equivalent to `char::from(b).is_whitespace()`.
+    ///
    /// Rust uses the WhatWG Infra Standard's [definition of ASCII
    /// whitespace][infra-aw]. There are several other definitions in
    /// wide use. For instance, [the POSIX locale][pct] includes
@@ -222,7 +222,11 @@ pub fn escape_ascii(&self) -> EscapeAscii<'_> {
    /// Returns a byte slice with leading ASCII whitespace bytes removed.
    ///
    /// 'Whitespace' refers to the definition used by
-    /// [`u8::is_ascii_whitespace`].
+    /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes
+    /// the `0x0B` (vertical tab) byte even though it has the Unicode [`White_Space`] property
+    /// and is removed by [`str::trim_start`].
+    ///
+    /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space
    ///
    /// # Examples
    ///
@@ -251,7 +255,11 @@ pub const fn trim_ascii_start(&self) -> &[u8] {
    /// Returns a byte slice with trailing ASCII whitespace bytes removed.
    ///
    /// 'Whitespace' refers to the definition used by
-    /// [`u8::is_ascii_whitespace`].
+    /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes
+    /// the `0x0B` (vertical tab) byte even though it has the Unicode [`White_Space`] property
+    /// and is removed by [`str::trim_end`].
+    ///
+    /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space
    ///
    /// # Examples
    ///
@@ -281,7 +289,11 @@ pub const fn trim_ascii_end(&self) -> &[u8] {
    /// removed.
    ///
    /// 'Whitespace' refers to the definition used by
-    /// [`u8::is_ascii_whitespace`].
+    /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes
+    /// the `0x0B` (vertical tab) byte even though it has the Unicode [`White_Space`] property
+    /// and is removed by [`str::trim`].
+    ///
+    /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space
    ///
    /// # Examples
    ///
@@ -1202,6 +1202,9 @@ pub fn split_whitespace(&self) -> SplitWhitespace<'_> {
    ///
    /// This uses the same definition as [`char::is_ascii_whitespace`].
    /// To split by Unicode `White_Space` instead, use [`split_whitespace`].
+    /// Note that because of this difference in definition, even if `s.is_ascii()`
+    /// is `true`, the behavior of `s.split_ascii_whitespace()` will differ from that of
+    /// `s.split_whitespace()` if `s` contains U+000B VERTICAL TAB.
    ///
    /// [`split_whitespace`]: str::split_whitespace
    ///
@@ -2896,9 +2899,12 @@ pub const fn make_ascii_lowercase(&mut self) {
    /// Returns a string slice with leading ASCII whitespace removed.
    ///
    /// 'Whitespace' refers to the definition used by
-    /// [`u8::is_ascii_whitespace`].
+    /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes
+    /// the U+000B code point even though it has the Unicode [`White_Space`] property
+    /// and is removed by [`str::trim_start`].
    ///
    /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace
+    /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space
    ///
    /// # Examples
    ///
@@ -2921,9 +2927,12 @@ pub const fn trim_ascii_start(&self) -> &str {
    /// Returns a string slice with trailing ASCII whitespace removed.
    ///
    /// 'Whitespace' refers to the definition used by
-    /// [`u8::is_ascii_whitespace`].
+    /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes
+    /// the U+000B code point even though it has the Unicode [`White_Space`] property
+    /// and is removed by [`str::trim_end`].
    ///
    /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace
+    /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space
    ///
    /// # Examples
    ///
@@ -2947,9 +2956,12 @@ pub const fn trim_ascii_end(&self) -> &str {
    /// removed.
    ///
    /// 'Whitespace' refers to the definition used by
-    /// [`u8::is_ascii_whitespace`].
+    /// [`u8::is_ascii_whitespace`]. Importantly, this definition excludes
+    /// the U+000B code point even though it has the Unicode [`White_Space`] property
+    /// and is removed by [`str::trim`].
    ///
    /// [`u8::is_ascii_whitespace`]: u8::is_ascii_whitespace
+    /// [`White_Space`]: https://www.unicode.org/reports/tr44/#White_Space
    ///
    /// # Examples
    ///