Rollup merge of #155028 - Brace1000:whitespace-tests, r=chenyukang

tests: add whitespace tests for vertical tab behavior This PR adds two small tests to highlight how vertical tab (\x0B) is handled differently across Rust's whitespace definitions. The Rust lexer treats vertical tab as whitespace (Unicode Pattern_White_Space), while `split_ascii_whitespace` follows the WhatWG Infra Standard and does not include vertical tab. These tests make that difference visible and easier to understand. See: https://github.com/rustfoundation/interop-initiative/issues/53
2026-04-27 18:57:42 +03:00 · 2026-04-21 08:22:14 -04:00
parent 93637f398f c2c486a3c0
commit 2c1f01728d
5 changed files with 118 additions and 0 deletions
@@ -1588,6 +1588,21 @@ Tests on various well-formedness checks, e.g. [Type-checking normal functions](h

 Tests on `where` clauses. See [Where clauses | Reference](https://doc.rust-lang.org/reference/items/generics.html#where-clauses).

+## `tests/ui/whitespace/`
+
+Tests for whitespace handling in the Rust lexer. The Rust language
+defines whitespace as Unicode Pattern_White_Space, which is not the
+same as what the standard library gives you:
+
+- `is_ascii_whitespace` follows the WhatWG Infra Standard and excludes
+  vertical tab (`\x0B`)
+- `is_whitespace` matches Unicode White_Space, a larger set that omits U+200E/U+200F
+
+These tests make that gap visible and check that the lexer accepts
+Pattern_White_Space characters (11 in total) between tokens.
+
+See: https://github.com/rustfoundation/interop-initiative/issues/53
+
 ## `tests/ui/windows-subsystem/`: `#![windows_subsystem = ""]`

 See [the `windows_subsystem` attribute](https://doc.rust-lang.org/reference/runtime.html#the-windows_subsystem-attribute).
@@ -0,0 +1,22 @@
+//@ run-pass
+// This test checks that split_ascii_whitespace does NOT split on
+// vertical tab (\x0B), because the standard library uses the WhatWG
+// Infra Standard definition of ASCII whitespace, which excludes
+// vertical tab.
+//
+// See: https://github.com/rustfoundation/interop-initiative/issues/53
+
+fn main() {
+    let s = "a\x0Bb"; // "a" and "b" separated only by a vertical tab (U+000B)
+
+    let parts: Vec<&str> = s.split_ascii_whitespace().collect();
+
+    assert_eq!(parts.len(), 1, // a single piece means the vertical tab did not split
+        "vertical tab should not be treated as ASCII whitespace");
+
+    let s2 = "a b"; // control case: an ordinary space must still split
+    let parts2: Vec<&str> = s2.split_ascii_whitespace().collect();
+    assert_eq!(parts2.len(), 2,
+        "regular space should split correctly");
+
+}
@@ -0,0 +1,13 @@
+// This test ensures that the Rust lexer rejects invisible characters that are
+// not Pattern_White_Space, such as ZERO WIDTH SPACE (U+200B).
+
+//@ check-fail
+
+fn main() {
+    let x = 5;
+    let y = 10;
+
+    let a=x + y;
+    //~^ ERROR unknown start of token
+    //~| HELP invisible characters like
+}
@@ -0,0 +1,10 @@
+error: unknown start of token: \u{200b}
+  --> $DIR/invalid_whitespace.rs:10:11
+   |
+LL |     let a=x + y;
+   |           ^
+   |
+   = help: invisible characters like '\u{200b}' are not usually visible in text editors
+
+error: aborting due to 1 previous error
+
@@ -0,0 +1,58 @@
+//@ run-pass
+// ignore-tidy-tab
+//
+// Tests that the Rust lexer accepts Unicode Pattern_White_Space characters.
+//
+// Worth noting: the Rust reference defines whitespace as Pattern_White_Space,
+// which is not the same as what is_ascii_whitespace or is_whitespace give you.
+//
+// is_ascii_whitespace follows WhatWG and excludes vertical tab (\x0B).
+// is_whitespace uses Unicode White_Space, a larger set that omits \u{200E} and \u{200F}.
+//
+// The 11 characters that actually count as whitespace in Rust source:
+//   \x09 \x0A \x0B \x0C \x0D \x20 \u{85} \u{200E} \u{200F} \u{2028} \u{2029}
+//
+// Ref: https://github.com/rustfoundation/interop-initiative/issues/53
+
+#[rustfmt::skip]
+fn main() {
+    // tab (\x09) between let and the name
+    let	_ws1 = 1_i32;
+
+    // vertical tab (\x0B) between let and the name
+    // this is the one is_ascii_whitespace does not count as whitespace
+    let_ws2 = 2_i32;
+
+    // form feed (\x0C) between let and the name
+    let_ws3 = 3_i32;
+
+    // plain space (\x20), here just so every character is represented
+    let _ws4 = 4_i32;
+
+    // NEL (\u{85}) between let and the name
+    let_ws5 = 5_i32;
+
+    // left-to-right mark (\u{200E}) between let and the name
+    let‎_ws6 = 6_i32;
+
+    // right-to-left mark (\u{200F}) between let and the name
+    let‏_ws7 = 7_i32;
+
+    // \x0A, \x0D, \u{2028}, \u{2029} are also Pattern_White_Space, but they read as
+    // line breaks, so they are not embedded mid-statement here. The lexer
+    // treats them as ordinary whitespace wherever they separate tokens.
+
+    // These are Unicode White_Space but NOT Pattern_White_Space:
+    //   \u{A0}   no-break space       \u{1680} ogham space mark
+    //   \u{2000} en quad              \u{2001} em quad
+    //   \u{2002} en space             \u{2003} em space
+    //   \u{2004} three-per-em space   \u{2005} four-per-em space
+    //   \u{2006} six-per-em space     \u{2007} figure space
+    //   \u{2008} punctuation space    \u{2009} thin space
+    //   \u{200A} hair space           \u{202F} narrow no-break space
+    //   \u{205F} medium math space    \u{3000} ideographic space
+
+    // add them up so the compiler doesn't complain about unused variables
+    let _sum = _ws1 + _ws2 + _ws3 + _ws4 + _ws5 + _ws6 + _ws7;
+    println!("{}", _sum);
+}