From e9b36bd374e4b00ad48ce5d6694c2ebbf256b121 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Wed, 1 Apr 2026 21:25:42 -0400 Subject: [PATCH] `core::unicode`: Replace `Cased` table with `Lt` Shaves off 368 bytes from the total size of all Unicode data tables. --- library/core/src/char/methods.rs | 8 +- library/core/src/unicode/mod.rs | 2 +- library/core/src/unicode/unicode_data.rs | 90 ++++++++----------- library/coretests/tests/unicode.rs | 6 +- library/coretests/tests/unicode/test_data.rs | 61 ++----------- src/tools/unicode-table-generator/src/main.rs | 2 +- 6 files changed, 51 insertions(+), 118 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 46d48afbf5a1..83c86488e31a 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -810,7 +810,7 @@ pub fn is_cased(self) -> bool { match self { 'a'..='z' | 'A'..='Z' => true, '\0'..='\u{A9}' => false, - _ => unicode::Cased(self), + _ => unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self), } } @@ -840,10 +840,10 @@ pub fn case(self) -> Option { 'a'..='z' => Some(CharCase::Lower), 'A'..='Z' => Some(CharCase::Upper), '\0'..='\u{A9}' => None, - _ if !unicode::Cased(self) => None, _ if unicode::Lowercase(self) => Some(CharCase::Lower), _ if unicode::Uppercase(self) => Some(CharCase::Upper), - _ => Some(CharCase::Title), + _ if unicode::Lt(self) => Some(CharCase::Title), + _ => None, } } @@ -919,7 +919,7 @@ pub const fn is_lowercase(self) -> bool { pub fn is_titlecase(self) -> bool { match self { '\0'..='\u{01C4}' => false, - _ => self.is_cased() && !self.is_lowercase() && !self.is_uppercase(), + _ => unicode::Lt(self), } } diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index 22a1166fdf16..8b2c526a0887 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -9,9 +9,9 @@ #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; -pub(crate) use unicode_data::cased::lookup as Cased; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; +pub(crate) use unicode_data::lt::lookup as Lt; pub(crate) use unicode_data::n::lookup as N; pub(crate) use unicode_data::uppercase::lookup as Uppercase; pub(crate) use unicode_data::white_space::lookup as White_Space; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index f602cd5c5b6b..83d380805184 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,16 +1,16 @@ //! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! // Alphabetic : 1723 bytes, 147369 codepoints in 759 ranges (U+0000AA - U+03347A) using skiplist // Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist -// Cased : 401 bytes, 4580 codepoints in 156 ranges (U+0000AA - U+01F18A) using skiplist // Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist // Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset +// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist // N : 463 bytes, 1914 codepoints in 145 ranges (U+0000B2 - U+01FBFA) using skiplist // Uppercase : 799 bytes, 1980 codepoints in 659 ranges (U+0000C0 - U+01F18A) using bitset // White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading // to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT // to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT // to_title : 340 bytes, 135 codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT -// Total : 9997 bytes +// Total : 9629 bytes #[inline(always)] const fn bitset_search< @@ -337,59 +337,6 @@ fn lookup_slow(c: char) -> bool { } } -#[rustfmt::skip] -pub mod cased { - use super::ShortOffsetRunHeader; - - static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [ - ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(51, 5024), - ShortOffsetRunHeader::new(61, 7296), ShortOffsetRunHeader::new(65, 7958), - ShortOffsetRunHeader::new(74, 9398), ShortOffsetRunHeader::new(149, 11264), - ShortOffsetRunHeader::new(151, 42560), ShortOffsetRunHeader::new(163, 43824), - ShortOffsetRunHeader::new(177, 64256), ShortOffsetRunHeader::new(183, 65313), - ShortOffsetRunHeader::new(187, 66560), ShortOffsetRunHeader::new(191, 67456), - ShortOffsetRunHeader::new(213, 68736), ShortOffsetRunHeader::new(221, 71840), - ShortOffsetRunHeader::new(229, 93760), ShortOffsetRunHeader::new(231, 119808), - ShortOffsetRunHeader::new(237, 120486), ShortOffsetRunHeader::new(274, 122624), - ShortOffsetRunHeader::new(297, 122928), ShortOffsetRunHeader::new(303, 125184), - ShortOffsetRunHeader::new(305, 127280), ShortOffsetRunHeader::new(307, 1241482), - ]; - static OFFSETS: [u8; 313] = [ - 170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 2, 35, 7, 2, 30, 5, 96, 1, 42, 4, - 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1, 1, - 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6, 2, 8, - 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116, - 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4, 1, 2, 4, - 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46, 18, 30, 132, - 102, 3, 4, 1, 77, 20, 6, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6, 26, 0, 80, 96, - 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3, 1, 42, 1, 9, 0, - 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 32, 25, 2, 25, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, - 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, - 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6, 0, - 62, 0, 68, 0, 26, 6, 26, 6, 26, 0, - ]; - #[inline] - pub fn lookup(c: char) -> bool { - debug_assert!(!c.is_ascii()); - (c as u32) >= 0xaa && lookup_slow(c) - } - - #[inline(never)] - fn lookup_slow(c: char) -> bool { - const { - assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); - let mut i = 0; - while i < SHORT_OFFSET_RUNS.len() { - assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); - i += 1; - } - } - // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` - // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. - unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } - } -} - #[rustfmt::skip] pub mod grapheme_extend { use super::ShortOffsetRunHeader; @@ -574,6 +521,39 @@ pub const fn lookup(c: char) -> bool { } } +#[rustfmt::skip] +pub mod lt { + use super::ShortOffsetRunHeader; + + static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [ + ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072), + ShortOffsetRunHeader::new(9, 1122301), + ]; + static OFFSETS: [u8; 21] = [ + 0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0, + ]; + #[inline] + pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0x1c5 && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { + const { + assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); + let mut i = 0; + while i < SHORT_OFFSET_RUNS.len() { + assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); + i += 1; + } + } + // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` + // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. + unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } + } +} + #[rustfmt::skip] pub mod n { use super::ShortOffsetRunHeader; diff --git a/library/coretests/tests/unicode.rs b/library/coretests/tests/unicode.rs index a8a221db8f95..12eed25a1fea 100644 --- a/library/coretests/tests/unicode.rs +++ b/library/coretests/tests/unicode.rs @@ -60,9 +60,9 @@ fn case_ignorable() { #[test] #[cfg_attr(miri, ignore)] // Miri is too slow -fn cased() { - test_boolean_property(test_data::CASED, unicode_data::cased::lookup); - test_boolean_property(test_data::CASED, char::is_cased); +fn lt() { + test_boolean_property(test_data::LT, unicode_data::lt::lookup); + test_boolean_property(test_data::LT, char::is_titlecase); } #[test] diff --git a/library/coretests/tests/unicode/test_data.rs b/library/coretests/tests/unicode/test_data.rs index 3071aedcae07..962770a0ff83 100644 --- a/library/coretests/tests/unicode/test_data.rs +++ b/library/coretests/tests/unicode/test_data.rs @@ -392,60 +392,6 @@ '\u{e0100}'..='\u{e01ef}', ]; -#[rustfmt::skip] -pub(super) static CASED: &[RangeInclusive; 156] = &[ - '\u{aa}'..='\u{aa}', '\u{b5}'..='\u{b5}', '\u{ba}'..='\u{ba}', '\u{c0}'..='\u{d6}', - '\u{d8}'..='\u{f6}', '\u{f8}'..='\u{1ba}', '\u{1bc}'..='\u{1bf}', '\u{1c4}'..='\u{293}', - '\u{296}'..='\u{2b8}', '\u{2c0}'..='\u{2c1}', '\u{2e0}'..='\u{2e4}', '\u{345}'..='\u{345}', - '\u{370}'..='\u{373}', '\u{376}'..='\u{377}', '\u{37a}'..='\u{37d}', '\u{37f}'..='\u{37f}', - '\u{386}'..='\u{386}', '\u{388}'..='\u{38a}', '\u{38c}'..='\u{38c}', '\u{38e}'..='\u{3a1}', - '\u{3a3}'..='\u{3f5}', '\u{3f7}'..='\u{481}', '\u{48a}'..='\u{52f}', '\u{531}'..='\u{556}', - '\u{560}'..='\u{588}', '\u{10a0}'..='\u{10c5}', '\u{10c7}'..='\u{10c7}', - '\u{10cd}'..='\u{10cd}', '\u{10d0}'..='\u{10fa}', '\u{10fc}'..='\u{10ff}', - '\u{13a0}'..='\u{13f5}', '\u{13f8}'..='\u{13fd}', '\u{1c80}'..='\u{1c8a}', - '\u{1c90}'..='\u{1cba}', '\u{1cbd}'..='\u{1cbf}', '\u{1d00}'..='\u{1dbf}', - '\u{1e00}'..='\u{1f15}', '\u{1f18}'..='\u{1f1d}', '\u{1f20}'..='\u{1f45}', - '\u{1f48}'..='\u{1f4d}', '\u{1f50}'..='\u{1f57}', '\u{1f59}'..='\u{1f59}', - '\u{1f5b}'..='\u{1f5b}', '\u{1f5d}'..='\u{1f5d}', '\u{1f5f}'..='\u{1f7d}', - '\u{1f80}'..='\u{1fb4}', '\u{1fb6}'..='\u{1fbc}', '\u{1fbe}'..='\u{1fbe}', - '\u{1fc2}'..='\u{1fc4}', '\u{1fc6}'..='\u{1fcc}', '\u{1fd0}'..='\u{1fd3}', - '\u{1fd6}'..='\u{1fdb}', '\u{1fe0}'..='\u{1fec}', '\u{1ff2}'..='\u{1ff4}', - '\u{1ff6}'..='\u{1ffc}', '\u{2071}'..='\u{2071}', '\u{207f}'..='\u{207f}', - '\u{2090}'..='\u{209c}', '\u{2102}'..='\u{2102}', '\u{2107}'..='\u{2107}', - '\u{210a}'..='\u{2113}', '\u{2115}'..='\u{2115}', '\u{2119}'..='\u{211d}', - '\u{2124}'..='\u{2124}', '\u{2126}'..='\u{2126}', '\u{2128}'..='\u{2128}', - '\u{212a}'..='\u{212d}', '\u{212f}'..='\u{2134}', '\u{2139}'..='\u{2139}', - '\u{213c}'..='\u{213f}', '\u{2145}'..='\u{2149}', '\u{214e}'..='\u{214e}', - '\u{2160}'..='\u{217f}', '\u{2183}'..='\u{2184}', '\u{24b6}'..='\u{24e9}', - '\u{2c00}'..='\u{2ce4}', '\u{2ceb}'..='\u{2cee}', '\u{2cf2}'..='\u{2cf3}', - '\u{2d00}'..='\u{2d25}', '\u{2d27}'..='\u{2d27}', '\u{2d2d}'..='\u{2d2d}', - '\u{a640}'..='\u{a66d}', '\u{a680}'..='\u{a69d}', '\u{a722}'..='\u{a787}', - '\u{a78b}'..='\u{a78e}', '\u{a790}'..='\u{a7dc}', '\u{a7f1}'..='\u{a7f6}', - '\u{a7f8}'..='\u{a7fa}', '\u{ab30}'..='\u{ab5a}', '\u{ab5c}'..='\u{ab69}', - '\u{ab70}'..='\u{abbf}', '\u{fb00}'..='\u{fb06}', '\u{fb13}'..='\u{fb17}', - '\u{ff21}'..='\u{ff3a}', '\u{ff41}'..='\u{ff5a}', '\u{10400}'..='\u{1044f}', - '\u{104b0}'..='\u{104d3}', '\u{104d8}'..='\u{104fb}', '\u{10570}'..='\u{1057a}', - '\u{1057c}'..='\u{1058a}', '\u{1058c}'..='\u{10592}', '\u{10594}'..='\u{10595}', - '\u{10597}'..='\u{105a1}', '\u{105a3}'..='\u{105b1}', '\u{105b3}'..='\u{105b9}', - '\u{105bb}'..='\u{105bc}', '\u{10780}'..='\u{10780}', '\u{10783}'..='\u{10785}', - '\u{10787}'..='\u{107b0}', '\u{107b2}'..='\u{107ba}', '\u{10c80}'..='\u{10cb2}', - '\u{10cc0}'..='\u{10cf2}', '\u{10d50}'..='\u{10d65}', '\u{10d70}'..='\u{10d85}', - '\u{118a0}'..='\u{118df}', '\u{16e40}'..='\u{16e7f}', '\u{16ea0}'..='\u{16eb8}', - '\u{16ebb}'..='\u{16ed3}', '\u{1d400}'..='\u{1d454}', '\u{1d456}'..='\u{1d49c}', - '\u{1d49e}'..='\u{1d49f}', '\u{1d4a2}'..='\u{1d4a2}', '\u{1d4a5}'..='\u{1d4a6}', - '\u{1d4a9}'..='\u{1d4ac}', '\u{1d4ae}'..='\u{1d4b9}', '\u{1d4bb}'..='\u{1d4bb}', - '\u{1d4bd}'..='\u{1d4c3}', '\u{1d4c5}'..='\u{1d505}', '\u{1d507}'..='\u{1d50a}', - '\u{1d50d}'..='\u{1d514}', '\u{1d516}'..='\u{1d51c}', '\u{1d51e}'..='\u{1d539}', - '\u{1d53b}'..='\u{1d53e}', '\u{1d540}'..='\u{1d544}', '\u{1d546}'..='\u{1d546}', - '\u{1d54a}'..='\u{1d550}', '\u{1d552}'..='\u{1d6a5}', '\u{1d6a8}'..='\u{1d6c0}', - '\u{1d6c2}'..='\u{1d6da}', '\u{1d6dc}'..='\u{1d6fa}', '\u{1d6fc}'..='\u{1d714}', - '\u{1d716}'..='\u{1d734}', '\u{1d736}'..='\u{1d74e}', '\u{1d750}'..='\u{1d76e}', - '\u{1d770}'..='\u{1d788}', '\u{1d78a}'..='\u{1d7a8}', '\u{1d7aa}'..='\u{1d7c2}', - '\u{1d7c4}'..='\u{1d7cb}', '\u{1df00}'..='\u{1df09}', '\u{1df0b}'..='\u{1df1e}', - '\u{1df25}'..='\u{1df2a}', '\u{1e030}'..='\u{1e06d}', '\u{1e900}'..='\u{1e943}', - '\u{1f130}'..='\u{1f149}', '\u{1f150}'..='\u{1f169}', '\u{1f170}'..='\u{1f189}', -]; - #[rustfmt::skip] pub(super) static GRAPHEME_EXTEND: &[RangeInclusive; 383] = &[ '\u{300}'..='\u{36f}', '\u{483}'..='\u{489}', '\u{591}'..='\u{5bd}', '\u{5bf}'..='\u{5bf}', @@ -776,6 +722,13 @@ '\u{1e030}'..='\u{1e06d}', '\u{1e922}'..='\u{1e943}', ]; +#[rustfmt::skip] +pub(super) static LT: &[RangeInclusive; 10] = &[ + '\u{1c5}'..='\u{1c5}', '\u{1c8}'..='\u{1c8}', '\u{1cb}'..='\u{1cb}', '\u{1f2}'..='\u{1f2}', + '\u{1f88}'..='\u{1f8f}', '\u{1f98}'..='\u{1f9f}', '\u{1fa8}'..='\u{1faf}', + '\u{1fbc}'..='\u{1fbc}', '\u{1fcc}'..='\u{1fcc}', '\u{1ffc}'..='\u{1ffc}', +]; + #[rustfmt::skip] pub(super) static N: &[RangeInclusive; 145] = &[ '\u{b2}'..='\u{b3}', '\u{b9}'..='\u{b9}', '\u{bc}'..='\u{be}', '\u{660}'..='\u{669}', diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index cdd137ff56a5..398b4c7b7ec5 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -91,11 +91,11 @@ "Alphabetic", "Lowercase", "Uppercase", - "Cased", "Case_Ignorable", "Grapheme_Extend", "White_Space", "N", + "Lt", ]; struct UnicodeData {