mirror of
https://github.com/rust-lang/rust.git
synced 2026-04-27 18:57:42 +03:00
Rollup merge of #154699 - Jules-Bertholet:lt-for-cased, r=scottmcm
`core::unicode`: Replace `Cased` table with `Lt`

Shaves off 368 bytes from the total size of all Unicode data tables.

@rustbot label A-unicode
This commit is contained in:
@@ -810,7 +810,7 @@ pub fn is_cased(self) -> bool {
|
||||
match self {
|
||||
'a'..='z' | 'A'..='Z' => true,
|
||||
'\0'..='\u{A9}' => false,
|
||||
_ => unicode::Cased(self),
|
||||
_ => unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -840,10 +840,10 @@ pub fn case(self) -> Option<CharCase> {
|
||||
'a'..='z' => Some(CharCase::Lower),
|
||||
'A'..='Z' => Some(CharCase::Upper),
|
||||
'\0'..='\u{A9}' => None,
|
||||
_ if !unicode::Cased(self) => None,
|
||||
_ if unicode::Lowercase(self) => Some(CharCase::Lower),
|
||||
_ if unicode::Uppercase(self) => Some(CharCase::Upper),
|
||||
_ => Some(CharCase::Title),
|
||||
_ if unicode::Lt(self) => Some(CharCase::Title),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -919,7 +919,7 @@ pub const fn is_lowercase(self) -> bool {
|
||||
pub fn is_titlecase(self) -> bool {
|
||||
match self {
|
||||
'\0'..='\u{01C4}' => false,
|
||||
_ => self.is_cased() && !self.is_lowercase() && !self.is_uppercase(),
|
||||
_ => unicode::Lt(self),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -9,9 +9,9 @@
|
||||
#[rustfmt::skip]
|
||||
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
|
||||
pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable;
|
||||
pub(crate) use unicode_data::cased::lookup as Cased;
|
||||
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
|
||||
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
|
||||
pub(crate) use unicode_data::lt::lookup as Lt;
|
||||
pub(crate) use unicode_data::n::lookup as N;
|
||||
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
|
||||
pub(crate) use unicode_data::white_space::lookup as White_Space;
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
|
||||
// Alphabetic : 1723 bytes, 147369 codepoints in 759 ranges (U+0000AA - U+03347A) using skiplist
|
||||
// Case_Ignorable : 1063 bytes, 2789 codepoints in 459 ranges (U+0000A8 - U+0E01F0) using skiplist
|
||||
// Cased : 401 bytes, 4580 codepoints in 156 ranges (U+0000AA - U+01F18A) using skiplist
|
||||
// Grapheme_Extend : 899 bytes, 2232 codepoints in 383 ranges (U+000300 - U+0E01F0) using skiplist
|
||||
// Lowercase : 943 bytes, 2569 codepoints in 676 ranges (U+0000AA - U+01E944) using bitset
|
||||
// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist
|
||||
// N : 463 bytes, 1914 codepoints in 145 ranges (U+0000B2 - U+01FBFA) using skiplist
|
||||
// Uppercase : 799 bytes, 1980 codepoints in 659 ranges (U+0000C0 - U+01F18A) using bitset
|
||||
// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading
|
||||
// to_lower : 1112 bytes, 1462 codepoints in 185 ranges (U+0000C0 - U+01E921) using 2-level LUT
|
||||
// to_upper : 1998 bytes, 1554 codepoints in 299 ranges (U+0000B5 - U+01E943) using 2-level LUT
|
||||
// to_title : 340 bytes, 135 codepoints in 49 ranges (U+0000DF - U+00FB17) using 2-level LUT
|
||||
// Total : 9997 bytes
|
||||
// Total : 9629 bytes
|
||||
|
||||
#[inline(always)]
|
||||
const fn bitset_search<
|
||||
@@ -337,59 +337,6 @@ fn lookup_slow(c: char) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub mod cased {
|
||||
use super::ShortOffsetRunHeader;
|
||||
|
||||
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [
|
||||
ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(51, 5024),
|
||||
ShortOffsetRunHeader::new(61, 7296), ShortOffsetRunHeader::new(65, 7958),
|
||||
ShortOffsetRunHeader::new(74, 9398), ShortOffsetRunHeader::new(149, 11264),
|
||||
ShortOffsetRunHeader::new(151, 42560), ShortOffsetRunHeader::new(163, 43824),
|
||||
ShortOffsetRunHeader::new(177, 64256), ShortOffsetRunHeader::new(183, 65313),
|
||||
ShortOffsetRunHeader::new(187, 66560), ShortOffsetRunHeader::new(191, 67456),
|
||||
ShortOffsetRunHeader::new(213, 68736), ShortOffsetRunHeader::new(221, 71840),
|
||||
ShortOffsetRunHeader::new(229, 93760), ShortOffsetRunHeader::new(231, 119808),
|
||||
ShortOffsetRunHeader::new(237, 120486), ShortOffsetRunHeader::new(274, 122624),
|
||||
ShortOffsetRunHeader::new(297, 122928), ShortOffsetRunHeader::new(303, 125184),
|
||||
ShortOffsetRunHeader::new(305, 127280), ShortOffsetRunHeader::new(307, 1241482),
|
||||
];
|
||||
static OFFSETS: [u8; 313] = [
|
||||
170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 2, 35, 7, 2, 30, 5, 96, 1, 42, 4,
|
||||
2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1, 1,
|
||||
5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6, 2, 8,
|
||||
1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116,
|
||||
1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4, 1, 2, 4,
|
||||
5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46, 18, 30, 132,
|
||||
102, 3, 4, 1, 77, 20, 6, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6, 26, 0, 80, 96,
|
||||
36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3, 1, 42, 1, 9, 0,
|
||||
51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 32, 25, 2, 25, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2,
|
||||
4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25,
|
||||
1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6, 0,
|
||||
62, 0, 68, 0, 26, 6, 26, 6, 26, 0,
|
||||
];
|
||||
#[inline]
|
||||
pub fn lookup(c: char) -> bool {
|
||||
debug_assert!(!c.is_ascii());
|
||||
(c as u32) >= 0xaa && lookup_slow(c)
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn lookup_slow(c: char) -> bool {
|
||||
const {
|
||||
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
|
||||
let mut i = 0;
|
||||
while i < SHORT_OFFSET_RUNS.len() {
|
||||
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
|
||||
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
|
||||
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
|
||||
}
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub mod grapheme_extend {
|
||||
use super::ShortOffsetRunHeader;
|
||||
@@ -574,6 +521,39 @@ pub const fn lookup(c: char) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub mod lt {
|
||||
use super::ShortOffsetRunHeader;
|
||||
|
||||
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [
|
||||
ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072),
|
||||
ShortOffsetRunHeader::new(9, 1122301),
|
||||
];
|
||||
static OFFSETS: [u8; 21] = [
|
||||
0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0,
|
||||
];
|
||||
#[inline]
|
||||
pub fn lookup(c: char) -> bool {
|
||||
debug_assert!(!c.is_ascii());
|
||||
(c as u32) >= 0x1c5 && lookup_slow(c)
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn lookup_slow(c: char) -> bool {
|
||||
const {
|
||||
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
|
||||
let mut i = 0;
|
||||
while i < SHORT_OFFSET_RUNS.len() {
|
||||
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
|
||||
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
|
||||
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
|
||||
}
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub mod n {
|
||||
use super::ShortOffsetRunHeader;
|
||||
|
||||
@@ -60,9 +60,9 @@ fn case_ignorable() {
|
||||
|
||||
#[test]
|
||||
#[cfg_attr(miri, ignore)] // Miri is too slow
|
||||
fn cased() {
|
||||
test_boolean_property(test_data::CASED, unicode_data::cased::lookup);
|
||||
test_boolean_property(test_data::CASED, char::is_cased);
|
||||
fn lt() {
|
||||
test_boolean_property(test_data::LT, unicode_data::lt::lookup);
|
||||
test_boolean_property(test_data::LT, char::is_titlecase);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -392,60 +392,6 @@
|
||||
'\u{e0100}'..='\u{e01ef}',
|
||||
];
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub(super) static CASED: &[RangeInclusive<char>; 156] = &[
|
||||
'\u{aa}'..='\u{aa}', '\u{b5}'..='\u{b5}', '\u{ba}'..='\u{ba}', '\u{c0}'..='\u{d6}',
|
||||
'\u{d8}'..='\u{f6}', '\u{f8}'..='\u{1ba}', '\u{1bc}'..='\u{1bf}', '\u{1c4}'..='\u{293}',
|
||||
'\u{296}'..='\u{2b8}', '\u{2c0}'..='\u{2c1}', '\u{2e0}'..='\u{2e4}', '\u{345}'..='\u{345}',
|
||||
'\u{370}'..='\u{373}', '\u{376}'..='\u{377}', '\u{37a}'..='\u{37d}', '\u{37f}'..='\u{37f}',
|
||||
'\u{386}'..='\u{386}', '\u{388}'..='\u{38a}', '\u{38c}'..='\u{38c}', '\u{38e}'..='\u{3a1}',
|
||||
'\u{3a3}'..='\u{3f5}', '\u{3f7}'..='\u{481}', '\u{48a}'..='\u{52f}', '\u{531}'..='\u{556}',
|
||||
'\u{560}'..='\u{588}', '\u{10a0}'..='\u{10c5}', '\u{10c7}'..='\u{10c7}',
|
||||
'\u{10cd}'..='\u{10cd}', '\u{10d0}'..='\u{10fa}', '\u{10fc}'..='\u{10ff}',
|
||||
'\u{13a0}'..='\u{13f5}', '\u{13f8}'..='\u{13fd}', '\u{1c80}'..='\u{1c8a}',
|
||||
'\u{1c90}'..='\u{1cba}', '\u{1cbd}'..='\u{1cbf}', '\u{1d00}'..='\u{1dbf}',
|
||||
'\u{1e00}'..='\u{1f15}', '\u{1f18}'..='\u{1f1d}', '\u{1f20}'..='\u{1f45}',
|
||||
'\u{1f48}'..='\u{1f4d}', '\u{1f50}'..='\u{1f57}', '\u{1f59}'..='\u{1f59}',
|
||||
'\u{1f5b}'..='\u{1f5b}', '\u{1f5d}'..='\u{1f5d}', '\u{1f5f}'..='\u{1f7d}',
|
||||
'\u{1f80}'..='\u{1fb4}', '\u{1fb6}'..='\u{1fbc}', '\u{1fbe}'..='\u{1fbe}',
|
||||
'\u{1fc2}'..='\u{1fc4}', '\u{1fc6}'..='\u{1fcc}', '\u{1fd0}'..='\u{1fd3}',
|
||||
'\u{1fd6}'..='\u{1fdb}', '\u{1fe0}'..='\u{1fec}', '\u{1ff2}'..='\u{1ff4}',
|
||||
'\u{1ff6}'..='\u{1ffc}', '\u{2071}'..='\u{2071}', '\u{207f}'..='\u{207f}',
|
||||
'\u{2090}'..='\u{209c}', '\u{2102}'..='\u{2102}', '\u{2107}'..='\u{2107}',
|
||||
'\u{210a}'..='\u{2113}', '\u{2115}'..='\u{2115}', '\u{2119}'..='\u{211d}',
|
||||
'\u{2124}'..='\u{2124}', '\u{2126}'..='\u{2126}', '\u{2128}'..='\u{2128}',
|
||||
'\u{212a}'..='\u{212d}', '\u{212f}'..='\u{2134}', '\u{2139}'..='\u{2139}',
|
||||
'\u{213c}'..='\u{213f}', '\u{2145}'..='\u{2149}', '\u{214e}'..='\u{214e}',
|
||||
'\u{2160}'..='\u{217f}', '\u{2183}'..='\u{2184}', '\u{24b6}'..='\u{24e9}',
|
||||
'\u{2c00}'..='\u{2ce4}', '\u{2ceb}'..='\u{2cee}', '\u{2cf2}'..='\u{2cf3}',
|
||||
'\u{2d00}'..='\u{2d25}', '\u{2d27}'..='\u{2d27}', '\u{2d2d}'..='\u{2d2d}',
|
||||
'\u{a640}'..='\u{a66d}', '\u{a680}'..='\u{a69d}', '\u{a722}'..='\u{a787}',
|
||||
'\u{a78b}'..='\u{a78e}', '\u{a790}'..='\u{a7dc}', '\u{a7f1}'..='\u{a7f6}',
|
||||
'\u{a7f8}'..='\u{a7fa}', '\u{ab30}'..='\u{ab5a}', '\u{ab5c}'..='\u{ab69}',
|
||||
'\u{ab70}'..='\u{abbf}', '\u{fb00}'..='\u{fb06}', '\u{fb13}'..='\u{fb17}',
|
||||
'\u{ff21}'..='\u{ff3a}', '\u{ff41}'..='\u{ff5a}', '\u{10400}'..='\u{1044f}',
|
||||
'\u{104b0}'..='\u{104d3}', '\u{104d8}'..='\u{104fb}', '\u{10570}'..='\u{1057a}',
|
||||
'\u{1057c}'..='\u{1058a}', '\u{1058c}'..='\u{10592}', '\u{10594}'..='\u{10595}',
|
||||
'\u{10597}'..='\u{105a1}', '\u{105a3}'..='\u{105b1}', '\u{105b3}'..='\u{105b9}',
|
||||
'\u{105bb}'..='\u{105bc}', '\u{10780}'..='\u{10780}', '\u{10783}'..='\u{10785}',
|
||||
'\u{10787}'..='\u{107b0}', '\u{107b2}'..='\u{107ba}', '\u{10c80}'..='\u{10cb2}',
|
||||
'\u{10cc0}'..='\u{10cf2}', '\u{10d50}'..='\u{10d65}', '\u{10d70}'..='\u{10d85}',
|
||||
'\u{118a0}'..='\u{118df}', '\u{16e40}'..='\u{16e7f}', '\u{16ea0}'..='\u{16eb8}',
|
||||
'\u{16ebb}'..='\u{16ed3}', '\u{1d400}'..='\u{1d454}', '\u{1d456}'..='\u{1d49c}',
|
||||
'\u{1d49e}'..='\u{1d49f}', '\u{1d4a2}'..='\u{1d4a2}', '\u{1d4a5}'..='\u{1d4a6}',
|
||||
'\u{1d4a9}'..='\u{1d4ac}', '\u{1d4ae}'..='\u{1d4b9}', '\u{1d4bb}'..='\u{1d4bb}',
|
||||
'\u{1d4bd}'..='\u{1d4c3}', '\u{1d4c5}'..='\u{1d505}', '\u{1d507}'..='\u{1d50a}',
|
||||
'\u{1d50d}'..='\u{1d514}', '\u{1d516}'..='\u{1d51c}', '\u{1d51e}'..='\u{1d539}',
|
||||
'\u{1d53b}'..='\u{1d53e}', '\u{1d540}'..='\u{1d544}', '\u{1d546}'..='\u{1d546}',
|
||||
'\u{1d54a}'..='\u{1d550}', '\u{1d552}'..='\u{1d6a5}', '\u{1d6a8}'..='\u{1d6c0}',
|
||||
'\u{1d6c2}'..='\u{1d6da}', '\u{1d6dc}'..='\u{1d6fa}', '\u{1d6fc}'..='\u{1d714}',
|
||||
'\u{1d716}'..='\u{1d734}', '\u{1d736}'..='\u{1d74e}', '\u{1d750}'..='\u{1d76e}',
|
||||
'\u{1d770}'..='\u{1d788}', '\u{1d78a}'..='\u{1d7a8}', '\u{1d7aa}'..='\u{1d7c2}',
|
||||
'\u{1d7c4}'..='\u{1d7cb}', '\u{1df00}'..='\u{1df09}', '\u{1df0b}'..='\u{1df1e}',
|
||||
'\u{1df25}'..='\u{1df2a}', '\u{1e030}'..='\u{1e06d}', '\u{1e900}'..='\u{1e943}',
|
||||
'\u{1f130}'..='\u{1f149}', '\u{1f150}'..='\u{1f169}', '\u{1f170}'..='\u{1f189}',
|
||||
];
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub(super) static GRAPHEME_EXTEND: &[RangeInclusive<char>; 383] = &[
|
||||
'\u{300}'..='\u{36f}', '\u{483}'..='\u{489}', '\u{591}'..='\u{5bd}', '\u{5bf}'..='\u{5bf}',
|
||||
@@ -776,6 +722,13 @@
|
||||
'\u{1e030}'..='\u{1e06d}', '\u{1e922}'..='\u{1e943}',
|
||||
];
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub(super) static LT: &[RangeInclusive<char>; 10] = &[
|
||||
'\u{1c5}'..='\u{1c5}', '\u{1c8}'..='\u{1c8}', '\u{1cb}'..='\u{1cb}', '\u{1f2}'..='\u{1f2}',
|
||||
'\u{1f88}'..='\u{1f8f}', '\u{1f98}'..='\u{1f9f}', '\u{1fa8}'..='\u{1faf}',
|
||||
'\u{1fbc}'..='\u{1fbc}', '\u{1fcc}'..='\u{1fcc}', '\u{1ffc}'..='\u{1ffc}',
|
||||
];
|
||||
|
||||
#[rustfmt::skip]
|
||||
pub(super) static N: &[RangeInclusive<char>; 145] = &[
|
||||
'\u{b2}'..='\u{b3}', '\u{b9}'..='\u{b9}', '\u{bc}'..='\u{be}', '\u{660}'..='\u{669}',
|
||||
|
||||
@@ -91,11 +91,11 @@
|
||||
"Alphabetic",
|
||||
"Lowercase",
|
||||
"Uppercase",
|
||||
"Cased",
|
||||
"Case_Ignorable",
|
||||
"Grapheme_Extend",
|
||||
"White_Space",
|
||||
"N",
|
||||
"Lt",
|
||||
];
|
||||
|
||||
struct UnicodeData {
|
||||
|
||||
Reference in New Issue
Block a user