mirror of
https://github.com/rust-lang/rust.git
synced 2026-04-27 18:57:42 +03:00
Extend core::char's documentation of casing issues
* Extend `core::char`'s documentation of casing issues * Fix typos * Fix typo Co-authored-by: GrigorenkoPV <GrigorenkoPV+github@yandex.ru> * Document maximum 3x character expansion This is guaranteed by Unicode. * Fix error in `str` casing method docs
This commit is contained in:
@@ -335,13 +335,19 @@ pub fn replacen<P: Pattern>(&self, pat: P, to: &str, count: usize) -> String {
|
||||
|
||||
/// Returns the lowercase equivalent of this string slice, as a new [`String`].
|
||||
///
|
||||
/// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property
|
||||
/// `Lowercase`.
|
||||
/// 'Lowercase' is defined according to the terms of
|
||||
/// [Chapter 3 (Conformance)](https://www.unicode.org/versions/latest/core-spec/chapter-3/#G34432)
|
||||
/// of the Unicode standard.
|
||||
///
|
||||
/// Since some characters can expand into multiple characters when changing
|
||||
/// the case, this function returns a [`String`] instead of modifying the
|
||||
/// parameter in-place.
|
||||
///
|
||||
/// Unlike [`char::to_lowercase()`], this method fully handles the context-dependent
|
||||
/// casing of Greek sigma. However, like that method, it does not handle locale-specific
|
||||
/// casing, like Turkish and Azeri I/ı/İ/i. See that method's documentation
|
||||
/// for more information.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Basic usage:
|
||||
@@ -426,13 +432,18 @@ fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
|
||||
|
||||
/// Returns the uppercase equivalent of this string slice, as a new [`String`].
|
||||
///
|
||||
/// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property
|
||||
/// `Uppercase`.
|
||||
/// 'Uppercase' is defined according to the terms of
|
||||
/// [Chapter 3 (Conformance)](https://www.unicode.org/versions/latest/core-spec/chapter-3/#G34431)
|
||||
/// of the Unicode standard.
|
||||
///
|
||||
/// Since some characters can expand into multiple characters when changing
|
||||
/// the case, this function returns a [`String`] instead of modifying the
|
||||
/// parameter in-place.
|
||||
///
|
||||
/// Like [`char::to_uppercase()`], this method does not handle language-specific
|
||||
/// casing, like Turkish and Azeri I/ı/İ/i. See that method's documentation
|
||||
/// for more information.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// Basic usage:
|
||||
|
||||
@@ -1149,13 +1149,14 @@ pub fn is_numeric(self) -> bool {
|
||||
/// [ucd]: https://www.unicode.org/reports/tr44/
|
||||
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
|
||||
///
|
||||
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
|
||||
/// the `char`(s) given by [`SpecialCasing.txt`].
|
||||
/// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by
|
||||
/// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3.
|
||||
///
|
||||
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
|
||||
///
|
||||
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
|
||||
/// is independent of context and language.
|
||||
/// is independent of context and language. See [below](#notes-on-context-and-locale)
|
||||
/// for more information.
|
||||
///
|
||||
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
|
||||
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
|
||||
@@ -1197,6 +1198,48 @@ pub fn is_numeric(self) -> bool {
|
||||
/// // convert into themselves.
|
||||
/// assert_eq!('山'.to_lowercase().to_string(), "山");
|
||||
/// ```
|
||||
/// # Notes on context and locale
|
||||
///
|
||||
/// As stated earlier, this method does not take into account language or context.
|
||||
/// Below is a non-exhaustive list of situations where this can be relevant.
|
||||
/// If you need to handle locale-dependent casing in your code, consider using
|
||||
/// an external crate, like [`icu_casemap`](https://crates.io/crates/icu_casemap)
|
||||
/// which is developed by Unicode.
|
||||
///
|
||||
/// ## Greek sigma
|
||||
///
|
||||
/// In Greek, the letter sigma (uppercase Σ) has two lowercase forms:
|
||||
/// ς which is used only at the end of a word, and σ which is used everywhere else.
|
||||
/// `to_lowercase()` always uses the second form:
|
||||
///
|
||||
/// ```
|
||||
/// assert_eq!('Σ'.to_lowercase().to_string(), "σ");
|
||||
/// ```
|
||||
///
|
||||
/// ## Turkish and Azeri I/ı/İ/i
|
||||
///
|
||||
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
|
||||
///
|
||||
/// * 'Dotless': I / ı, sometimes written ï
|
||||
/// * 'Dotted': İ / i
|
||||
///
|
||||
/// Note that the uppercase undotted 'I' is the same as the Latin. Therefore:
|
||||
///
|
||||
/// ```
|
||||
/// let lower_i = 'I'.to_lowercase().to_string();
|
||||
/// ```
|
||||
///
|
||||
/// The value of `lower_i` here relies on the language of the text: if we're
|
||||
/// in `en-US`, it should be `"i"`, but if we're in `tr-TR` or `az-AZ`, it should
|
||||
/// be `"ı"`. `to_lowercase()` does not take this into account, and so:
|
||||
///
|
||||
/// ```
|
||||
/// let lower_i = 'I'.to_lowercase().to_string();
|
||||
///
|
||||
/// assert_eq!(lower_i, "i");
|
||||
/// ```
|
||||
///
|
||||
/// holds across languages.
|
||||
#[must_use = "this returns the lowercased character as a new iterator, \
|
||||
without modifying the original"]
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
@@ -1209,8 +1252,10 @@ pub fn to_lowercase(self) -> ToLowercase {
|
||||
/// `char`s.
|
||||
///
|
||||
/// This is usually, but not always, equivalent to the uppercase mapping
|
||||
/// returned by [`Self::to_uppercase`]. Prefer this method when seeking to capitalize
|
||||
/// Only The First Letter of a word, but use [`Self::to_uppercase`] for ALL CAPS.
|
||||
/// returned by [`to_uppercase()`]. Prefer this method when seeking to capitalize
|
||||
/// Only The First Letter of a word, but use [`to_uppercase()`] for ALL CAPS.
|
||||
/// See [below](#difference-from-uppercase) for a thorough explanation
|
||||
/// of the difference between the two methods.
|
||||
///
|
||||
/// If this `char` does not have a titlecase mapping, the iterator yields the same `char`.
|
||||
///
|
||||
@@ -1220,13 +1265,14 @@ pub fn to_lowercase(self) -> ToLowercase {
|
||||
/// [ucd]: https://www.unicode.org/reports/tr44/
|
||||
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
|
||||
///
|
||||
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
|
||||
/// the `char`(s) given by [`SpecialCasing.txt`].
|
||||
/// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by
|
||||
/// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3.
|
||||
///
|
||||
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
|
||||
///
|
||||
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
|
||||
/// is independent of context and language.
|
||||
/// is independent of context and language. See [below](#note-on-locale)
|
||||
/// for more information.
|
||||
///
|
||||
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
|
||||
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
|
||||
@@ -1263,8 +1309,9 @@ pub fn to_lowercase(self) -> ToLowercase {
|
||||
/// ```
|
||||
/// #![feature(titlecase)]
|
||||
/// assert_eq!('c'.to_titlecase().to_string(), "C");
|
||||
/// assert_eq!('ა'.to_titlecase().to_string(), "ა");
|
||||
/// assert_eq!('dž'.to_titlecase().to_string(), "Dž");
|
||||
/// assert_eq!('ῼ'.to_titlecase().to_string(), "ῼ");
|
||||
/// assert_eq!('ᾨ'.to_titlecase().to_string(), "ᾨ");
|
||||
///
|
||||
/// // Sometimes the result is more than one character:
|
||||
/// assert_eq!('ß'.to_titlecase().to_string(), "Ss");
|
||||
@@ -1274,8 +1321,78 @@ pub fn to_lowercase(self) -> ToLowercase {
|
||||
/// assert_eq!('山'.to_titlecase().to_string(), "山");
|
||||
/// ```
|
||||
///
|
||||
/// # Difference from uppercase
|
||||
///
|
||||
/// Currently, there are three classes of characters where [`to_uppercase()`]
|
||||
/// and `to_titlecase()` give different results:
|
||||
///
|
||||
/// ## Georgian script
|
||||
///
|
||||
/// Each letter in the modern Georgian alphabet can be written in one of two forms:
|
||||
/// the typical lowercase-like "mkhedruli" form, and a variant uppercase-like "mtavruli"
|
||||
/// form. However, unlike uppercase in most cased scripts, mtavruli is not typically used
|
||||
/// to start sentences, denote proper nouns, or for any other purpose
|
||||
/// in running text. It is instead confined to titles and headings, which are written entirely
|
||||
/// in mtavruli. For this reason, [`to_uppercase()`] applied to a Georgian letter
|
||||
/// will return the mtavruli form, but `to_titlecase()` will return the mkhedruli form.
|
||||
///
|
||||
/// ```
|
||||
/// #![feature(titlecase)]
|
||||
/// let ani = 'ა'; // First letter of the Georgian alphabet, in mkhedruli form
|
||||
///
|
||||
/// // Titlecasing mkhedruli maps it to itself...
|
||||
/// assert_eq!(ani.to_titlecase().to_string(), ani.to_string());
|
||||
///
|
||||
/// // but uppercasing it maps it to mtavruli
|
||||
/// assert_eq!(ani.to_uppercase().to_string(), "Ა");
|
||||
/// ```
|
||||
///
|
||||
/// ## Compatibility digraphs for Latin-alphabet Serbo-Croatian
|
||||
///
|
||||
/// The standard Latin alphabet for the Serbo-Croatian language
|
||||
/// (Bosnian, Croatian, Montenegrin, and Serbian) contains
|
||||
/// three digraphs: Dž, Lj, and Nj. These are usually represented as
|
||||
/// two characters. However, for compatibility with older character sets,
|
||||
/// Unicode includes single-character versions of these digraphs.
|
||||
/// Each has an uppercase, titlecase, and lowercase version:
|
||||
///
|
||||
/// - `'DŽ'`, `'Dž'`, `'dž'`
|
||||
/// - `'LJ'`, `'Lj'`, `'lj'`
|
||||
/// - `'NJ'`, `'Nj'`, `'nj'`
|
||||
///
|
||||
/// Unicode additionally encodes a casing triad for the Dz digraph
|
||||
/// without the caron: `'DZ'`, `'Dz'`, `'dz'`.
|
||||
///
|
||||
/// ## Iota-subscripted Greek vowels
|
||||
///
|
||||
/// In ancient Greek, the long vowels alpha (α), eta (η), and omega (ω)
|
||||
/// were sometimes followed by an iota (ι), forming a diphthong. Over time,
|
||||
/// the diphthong pronunciation was slowly lost, with the iota becoming mute.
|
||||
/// Eventually, the ι disappeared from the spelling as well.
|
||||
/// However, there remains a need to represent ancient texts faithfully.
|
||||
///
|
||||
/// Modern editions of ancient Greek texts commonly use a reduced-sized
|
||||
/// ι symbol to denote mute iotas, while distinguishing them from ιs
|
||||
/// which continued to affect pronunciation. The exact standard differs
|
||||
/// between different publications. Some render the mute ι below its associated
|
||||
/// vowel (subscript), while others place it to the right of said vowel (adscript).
|
||||
/// The interaction of mute ι symbols with casing also varies.
|
||||
///
|
||||
/// The Unicode Standard, for its default casing rules, chose to make lowercase
|
||||
/// Greek vowels with iota subscript (e.g. `'ᾠ'`) titlecase to the uppercase vowel
|
||||
/// with iota subscript (`'ᾨ'`) but uppercase to the uppercase vowel followed by
|
||||
/// full-size uppercase iota (`"ὨΙ"`). This is just one convention among many
|
||||
/// in common use, but it is the one Unicode settled on,
|
||||
/// so it is what this method does also.
|
||||
///
|
||||
/// # Note on locale
|
||||
///
|
||||
/// As stated above, this method is locale-insensitive.
|
||||
/// If you need locale support, consider using an external crate,
|
||||
/// like [`icu_casemap`](https://crates.io/crates/icu_casemap)
|
||||
/// which is developed by Unicode. A description of a common
|
||||
/// locale-dependent casing issue follows:
|
||||
///
|
||||
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
|
||||
///
|
||||
/// * 'Dotless': I / ı, sometimes written ï
|
||||
@@ -1300,6 +1417,8 @@ pub fn to_lowercase(self) -> ToLowercase {
|
||||
/// ```
|
||||
///
|
||||
/// holds across languages.
|
||||
///
|
||||
/// [`to_uppercase()`]: Self::to_uppercase()
|
||||
#[must_use = "this returns the titlecased character as a new iterator, \
|
||||
without modifying the original"]
|
||||
#[unstable(feature = "titlecase", issue = "153892")]
|
||||
@@ -1311,8 +1430,9 @@ pub fn to_titlecase(self) -> ToTitlecase {
|
||||
/// Returns an iterator that yields the uppercase mapping of this `char` as one or more
|
||||
/// `char`s.
|
||||
///
|
||||
/// Prefer this method when converting a word into ALL CAPS, but consider [`Self::to_titlecase`]
|
||||
/// instead if you seek to capitalize Only The First Letter.
|
||||
/// Prefer this method when converting a word into ALL CAPS, but consider [`to_titlecase()`]
|
||||
/// instead if you seek to capitalize Only The First Letter. See that method's documentation
|
||||
/// for more information on the difference between the two.
|
||||
///
|
||||
/// If this `char` does not have an uppercase mapping, the iterator yields the same `char`.
|
||||
///
|
||||
@@ -1322,13 +1442,14 @@ pub fn to_titlecase(self) -> ToTitlecase {
|
||||
/// [ucd]: https://www.unicode.org/reports/tr44/
|
||||
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
|
||||
///
|
||||
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
|
||||
/// the `char`(s) given by [`SpecialCasing.txt`].
|
||||
/// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by
|
||||
/// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3.
|
||||
///
|
||||
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
|
||||
///
|
||||
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
|
||||
/// is independent of context and language.
|
||||
/// is independent of context and language. See [below](#note-on-locale)
|
||||
/// for more information.
|
||||
///
|
||||
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
|
||||
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
|
||||
@@ -1336,6 +1457,7 @@ pub fn to_titlecase(self) -> ToTitlecase {
|
||||
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// `'ſt'` (U+FB05) is a single Unicode code point (a ligature) that maps to "ST" in uppercase.
|
||||
///
|
||||
/// As an iterator:
|
||||
@@ -1363,11 +1485,12 @@ pub fn to_titlecase(self) -> ToTitlecase {
|
||||
///
|
||||
/// ```
|
||||
/// assert_eq!('c'.to_uppercase().to_string(), "C");
|
||||
/// assert_eq!('ა'.to_uppercase().to_string(), "Ა");
|
||||
/// assert_eq!('dž'.to_uppercase().to_string(), "DŽ");
|
||||
///
|
||||
/// // Sometimes the result is more than one character:
|
||||
/// assert_eq!('ſt'.to_uppercase().to_string(), "ST");
|
||||
/// assert_eq!('ῼ'.to_uppercase().to_string(), "ΩΙ");
|
||||
/// assert_eq!('ᾨ'.to_uppercase().to_string(), "ὨΙ");
|
||||
///
|
||||
/// // Characters that do not have both uppercase and lowercase
|
||||
/// // convert into themselves.
|
||||
@@ -1376,6 +1499,12 @@ pub fn to_titlecase(self) -> ToTitlecase {
|
||||
///
|
||||
/// # Note on locale
|
||||
///
|
||||
/// As stated above, this method is locale-insensitive.
|
||||
/// If you need locale support, consider using an external crate,
|
||||
/// like [`icu_casemap`](https://crates.io/crates/icu_casemap)
|
||||
/// which is developed by Unicode. A description of a common
|
||||
/// locale-dependent casing issue follows:
|
||||
///
|
||||
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
|
||||
///
|
||||
/// * 'Dotless': I / ı, sometimes written ï
|
||||
@@ -1398,6 +1527,8 @@ pub fn to_titlecase(self) -> ToTitlecase {
|
||||
/// ```
|
||||
///
|
||||
/// holds across languages.
|
||||
///
|
||||
/// [`to_titlecase()`]: Self::to_titlecase()
|
||||
#[must_use = "this returns the uppercased character as a new iterator, \
|
||||
without modifying the original"]
|
||||
#[stable(feature = "rust1", since = "1.0.0")]
|
||||
|
||||
Reference in New Issue
Block a user