diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index 8a3326c7d76a..d7dd616fce77 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -335,13 +335,19 @@ pub fn replacen(&self, pat: P, to: &str, count: usize) -> String { /// Returns the lowercase equivalent of this string slice, as a new [`String`]. /// - /// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property - /// `Lowercase`. + /// 'Lowercase' is defined according to the terms of + /// [Chapter 3 (Conformance)](https://www.unicode.org/versions/latest/core-spec/chapter-3/#G34432) + /// of the Unicode standard. /// /// Since some characters can expand into multiple characters when changing /// the case, this function returns a [`String`] instead of modifying the /// parameter in-place. /// + /// Unlike [`char::to_lowercase()`], this method fully handles the context-dependent + /// casing of Greek sigma. However, like that method, it does not handle locale-specific + /// casing, like Turkish and Azeri I/ı/İ/i. See that method's documentation + /// for more information. + /// /// # Examples /// /// Basic usage: @@ -426,13 +432,18 @@ fn case_ignorable_then_cased>(iter: I) -> bool { /// Returns the uppercase equivalent of this string slice, as a new [`String`]. /// - /// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property - /// `Uppercase`. + /// 'Uppercase' is defined according to the terms of + /// [Chapter 3 (Conformance)](https://www.unicode.org/versions/latest/core-spec/chapter-3/#G34431) + /// of the Unicode standard. /// /// Since some characters can expand into multiple characters when changing /// the case, this function returns a [`String`] instead of modifying the /// parameter in-place. /// + /// Like [`char::to_uppercase()`] this method does not handle language-specific + /// casing, like Turkish and Azeri I/ı/İ/i. See that method's documentation + /// for more information. 
+ /// /// # Examples /// /// Basic usage: diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index caf4e9f5917f..00b735e91a37 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -1151,13 +1151,14 @@ pub fn is_numeric(self) -> bool { /// [ucd]: https://www.unicode.org/reports/tr44/ /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt /// - /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields - /// the `char`(s) given by [`SpecialCasing.txt`]. + /// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by + /// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3. /// /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt /// /// This operation performs an unconditional mapping without tailoring. That is, the conversion - /// is independent of context and language. + /// is independent of context and language. See [below](#notes-on-context-and-locale) + /// for more information. /// /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. @@ -1199,6 +1200,48 @@ pub fn is_numeric(self) -> bool { /// // convert into themselves. /// assert_eq!('山'.to_lowercase().to_string(), "山"); /// ``` + /// # Notes on context and locale + /// + /// As stated earlier, this method does not take into account language or context. + /// Below is a non-exhaustive list of situations where this can be relevant. + /// If you need to handle locale-dependent casing in your code, consider using + /// an external crate, like [`icu_casemap`](https://crates.io/crates/icu_casemap) + /// which is developed by Unicode. 
+ /// + /// ## Greek sigma + /// + /// In Greek, the letter sigma (uppercase Σ) has two lowercase forms: + /// ς which is used only at the end of a word, and σ which is used everywhere else. + /// `to_lowercase()` always uses the second form: + /// + /// ``` + /// assert_eq!('Σ'.to_lowercase().to_string(), "σ"); + /// ``` + /// + /// ## Turkish and Azeri I/ı/İ/i + /// + /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: + /// + /// * 'Dotless': I / ı, sometimes written ï + /// * 'Dotted': İ / i + /// + /// Note that the uppercase undotted 'I' is the same as the Latin. Therefore: + /// + /// ``` + /// let lower_i = 'I'.to_lowercase().to_string(); + /// ``` + /// + /// The value of `lower_i` here relies on the language of the text: if we're + /// in `en-US`, it should be `"i"`, but if we're in `tr-TR` or `az-AZ`, it should + /// be `"ı"`. `to_lowercase()` does not take this into account, and so: + /// + /// ``` + /// let lower_i = 'I'.to_lowercase().to_string(); + /// + /// assert_eq!(lower_i, "i"); + /// ``` + /// + /// holds across languages. #[must_use = "this returns the lowercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] @@ -1211,8 +1254,10 @@ pub fn to_lowercase(self) -> ToLowercase { /// `char`s. /// /// This is usually, but not always, equivalent to the uppercase mapping - /// returned by [`Self::to_uppercase`]. Prefer this method when seeking to capitalize - /// Only The First Letter of a word, but use [`Self::to_uppercase`] for ALL CAPS. + /// returned by [`to_uppercase()`]. Prefer this method when seeking to capitalize + /// Only The First Letter of a word, but use [`to_uppercase()`] for ALL CAPS. + /// See [below](#difference-from-uppercase) for a thorough explanation + /// of the difference between the two methods. /// /// If this `char` does not have a titlecase mapping, the iterator yields the same `char`. 
/// @@ -1222,13 +1267,14 @@ pub fn to_lowercase(self) -> ToLowercase { /// [ucd]: https://www.unicode.org/reports/tr44/ /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt /// - /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields - /// the `char`(s) given by [`SpecialCasing.txt`]. + /// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by + /// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3. /// /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt /// /// This operation performs an unconditional mapping without tailoring. That is, the conversion - /// is independent of context and language. + /// is independent of context and language. See [below](#note-on-locale) + /// for more information. /// /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. 
@@ -1265,8 +1311,9 @@ pub fn to_lowercase(self) -> ToLowercase { /// ``` /// #![feature(titlecase)] /// assert_eq!('c'.to_titlecase().to_string(), "C"); + /// assert_eq!('ა'.to_titlecase().to_string(), "ა"); /// assert_eq!('dž'.to_titlecase().to_string(), "Dž"); - /// assert_eq!('ῼ'.to_titlecase().to_string(), "ῼ"); + /// assert_eq!('ᾨ'.to_titlecase().to_string(), "ᾨ"); /// /// // Sometimes the result is more than one character: /// assert_eq!('ß'.to_titlecase().to_string(), "Ss"); @@ -1276,8 +1323,78 @@ pub fn to_lowercase(self) -> ToLowercase { /// assert_eq!('山'.to_titlecase().to_string(), "山"); /// ``` /// + /// # Difference from uppercase + /// + /// Currently, there are three classes of characters where [`to_uppercase()`] + /// and `to_titlecase()` give different results: + /// + /// ## Georgian script + /// + /// Each letter in the modern Georgian alphabet can be written in one of two forms: + /// the typical lowercase-like "mkhedruli" form, and a variant uppercase-like "mtavruli" + /// form. However, unlike uppercase in most cased scripts, mtavruli is not typically used + /// to start sentences, denote proper nouns, or for any other purpose + /// in running text. It is instead confined to titles and headings, which are written entirely + /// in mtavruli. For this reason, [`to_uppercase()`] applied to a Georgian letter + /// will return the mtavruli form, but `to_titlecase()` will return the mkhedruli form. + /// + /// ``` + /// #![feature(titlecase)] + /// let ani = 'ა'; // First letter of the Georgian alphabet, in mkhedruli form + /// + /// // Titlecasing mkhedruli maps it to itself... 
+ /// assert_eq!(ani.to_titlecase().to_string(), ani.to_string()); + /// + /// // but uppercasing it maps it to mtavruli + /// assert_eq!(ani.to_uppercase().to_string(), "Ა"); + /// ``` + /// + /// ## Compatibility digraphs for Latin-alphabet Serbo-Croatian + /// + /// The standard Latin alphabet for the Serbo-Croatian language + /// (Bosnian, Croatian, Montenegrin, and Serbian) contains + /// three digraphs: Dž, Lj, and Nj. These are usually represented as + /// two characters. However, for compatibility with older character sets, + /// Unicode includes single-character versions of these digraphs. + /// Each has an uppercase, titlecase, and lowercase version: + /// + /// - `'DŽ'`, `'Dž'`, `'dž'` + /// - `'LJ'`, `'Lj'`, `'lj'` + /// - `'NJ'`, `'Nj'`, `'nj'` + /// + /// Unicode additionally encodes a casing triad for the Dz digraph + /// without the caron: `'DZ'`, `'Dz'`, `'dz'`. + /// + /// ## Iota-subscripted Greek vowels + /// + /// In ancient Greek, the long vowels alpha (α), eta (η), and omega (ω) + /// were sometimes followed by an iota (ι), forming a diphthong. Over time, + /// the diphthong pronunciation was slowly lost, with the iota becoming mute. + /// Eventually, the ι disappeared from the spelling as well. + /// However, there remains a need to represent ancient texts faithfully. + /// + /// Modern editions of ancient Greek texts commonly use a reduced-sized + /// ι symbol to denote mute iotas, while distinguishing them from ιs + /// which continued to affect pronunciation. The exact standard differs + /// between different publications. Some render the mute ι below its associated + /// vowel (subscript), while others place it to the right of said vowel (adscript). + /// The interaction of mute ι symbols with casing also varies. + /// + /// The Unicode Standard, for its default casing rules, chose to make lowercase + /// Greek vowels with iota subscript (e.g. 
`'ᾠ'`) titlecase to the uppercase vowel + /// with iota subscript (`'ᾨ'`) but uppercase to the uppercase vowel followed by + /// full-size uppercase iota (`"ὨΙ"`). This is just one convention among many + /// in common use, but it is the one Unicode settled on, + /// so it is what this method does also. + /// /// # Note on locale /// + /// As stated above, this method is locale-insensitive. + /// If you need locale support, consider using an external crate, + /// like [`icu_casemap`](https://crates.io/crates/icu_casemap) + /// which is developed by Unicode. A description of a common + /// locale-dependent casing issue follows: + /// /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: /// /// * 'Dotless': I / ı, sometimes written ï @@ -1302,6 +1419,8 @@ pub fn to_lowercase(self) -> ToLowercase { /// ``` /// /// holds across languages. + /// + /// [`to_uppercase()`]: Self::to_uppercase() #[must_use = "this returns the titlecased character as a new iterator, \ without modifying the original"] #[unstable(feature = "titlecase", issue = "153892")] @@ -1313,8 +1432,9 @@ pub fn to_titlecase(self) -> ToTitlecase { /// Returns an iterator that yields the uppercase mapping of this `char` as one or more /// `char`s. /// - /// Prefer this method when converting a word into ALL CAPS, but consider [`Self::to_titlecase`] - /// instead if you seek to capitalize Only The First Letter. + /// Prefer this method when converting a word into ALL CAPS, but consider [`to_titlecase()`] + /// instead if you seek to capitalize Only The First Letter. See that method's documentation + /// for more information on the difference between the two. /// /// If this `char` does not have an uppercase mapping, the iterator yields the same `char`. 
/// @@ -1324,13 +1444,14 @@ pub fn to_titlecase(self) -> ToTitlecase { /// [ucd]: https://www.unicode.org/reports/tr44/ /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt /// - /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields - /// the `char`(s) given by [`SpecialCasing.txt`]. + /// If this `char` expands to multiple `char`s, the iterator yields the `char`s given by + /// [`SpecialCasing.txt`]. The maximum number of `char`s in a case mapping is 3. /// /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt /// /// This operation performs an unconditional mapping without tailoring. That is, the conversion - /// is independent of context and language. + /// is independent of context and language. See [below](#note-on-locale) + /// for more information. /// /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. @@ -1338,6 +1459,7 @@ pub fn to_titlecase(self) -> ToTitlecase { /// [Unicode Standard]: https://www.unicode.org/versions/latest/ /// /// # Examples + /// /// `'ſt'` (U+FB05) is a single Unicode code point (a ligature) that maps to "ST" in uppercase. /// /// As an iterator: @@ -1365,11 +1487,12 @@ pub fn to_titlecase(self) -> ToTitlecase { /// /// ``` /// assert_eq!('c'.to_uppercase().to_string(), "C"); + /// assert_eq!('ა'.to_uppercase().to_string(), "Ა"); /// assert_eq!('dž'.to_uppercase().to_string(), "DŽ"); /// /// // Sometimes the result is more than one character: /// assert_eq!('ſt'.to_uppercase().to_string(), "ST"); - /// assert_eq!('ῼ'.to_uppercase().to_string(), "ΩΙ"); + /// assert_eq!('ᾨ'.to_uppercase().to_string(), "ὨΙ"); /// /// // Characters that do not have both uppercase and lowercase /// // convert into themselves. 
@@ -1378,6 +1501,12 @@ pub fn to_titlecase(self) -> ToTitlecase { /// /// # Note on locale /// + /// As stated above, this method is locale-insensitive. + /// If you need locale support, consider using an external crate, + /// like [`icu_casemap`](https://crates.io/crates/icu_casemap) + /// which is developed by Unicode. A description of a common + /// locale-dependent casing issue follows: + /// /// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two: /// /// * 'Dotless': I / ı, sometimes written ï @@ -1400,6 +1529,8 @@ pub fn to_titlecase(self) -> ToTitlecase { /// ``` /// /// holds across languages. + /// + /// [`to_titlecase()`]: Self::to_titlecase() #[must_use = "this returns the uppercased character as a new iterator, \ without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] diff --git a/src/librustdoc/html/markdown.rs b/src/librustdoc/html/markdown.rs index 858545bd0984..2034abdfd156 100644 --- a/src/librustdoc/html/markdown.rs +++ b/src/librustdoc/html/markdown.rs @@ -582,6 +582,7 @@ fn next(&mut self) -> Option { } } let id = self.id_map.derive(id); + let percent_encoded_id = small_url_encode(id.clone()); if let Some(ref mut builder) = self.toc { let mut text_header = String::new(); @@ -596,8 +597,9 @@ fn next(&mut self) -> Option { std::cmp::min(level as u32 + (self.heading_offset as u32), MAX_HEADER_LEVEL); self.buf.push_back((Event::Html(format!("").into()), 0..0)); - let start_tags = - format!("§"); + let start_tags = format!( + "§" + ); return Some((Event::Html(start_tags.into()), 0..0)); } event diff --git a/tests/rustdoc-html/unicode.rs b/tests/rustdoc-html/unicode.rs new file mode 100644 index 000000000000..a961f178ec3b --- /dev/null +++ b/tests/rustdoc-html/unicode.rs @@ -0,0 +1,10 @@ +#![crate_name = "unicode"] + +pub struct Foo; + +impl Foo { + //@ has unicode/struct.Foo.html //a/@href "#%C3%BA" + //@ !has unicode/struct.Foo.html //a/@href "#ú" + /// # ú + pub fn foo() {} +}