Files
rust/src/libcore/char.rs
T
Alex Crichton 1b568ba0fd std: Hide facade extension traits in docs
These traits are currently all just unstable parts of the facade which are
implementation details for primitives further up the facade. This may make it
more difficult to find what set of methods you get if only linking to libcore,
but for now that's also unstable behavior.

Closes #22025
2015-04-07 17:54:35 -07:00

373 lines
12 KiB
Rust

// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Character manipulation.
//!
//! For more details, see ::unicode::char (a.k.a. std::char)
#![allow(non_snake_case)]
#![doc(primitive = "char")]
use iter::Iterator;
use mem::transmute;
use option::Option::{None, Some};
use option::Option;
use slice::SliceExt;
// UTF-8 ranges and tags for encoding characters
const TAG_CONT: u8 = 0b1000_0000;
const TAG_TWO_B: u8 = 0b1100_0000;
const TAG_THREE_B: u8 = 0b1110_0000;
const TAG_FOUR_B: u8 = 0b1111_0000;
const MAX_ONE_B: u32 = 0x80;
const MAX_TWO_B: u32 = 0x800;
const MAX_THREE_B: u32 = 0x10000;
/*
Lu Uppercase_Letter an uppercase letter
Ll Lowercase_Letter a lowercase letter
Lt Titlecase_Letter a digraphic character, with first part uppercase
Lm Modifier_Letter a modifier letter
Lo Other_Letter other letters, including syllables and ideographs
Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
Mc Spacing_Mark a spacing combining mark (positive advance width)
Me Enclosing_Mark an enclosing combining mark
Nd Decimal_Number a decimal digit
Nl Letter_Number a letterlike numeric character
No Other_Number a numeric character of other type
Pc Connector_Punctuation a connecting punctuation mark, like a tie
Pd Dash_Punctuation a dash or hyphen punctuation mark
Ps Open_Punctuation an opening punctuation mark (of a pair)
Pe Close_Punctuation a closing punctuation mark (of a pair)
Pi Initial_Punctuation an initial quotation mark
Pf Final_Punctuation a final quotation mark
Po Other_Punctuation a punctuation mark of other type
Sm Math_Symbol a symbol of primarily mathematical use
Sc Currency_Symbol a currency sign
Sk Modifier_Symbol a non-letterlike modifier symbol
So Other_Symbol a symbol of other type
Zs Space_Separator a space character (of various non-zero widths)
Zl Line_Separator U+2028 LINE SEPARATOR only
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
Cc Control a C0 or C1 control code
Cf Format a format control character
Cs Surrogate a surrogate code point
Co Private_Use a private-use character
Cn Unassigned a reserved unassigned code point or a noncharacter
*/
/// The highest valid code point
#[stable(feature = "rust1", since = "1.0.0")]
pub const MAX: char = '\u{10ffff}';
/// Converts a `u32` to an `Option<char>`.
///
/// # Examples
///
/// ```
/// use std::char;
///
/// let c = char::from_u32(10084); // produces `Some(❤)`
/// assert_eq!(c, Some('❤'));
/// ```
///
/// An invalid character:
///
/// ```
/// use std::char;
///
/// let none = char::from_u32(1114112);
/// assert_eq!(none, None);
/// ```
#[inline]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_u32(i: u32) -> Option<char> {
// catch out-of-bounds and surrogates
if (i > MAX as u32) || (i >= 0xD800 && i <= 0xDFFF) {
None
} else {
Some(unsafe { transmute(i) })
}
}
/// Converts a number to the character representing it.
///
/// # Return value
///
/// Returns `Some(char)` if `num` represents one digit under `radix`,
/// using one character of `0-9` or `a-z`, or `None` if it doesn't.
///
/// # Panics
///
/// Panics if given an `radix` > 36.
///
/// # Examples
///
/// ```
/// use std::char;
///
/// let c = char::from_digit(4, 10);
///
/// assert_eq!(c, Some('4'));
/// ```
#[inline]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_digit(num: u32, radix: u32) -> Option<char> {
if radix > 36 {
panic!("from_digit: radix is too high (maximum 36)");
}
if num < radix {
unsafe {
if num < 10 {
Some(transmute('0' as u32 + num))
} else {
Some(transmute('a' as u32 + num - 10))
}
}
} else {
None
}
}
// NB: the stabilization and documentation for this trait is in
// unicode/char.rs, not here
#[allow(missing_docs)] // docs in libunicode/u_char.rs
#[doc(hidden)]
pub trait CharExt {
fn is_digit(self, radix: u32) -> bool;
fn to_digit(self, radix: u32) -> Option<u32>;
fn escape_unicode(self) -> EscapeUnicode;
fn escape_default(self) -> EscapeDefault;
fn len_utf8(self) -> usize;
fn len_utf16(self) -> usize;
fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>;
fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>;
}
impl CharExt for char {
fn is_digit(self, radix: u32) -> bool {
self.to_digit(radix).is_some()
}
fn to_digit(self, radix: u32) -> Option<u32> {
if radix > 36 {
panic!("to_digit: radix is too high (maximum 36)");
}
let val = match self {
'0' ... '9' => self as u32 - '0' as u32,
'a' ... 'z' => self as u32 - 'a' as u32 + 10,
'A' ... 'Z' => self as u32 - 'A' as u32 + 10,
_ => return None,
};
if val < radix { Some(val) }
else { None }
}
fn escape_unicode(self) -> EscapeUnicode {
EscapeUnicode { c: self, state: EscapeUnicodeState::Backslash }
}
fn escape_default(self) -> EscapeDefault {
let init_state = match self {
'\t' => EscapeDefaultState::Backslash('t'),
'\r' => EscapeDefaultState::Backslash('r'),
'\n' => EscapeDefaultState::Backslash('n'),
'\\' => EscapeDefaultState::Backslash('\\'),
'\'' => EscapeDefaultState::Backslash('\''),
'"' => EscapeDefaultState::Backslash('"'),
'\x20' ... '\x7e' => EscapeDefaultState::Char(self),
_ => EscapeDefaultState::Unicode(self.escape_unicode())
};
EscapeDefault { state: init_state }
}
#[inline]
fn len_utf8(self) -> usize {
let code = self as u32;
if code < MAX_ONE_B {
1
} else if code < MAX_TWO_B {
2
} else if code < MAX_THREE_B {
3
} else {
4
}
}
#[inline]
fn len_utf16(self) -> usize {
let ch = self as u32;
if (ch & 0xFFFF) == ch { 1 } else { 2 }
}
#[inline]
fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
encode_utf8_raw(self as u32, dst)
}
#[inline]
fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
encode_utf16_raw(self as u32, dst)
}
}
/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
/// and then returns the number of bytes written.
///
/// If the buffer is not large enough, nothing will be written into it
/// and a `None` will be returned.
#[inline]
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
// Marked #[inline] to allow llvm optimizing it away
if code < MAX_ONE_B && dst.len() >= 1 {
dst[0] = code as u8;
Some(1)
} else if code < MAX_TWO_B && dst.len() >= 2 {
dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
dst[1] = (code & 0x3F) as u8 | TAG_CONT;
Some(2)
} else if code < MAX_THREE_B && dst.len() >= 3 {
dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
dst[2] = (code & 0x3F) as u8 | TAG_CONT;
Some(3)
} else if dst.len() >= 4 {
dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
dst[3] = (code & 0x3F) as u8 | TAG_CONT;
Some(4)
} else {
None
}
}
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
/// and then returns the number of `u16`s written.
///
/// If the buffer is not large enough, nothing will be written into it
/// and a `None` will be returned.
#[inline]
pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
// Marked #[inline] to allow llvm optimizing it away
if (ch & 0xFFFF) == ch && dst.len() >= 1 {
// The BMP falls through (assuming non-surrogate, as it should)
dst[0] = ch as u16;
Some(1)
} else if dst.len() >= 2 {
// Supplementary planes break into surrogates.
ch -= 0x1_0000;
dst[0] = 0xD800 | ((ch >> 10) as u16);
dst[1] = 0xDC00 | ((ch as u16) & 0x3FF);
Some(2)
} else {
None
}
}
/// An iterator over the characters that represent a `char`, as escaped by
/// Rust's unicode escaping rules.
#[derive(Clone)]
#[stable(feature = "rust1", since = "1.0.0")]
pub struct EscapeUnicode {
c: char,
state: EscapeUnicodeState
}
#[derive(Clone)]
enum EscapeUnicodeState {
Backslash,
Type,
LeftBrace,
Value(usize),
RightBrace,
Done,
}
#[stable(feature = "rust1", since = "1.0.0")]
impl Iterator for EscapeUnicode {
type Item = char;
fn next(&mut self) -> Option<char> {
match self.state {
EscapeUnicodeState::Backslash => {
self.state = EscapeUnicodeState::Type;
Some('\\')
}
EscapeUnicodeState::Type => {
self.state = EscapeUnicodeState::LeftBrace;
Some('u')
}
EscapeUnicodeState::LeftBrace => {
let mut n = 0;
while (self.c as u32) >> (4 * (n + 1)) != 0 {
n += 1;
}
self.state = EscapeUnicodeState::Value(n);
Some('{')
}
EscapeUnicodeState::Value(offset) => {
let v = match ((self.c as i32) >> (offset * 4)) & 0xf {
i @ 0 ... 9 => '0' as i32 + i,
i => 'a' as i32 + (i - 10)
};
if offset == 0 {
self.state = EscapeUnicodeState::RightBrace;
} else {
self.state = EscapeUnicodeState::Value(offset - 1);
}
Some(unsafe { transmute(v) })
}
EscapeUnicodeState::RightBrace => {
self.state = EscapeUnicodeState::Done;
Some('}')
}
EscapeUnicodeState::Done => None,
}
}
}
/// An iterator over the characters that represent a `char`, escaped
/// for maximum portability.
#[derive(Clone)]
#[stable(feature = "rust1", since = "1.0.0")]
pub struct EscapeDefault {
state: EscapeDefaultState
}
#[derive(Clone)]
enum EscapeDefaultState {
Backslash(char),
Char(char),
Done,
Unicode(EscapeUnicode),
}
#[stable(feature = "rust1", since = "1.0.0")]
impl Iterator for EscapeDefault {
type Item = char;
fn next(&mut self) -> Option<char> {
match self.state {
EscapeDefaultState::Backslash(c) => {
self.state = EscapeDefaultState::Char(c);
Some('\\')
}
EscapeDefaultState::Char(c) => {
self.state = EscapeDefaultState::Done;
Some(c)
}
EscapeDefaultState::Done => None,
EscapeDefaultState::Unicode(ref mut iter) => iter.next()
}
}
}