From af7c313047b40f8d1a5cedc8df6a64339f6de01e Mon Sep 17 00:00:00 2001 From: Luca Bruno Date: Fri, 7 Mar 2014 23:53:34 +0100 Subject: [PATCH 1/3] doc: don't refer to 'char' as characters This seems to be causing some confusion among users. Rust's char are not 8bit characters, but 32bit UCS-4 codepoint without surrogates (Unicode Scalar Values as per Unicode glossary). Make the doc more explicit about it. Signed-off-by: Luca Bruno --- src/libstd/char.rs | 50 +++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/src/libstd/char.rs b/src/libstd/char.rs index ed2a88e644b08..c07d3e75017b4 100644 --- a/src/libstd/char.rs +++ b/src/libstd/char.rs @@ -8,7 +8,21 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! Unicode characters manipulation (`char` type) +//! Character manipulation (`char` type, Unicode Scalar Value) +//! +//! This module provides the `Char` trait, as well as its implementation +//! for the primitive `char` type, in order to allow basic character manipulation. +//! +//! A `char` actually represents a +//! *[Unicode Scalar Value](http://www.unicode.org/glossary/#unicode_scalar_value)*, +//! as it can contain any Unicode code point except high-surrogate and +//! low-surrogate code points. +//! +//! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\] +//! (inclusive) are allowed. A `char` can always be safely cast to a `u32`; +//! however the converse is not always true due to the above range limits +//! and, as such, should be performed via the `from_u32` function. + use cast::transmute; use option::{None, Option, Some}; @@ -66,7 +80,7 @@ static TAG_FOUR_B: uint = 240u; /// The highest valid code point pub static MAX: char = '\U0010ffff'; -/// Convert from `u32` to a character. +/// Convert from `u32` to a `char`. 
#[inline] pub fn from_u32(i: u32) -> Option { // catch out-of-bounds and surrogates @@ -77,8 +91,8 @@ pub fn from_u32(i: u32) -> Option { } } -/// Returns whether the specified character is considered a unicode alphabetic -/// character +/// Returns whether the specified `char` is considered a unicode alphabetic +/// scalar value pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) } #[allow(missing_doc)] pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) } @@ -86,21 +100,21 @@ pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) } pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) } /// -/// Indicates whether a character is in lower case, defined +/// Indicates whether a `char` is in lower case, defined /// in terms of the Unicode Derived Core Property 'Lowercase'. /// #[inline] pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) } /// -/// Indicates whether a character is in upper case, defined +/// Indicates whether a `char` is in upper case, defined /// in terms of the Unicode Derived Core Property 'Uppercase'. /// #[inline] pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) } /// -/// Indicates whether a character is whitespace. Whitespace is defined in +/// Indicates whether a `char` is whitespace. Whitespace is defined in /// terms of the Unicode Property 'White_Space'. /// #[inline] @@ -112,7 +126,7 @@ pub fn is_whitespace(c: char) -> bool { } /// -/// Indicates whether a character is alphanumeric. Alphanumericness is +/// Indicates whether a `char` is alphanumeric. Alphanumericness is /// defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No' /// and the Derived Core Property 'Alphabetic'. /// @@ -125,14 +139,14 @@ pub fn is_alphanumeric(c: char) -> bool { } /// -/// Indicates whether a character is a control character. 
Control -/// characters are defined in terms of the Unicode General Category +/// Indicates whether a `char` is a control code point. Control +/// code points are defined in terms of the Unicode General Category /// 'Cc'. /// #[inline] pub fn is_control(c: char) -> bool { general_category::Cc(c) } -/// Indicates whether the character is numeric (Nd, Nl, or No) +/// Indicates whether the `char` is numeric (Nd, Nl, or No) #[inline] pub fn is_digit(c: char) -> bool { general_category::Nd(c) @@ -141,7 +155,7 @@ pub fn is_digit(c: char) -> bool { } /// -/// Checks if a character parses as a numeric digit in the given radix. +/// Checks if a `char` parses as a numeric digit in the given radix. /// Compared to `is_digit()`, this function only recognizes the /// characters `0-9`, `a-z` and `A-Z`. /// @@ -167,13 +181,13 @@ pub fn is_digit_radix(c: char, radix: uint) -> bool { } /// -/// Convert a char to the corresponding digit. +/// Convert a `char` to the corresponding digit. /// /// # Return value /// /// If `c` is between '0' and '9', the corresponding value /// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is -/// 'b' or 'B', 11, etc. Returns none if the char does not +/// 'b' or 'B', 11, etc. Returns none if the `char` does not /// refer to a digit in the given radix. /// /// # Failure @@ -273,7 +287,7 @@ pub fn decompose_compatible(c: char, f: |char|) { } /// -/// Return the hexadecimal unicode escape of a char. +/// Return the hexadecimal unicode escape of a `char`. /// /// The rules are as follows: /// @@ -301,7 +315,7 @@ pub fn escape_unicode(c: char, f: |char|) { } /// -/// Return a 'default' ASCII and C++11-like char-literal escape of a char. +/// Return a 'default' ASCII and C++11-like literal escape of a `char`. 
/// /// The default is chosen with a bias toward producing literals that are /// legal in a variety of languages, including C++11 and similar C-family @@ -325,7 +339,7 @@ pub fn escape_default(c: char, f: |char|) { } } -/// Returns the amount of bytes this character would need if encoded in utf8 +/// Returns the number of bytes this `char` would need if encoded in UTF-8 pub fn len_utf8_bytes(c: char) -> uint { static MAX_ONE_B: uint = 128u; static MAX_TWO_B: uint = 2048u; @@ -360,7 +374,7 @@ pub trait Char { fn escape_default(&self, f: |char|); fn len_utf8_bytes(&self) -> uint; - /// Encodes this character as utf-8 into the provided byte-buffer. The + /// Encodes this `char` as utf-8 into the provided byte-buffer. The /// buffer must be at least 4 bytes long or a runtime failure will occur. /// /// This will then return the number of characters written to the slice. From 9b3c63bdd5ef9d772e3c74061761e5c0b326f9c0 Mon Sep 17 00:00:00 2001 From: Luca Bruno Date: Sat, 8 Mar 2014 00:13:54 +0100 Subject: [PATCH 2/3] doc: uniform std::char doc-strings Uniform and beautify doc-string for current rustdoc output. Signed-off-by: Luca Bruno --- src/libstd/char.rs | 53 ++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/src/libstd/char.rs b/src/libstd/char.rs index c07d3e75017b4..ce9336d4909ac 100644 --- a/src/libstd/char.rs +++ b/src/libstd/char.rs @@ -80,7 +80,7 @@ static TAG_FOUR_B: uint = 240u; /// The highest valid code point pub static MAX: char = '\U0010ffff'; -/// Convert from `u32` to a `char`. 
+/// Converts from `u32` to a `char` #[inline] pub fn from_u32(i: u32) -> Option { // catch out-of-bounds and surrogates @@ -91,8 +91,8 @@ pub fn from_u32(i: u32) -> Option { } } -/// Returns whether the specified `char` is considered a unicode alphabetic -/// scalar value +/// Returns whether the specified `char` is considered a Unicode alphabetic +/// code point pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) } #[allow(missing_doc)] pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) } @@ -100,22 +100,25 @@ pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) } pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) } /// -/// Indicates whether a `char` is in lower case, defined -/// in terms of the Unicode Derived Core Property 'Lowercase'. +/// Indicates whether a `char` is in lower case +/// +/// This is defined according to the terms of the Unicode Derived Core Property 'Lowercase'. /// #[inline] pub fn is_lowercase(c: char) -> bool { derived_property::Lowercase(c) } /// -/// Indicates whether a `char` is in upper case, defined -/// in terms of the Unicode Derived Core Property 'Uppercase'. +/// Indicates whether a `char` is in upper case +/// +/// This is defined according to the terms of the Unicode Derived Core Property 'Uppercase'. /// #[inline] pub fn is_uppercase(c: char) -> bool { derived_property::Uppercase(c) } /// -/// Indicates whether a `char` is whitespace. Whitespace is defined in -/// terms of the Unicode Property 'White_Space'. +/// Indicates whether a `char` is whitespace +/// +/// Whitespace is defined in terms of the Unicode Property 'White_Space'. /// #[inline] pub fn is_whitespace(c: char) -> bool { @@ -126,9 +129,10 @@ pub fn is_whitespace(c: char) -> bool { } /// -/// Indicates whether a `char` is alphanumeric. Alphanumericness is -/// defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No' -/// and the Derived Core Property 'Alphabetic'. 
+/// Indicates whether a `char` is alphanumeric +/// +/// Alphanumericness is defined in terms of the Unicode General Categories +/// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'. /// #[inline] pub fn is_alphanumeric(c: char) -> bool { @@ -139,8 +143,9 @@ pub fn is_alphanumeric(c: char) -> bool { } /// -/// Indicates whether a `char` is a control code point. Control -/// code points are defined in terms of the Unicode General Category +/// Indicates whether a `char` is a control code point +/// +/// Control code points are defined in terms of the Unicode General Category /// 'Cc'. /// #[inline] @@ -155,7 +160,8 @@ pub fn is_digit(c: char) -> bool { } /// -/// Checks if a `char` parses as a numeric digit in the given radix. +/// Checks if a `char` parses as a numeric digit in the given radix +/// /// Compared to `is_digit()`, this function only recognizes the /// characters `0-9`, `a-z` and `A-Z`. /// @@ -181,7 +187,7 @@ pub fn is_digit_radix(c: char, radix: uint) -> bool { } /// -/// Convert a `char` to the corresponding digit. +/// Converts a `char` to the corresponding digit /// /// # Return value /// @@ -210,7 +216,7 @@ pub fn to_digit(c: char, radix: uint) -> Option { } /// -/// Converts a number to the character representing it. +/// Converts a number to the character representing it /// /// # Return value /// @@ -268,7 +274,7 @@ fn decompose_hangul(s: char, f: |char|) { } } -/// Returns the canonical decomposition of a character. +/// Returns the canonical decomposition of a character pub fn decompose_canonical(c: char, f: |char|) { if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) { decompose::canonical(c, f); @@ -277,7 +283,7 @@ pub fn decompose_canonical(c: char, f: |char|) { } } -/// Returns the compatibility decomposition of a character. 
+/// Returns the compatibility decomposition of a character pub fn decompose_compatible(c: char, f: |char|) { if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) { decompose::compatibility(c, f); @@ -287,7 +293,7 @@ pub fn decompose_compatible(c: char, f: |char|) { } /// -/// Return the hexadecimal unicode escape of a `char`. +/// Returns the hexadecimal Unicode escape of a `char` /// /// The rules are as follows: /// @@ -315,7 +321,7 @@ pub fn escape_unicode(c: char, f: |char|) { } /// -/// Return a 'default' ASCII and C++11-like literal escape of a `char`. +/// Returns a 'default' ASCII and C++11-like literal escape of a `char` /// /// The default is chosen with a bias toward producing literals that are /// legal in a variety of languages, including C++11 and similar C-family @@ -374,8 +380,9 @@ pub trait Char { fn escape_default(&self, f: |char|); fn len_utf8_bytes(&self) -> uint; - /// Encodes this `char` as utf-8 into the provided byte-buffer. The - /// buffer must be at least 4 bytes long or a runtime failure will occur. + /// Encodes this `char` as utf-8 into the provided byte-buffer + /// + /// The buffer must be at least 4 bytes long or a runtime failure will occur. /// /// This will then return the number of characters written to the slice. fn encode_utf8(&self, dst: &mut [u8]) -> uint; From 331f9077a035b201e7bddfabae503ba72c3ab1de Mon Sep 17 00:00:00 2001 From: Luca Bruno Date: Sat, 8 Mar 2014 00:27:49 +0100 Subject: [PATCH 3/3] doc: add two missing char methods doc-strings XID_* property are defined in UAX #31, just reference it here. 
Signed-off-by: Luca Bruno --- src/libstd/char.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/libstd/char.rs b/src/libstd/char.rs index ce9336d4909ac..4c0f77586db44 100644 --- a/src/libstd/char.rs +++ b/src/libstd/char.rs @@ -94,9 +94,19 @@ pub fn from_u32(i: u32) -> Option { /// Returns whether the specified `char` is considered a Unicode alphabetic /// code point pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) } -#[allow(missing_doc)] + +/// Returns whether the specified `char` satisfies the 'XID_Start' Unicode property +/// +/// 'XID_Start' is a Unicode Derived Property specified in +/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications), +/// mostly similar to 'ID_Start' but modified for closure under NFKx. pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) } -#[allow(missing_doc)] + +/// Returns whether the specified `char` satisfies the 'XID_Continue' Unicode property +/// +/// 'XID_Continue' is a Unicode Derived Property specified in +/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications), +/// mostly similar to 'ID_Continue' but modified for closure under NFKx. pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) } ///