From bda3ceda039f0d37c726ca62e0bac457ce39d071 Mon Sep 17 00:00:00 2001 From: Steve Klabnik Date: Thu, 28 Aug 2014 13:56:55 -0400 Subject: [PATCH 1/2] Add note about string indexing. Thanks @chris-morgan! --- src/doc/guide-strings.md | 56 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/doc/guide-strings.md b/src/doc/guide-strings.md index 6c6d6e3689935..bf762d13b780d 100644 --- a/src/doc/guide-strings.md +++ b/src/doc/guide-strings.md @@ -121,6 +121,62 @@ fn compare(string: String) { Converting a `String` to a `&str` is cheap, but converting the `&str` to a `String` involves an allocation. +## Indexing strings + +You may be tempted to try to access a certain character of a `String`, like +this: + +```{rust,ignore} +let s = "hello".to_string(); + +println!("{}", s[0]); +``` + +This does not compile. This is on purpose. In the world of UTF-8, direct +indexing is basically never what you want to do. The reason is that each +charater can be a variable number of bytes. This means that you have to iterate +through the characters anyway, which is a O(n) operation. + +To iterate over a string, use the `graphemes()` method on `&str`: + +```{rust} +let s = "αἰθήρ"; + +for l in s.graphemes(true) { + println!("{}", l); +} +``` + +This will print out each character in turn, as you'd expect: first "α", then +"ἰ", etc. You can see that this is different than just the individual bytes. +Here's a version that prints out each byte: + +```{rust} +let s = "αἰθήρ"; + +for l in s.as_bytes().iter() { + println!("{}", l); +} +``` + +This will print: + +```{notrust,ignore} +206 +177 +225 +188 +176 +206 +184 +206 +174 +207 +129 +``` + +Many more bytes than graphemes! 
+ # Other Documentation * [the `&str` API documentation](/std/str/index.html) From 8ddb9c71c3f4c18a2679d498d5fe65a9b6516270 Mon Sep 17 00:00:00 2001 From: Steve Klabnik Date: Thu, 28 Aug 2014 14:05:33 -0400 Subject: [PATCH 2/2] Add section about Str trait --- src/doc/guide-strings.md | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/doc/guide-strings.md b/src/doc/guide-strings.md index bf762d13b780d..a49132ec8be94 100644 --- a/src/doc/guide-strings.md +++ b/src/doc/guide-strings.md @@ -92,9 +92,33 @@ fn foo(s: String) { ``` If you have good reason. It's not polite to hold on to ownership you don't -need, and it can make your lifetimes more complex. Furthermore, you can pass -either kind of string into `foo` by using `.as_slice()` on any `String` you -need to pass in, so the `&str` version is more flexible. +need, and it can make your lifetimes more complex. + +## Generic functions + +To write a function that's generic over types of strings, use [the `Str` +trait](http://doc.rust-lang.org/std/str/trait.Str.html): + +```{rust} +fn some_string_length<T: Str>(x: T) -> uint { + x.as_slice().len() +} + +fn main() { + let s = "Hello, world"; + + println!("{}", some_string_length(s)); + + let s = "Hello, world".to_string(); + + println!("{}", some_string_length(s)); +} +``` + +Both of these lines will print `12`. + +The only method that the `Str` trait has is `as_slice()`, which gives you +access to a `&str` value from the underlying string. ## Comparisons @@ -134,7 +158,7 @@ println!("{}", s[0]); This does not compile. This is on purpose. In the world of UTF-8, direct indexing is basically never what you want to do. The reason is that each -charater can be a variable number of bytes. This means that you have to iterate +character can be a variable number of bytes. This means that you have to iterate through the characters anyway, which is a O(n) operation. 
To iterate over a string, use the `graphemes()` method on `&str`: @@ -147,6 +171,9 @@ for l in s.graphemes(true) { } ``` +Note that `l` has the type `&str` here, since a single grapheme can consist of +multiple codepoints, so a `char` wouldn't be appropriate. + This will print out each character in turn, as you'd expect: first "α", then "ἰ", etc. You can see that this is different than just the individual bytes. Here's a version that prints out each byte: @@ -154,7 +181,7 @@ Here's a version that prints out each byte: ```{rust} let s = "αἰθήρ"; -for l in s.as_bytes().iter() { +for l in s.bytes() { println!("{}", l); } ```