提交 2692ae1d 编写于 作者: B bors

auto merge of #15619 : kwantam/rust/master, r=huonw

- `width()` computes the displayed width of a string, ignoring the width of control characters.
    - arguably we might do *something* else for control characters, but the question is, what?
    - users who want to do something else can iterate over chars()

- `graphemes()` returns a `Graphemes` struct, which implements an iterator over the grapheme clusters of a &str.
    - fully compliant with [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
    - passes all [Unicode-supplied tests](http://www.unicode.org/reports/tr41/tr41-15.html#Tests29)

- added code to generate additional categories in `unicode.py`
    - `Cn` aka `Not_Assigned`
    - categories necessary for grapheme cluster breaking

- tidied up the exports from libunicode
  - all exports are exposed through a module rather than directly at crate root.
  - std::prelude imports UnicodeChar and UnicodeStrSlice from std::char and std::str rather than directly from libunicode

closes #7043
......@@ -51,6 +51,30 @@ expanded_categories = {
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
# Grapheme cluster data
# taken from UAX29, http://www.unicode.org/reports/tr29/
# these code points are excluded from the Control category
# NOTE: CR and LF are also technically excluded, but for
# the sake of convenience we leave them in the Control group
# and manually check them in the appropriate place. This is
# still compliant with the implementation requirements.
grapheme_control_exceptions = set([0x200c, 0x200d])
# the Regional_Indicator category
grapheme_regional_indicator = [(0x1f1e6, 0x1f1ff)]
# "The following ... are specifically excluded" from the SpacingMark category
# http://www.unicode.org/reports/tr29/#SpacingMark
grapheme_spacingmark_exceptions = [(0x102b, 0x102c), (0x1038, 0x1038),
(0x1062, 0x1064), (0x1067, 0x106d), (0x1083, 0x1083), (0x1087, 0x108c),
(0x108f, 0x108f), (0x109a, 0x109c), (0x19b0, 0x19b4), (0x19b8, 0x19b9),
(0x19bb, 0x19c0), (0x19c8, 0x19c9), (0x1a61, 0x1a61), (0x1a63, 0x1a64),
(0xaa7b, 0xaa7b), (0xaa7d, 0xaa7d)]
# these are included in the SpacingMark category
grapheme_spacingmark_extra = set([0xe33, 0xeb3])
def fetch(f):
if not os.path.exists(f):
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
......@@ -109,7 +133,7 @@ def load_unicode_data(f):
canon_decomp[code] = seq
# place letter in categories as appropriate
for cat in [gencat] + expanded_categories.get(gencat, []):
for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
if cat not in gencats:
gencats[cat] = []
gencats[cat].append(code)
......@@ -120,6 +144,12 @@ def load_unicode_data(f):
combines[combine] = []
combines[combine].append(code)
# generate Not_Assigned from Assigned
gencats["Cn"] = gen_unassigned(gencats["Assigned"])
# Assigned is not a real category
del(gencats["Assigned"])
# Other contains Not_Assigned
gencats["C"].extend(gencats["Cn"])
gencats = group_cats(gencats)
combines = to_combines(group_cats(combines))
......@@ -155,6 +185,11 @@ def ungroup_cat(cat):
lo += 1
return cat_out
def gen_unassigned(assigned):
    """Return every unassigned (Cn) code point in ascending order.

    A code point is "unassigned" if it is a Unicode scalar value that does
    not appear in `assigned`. The surrogate block U+D800-U+DFFF is skipped
    entirely: surrogates are not scalar values and can never be a Rust
    `char`, so they belong to no category at all.
    """
    assigned = set(assigned)
    unassigned = []
    # The two ranges below are exactly the scalar-value space:
    # everything below the surrogates, then everything above them.
    for lo, hi in ((0, 0xd800), (0xe000, 0x110000)):
        unassigned.extend(cp for cp in range(lo, hi) if cp not in assigned)
    return unassigned
def to_combines(combs):
combs_out = []
for comb in combs:
......@@ -350,6 +385,45 @@ def emit_conversions_module(f, lowerupper, upperlower):
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
f.write("}\n\n")
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
    # Emit the `grapheme` submodule of the generated Rust tables file.
    #
    # f              -- open handle to the .rs file being generated
    # grapheme_table -- list of (first_char, last_char, category) ranges,
    #                   sorted by first_char so the emitted table can be
    #                   binary-searched
    # grapheme_cats  -- names of the grapheme-cluster-break categories;
    #                   each becomes one GC_* variant of the emitted enum
    #
    # NOTE(review): the Rust source inside the triple-quoted strings is
    # written to the output verbatim; its layout here reflects the page
    # this file was captured from -- confirm against the original.
    f.write("""pub mod grapheme {
use core::option::{Some, None};
use core::slice::ImmutableVector;
#[allow(non_camel_case_types)]
#[deriving(Clone)]
pub enum GraphemeCat {
""")
    # One variant per category, plus GC_Any as the fallback for code
    # points not covered by any range in the table.
    for cat in grapheme_cats + ["Any"]:
        f.write(" GC_" + cat + ",\n")
    # Lookup helper: binary-search the range table; anything outside every
    # range maps to GC_Any.
    f.write(""" }
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, cat) = r[idx];
cat
}
None => GC_Any
}
}
pub fn grapheme_category(c: char) -> GraphemeCat {
bsearch_range_value_table(c, grapheme_cat_table)
}
""")
    # The static table itself; pfun renders each (lo, hi, cat) triple as a
    # Rust tuple literal.
    emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]",
               pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]),
               is_pub=False)
    f.write("}\n")
def emit_charwidth_module(f, width_table):
f.write("pub mod charwidth {\n")
f.write(" use core::option::{Option, Some, None};\n")
......@@ -388,7 +462,7 @@ def emit_charwidth_module(f, width_table):
f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
f.write("}\n")
f.write("}\n\n")
def emit_norm_module(f, canon, compat, combine):
canon_keys = canon.keys()
......@@ -473,6 +547,8 @@ def remove_from_wtable(wtable, val):
wtable_out.extend(wtable)
return wtable_out
def optimize_width_table(wtable):
wtable_out = []
w_this = wtable.pop(0)
......@@ -487,7 +563,7 @@ def optimize_width_table(wtable):
return wtable_out
if __name__ == "__main__":
r = "unicode.rs"
r = "tables.rs"
if os.path.exists(r):
os.remove(r)
with open(r, "w") as rf:
......@@ -498,12 +574,18 @@ if __name__ == "__main__":
(canon_decomp, compat_decomp, gencats, combines,
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
other_derived = ["Default_Ignorable_Code_Point"]
other_derived = ["Default_Ignorable_Code_Point", "Grapheme_Extend"]
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
# grapheme cluster category from DerivedCoreProperties
# the rest are defined below
grapheme_cats = {}
grapheme_cats["Extend"] = derived["Grapheme_Extend"]
del(derived["Grapheme_Extend"])
# bsearch_range_table is used in all the property modules below
emit_bsearch_range_table(rf)
......@@ -533,7 +615,7 @@ if __name__ == "__main__":
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
emit_conversions_module(rf, lowerupper, upperlower)
# character width module
### character width module
width_table = []
for zwcat in ["Me", "Mn", "Cf"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
......@@ -555,3 +637,40 @@ if __name__ == "__main__":
# optimize the width table by collapsing adjacent entities when possible
width_table = optimize_width_table(width_table)
emit_charwidth_module(rf, width_table)
### grapheme cluster module
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
# Hangul syllable categories
# These drive break rules GB6-GB8 and come straight from the
# HangulSyllableType data file.
want_hangul = ["L", "V", "T", "LV", "LVT"]
grapheme_cats.update(load_properties("HangulSyllableType.txt", want_hangul))
# Control
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
# Unicode Scalar Values only, and surrogates are thus invalid `char`s.
grapheme_cats["Control"] = set()
for cat in ["Zl", "Zp", "Cc", "Cf"]:
    grapheme_cats["Control"] |= set(ungroup_cat(gencats[cat]))
# Control = (Zl|Zp|Cc|Cf) minus the ZWNJ/ZWJ exceptions, plus unassigned
# Default_Ignorable code points. Python precedence (`-` and `&` bind
# tighter than `|`) yields exactly that grouping here.
grapheme_cats["Control"] = group_cat(list(
    grapheme_cats["Control"]
    - grapheme_control_exceptions
    | (set(ungroup_cat(gencats["Cn"]))
       & set(ungroup_cat(derived["Default_Ignorable_Code_Point"])))))
# Regional Indicator
grapheme_cats["RegionalIndicator"] = grapheme_regional_indicator
# Prepend - "Currently there are no characters with this value"
# (from UAX#29, Unicode 7.0)
# SpacingMark
# UAX#29: SpacingMark = General_Category Mc, minus anything already in
# Extend, plus the two extras (U+0E33, U+0EB3), minus the explicitly
# excluded list from tr29.
#
# BUGFIX: the exception list must be subtracted from the whole union.
# Without the parentheses, Python precedence (`-` binds tighter than `|`)
# parses this as `(Mc - Extend) | (extra - exceptions)`, which never
# removes the excluded Mc code points (U+102B etc.) and makes the
# `- exceptions` term a no-op, since the extras do not intersect it.
grapheme_cats["SpacingMark"] = group_cat(list(
    (set(ungroup_cat(gencats["Mc"]))
     - set(ungroup_cat(grapheme_cats["Extend"]))
     | grapheme_spacingmark_extra)
    - set(ungroup_cat(grapheme_spacingmark_exceptions))))
# Flatten every category's ranges into one table; the generated Rust code
# binary-searches it at runtime, falling back to GC_Any for code points
# that no range covers.
grapheme_table = []
for cat in grapheme_cats:
    grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
# ranges must be ordered by start point for the binary search to work
grapheme_table.sort(key=lambda w: w[0])
emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys())
此差异已折叠。
......@@ -237,7 +237,7 @@ fn file_product(p: &Path) -> IoResult<u32> {
use str;
use string::String;
use uint;
use unicode::UnicodeChar;
use unicode::char::UnicodeChar;
use vec::Vec;
// Reexports
......
......@@ -24,7 +24,7 @@
use slice::{Vector, ImmutableVector};
use str::{CharSplits, Str, StrAllocating, StrVector, StrSlice};
use string::String;
use unicode::UnicodeChar;
use unicode::char::UnicodeChar;
use vec::Vec;
use super::{contains_nul, BytesContainer, GenericPath, GenericPathUnsafe};
......
......@@ -59,7 +59,7 @@
#[doc(no_inline)] pub use ascii::{Ascii, AsciiCast, OwnedAsciiCast, AsciiStr};
#[doc(no_inline)] pub use ascii::IntoBytes;
#[doc(no_inline)] pub use c_str::ToCStr;
#[doc(no_inline)] pub use char::Char;
#[doc(no_inline)] pub use char::{Char, UnicodeChar};
#[doc(no_inline)] pub use clone::Clone;
#[doc(no_inline)] pub use cmp::{PartialEq, PartialOrd, Eq, Ord};
#[doc(no_inline)] pub use cmp::{Ordering, Less, Equal, Greater, Equiv};
......@@ -77,7 +77,7 @@
#[doc(no_inline)] pub use ptr::RawPtr;
#[doc(no_inline)] pub use io::{Buffer, Writer, Reader, Seek};
#[doc(no_inline)] pub use str::{Str, StrVector, StrSlice, OwnedStr};
#[doc(no_inline)] pub use str::{IntoMaybeOwned, StrAllocating};
#[doc(no_inline)] pub use str::{IntoMaybeOwned, StrAllocating, UnicodeStrSlice};
#[doc(no_inline)] pub use to_str::{ToString, IntoStr};
#[doc(no_inline)] pub use tuple::{Tuple1, Tuple2, Tuple3, Tuple4};
#[doc(no_inline)] pub use tuple::{Tuple5, Tuple6, Tuple7, Tuple8};
......@@ -89,7 +89,6 @@
#[doc(no_inline)] pub use slice::{Vector, VectorVector};
#[doc(no_inline)] pub use slice::MutableVectorAllocating;
#[doc(no_inline)] pub use string::String;
#[doc(no_inline)] pub use unicode::{UnicodeChar, UnicodeStrSlice};
#[doc(no_inline)] pub use vec::Vec;
// Reexported runtime types
......
......@@ -21,7 +21,7 @@
use result::{Ok, Err};
use str::StrSlice;
use sync::atomics;
use unicode::UnicodeChar;
use unicode::char::UnicodeChar;
pub use self::imp::write;
......
......@@ -33,13 +33,9 @@
extern crate core;
pub use tables::normalization::canonical_combining_class;
// regex module
pub use tables::regex;
pub use u_char::UnicodeChar;
pub use u_str::UnicodeStrSlice;
pub use u_str::Words;
mod decompose;
mod tables;
mod u_char;
......@@ -66,11 +62,22 @@ pub mod char {
pub use core::char::{from_digit, escape_unicode, escape_default};
pub use core::char::{len_utf8_bytes, Char};
pub use decompose::decompose_canonical;
pub use decompose::decompose_compatible;
pub use decompose::{decompose_canonical, decompose_compatible};
pub use tables::normalization::canonical_combining_class;
pub use u_char::{is_alphabetic, is_XID_start, is_XID_continue};
pub use u_char::{is_lowercase, is_uppercase, is_whitespace};
pub use u_char::{is_alphanumeric, is_control, is_digit};
pub use u_char::{to_uppercase, to_lowercase, width, UnicodeChar};
}
pub mod str {
pub use u_str::{UnicodeStrSlice, Words, Graphemes, GraphemeIndices};
}
// this lets us use #[deriving(Clone)]
mod std {
pub use core::clone;
pub use core::cmp;
}
此差异已折叠。
......@@ -15,11 +15,15 @@
* methods provided by the UnicodeChar trait.
*/
use core::clone::Clone;
use core::cmp;
use core::collections::Collection;
use core::iter::{Filter};
use core::iter::{Filter, AdditiveIterator, Iterator, DoubleEndedIterator};
use core::option::{Option, None, Some};
use core::str::{CharSplits, StrSlice};
use core::iter::Iterator;
use u_char;
use u_char::UnicodeChar;
use tables::grapheme::GraphemeCat;
/// An iterator over the words of a string, separated by a sequence of whitespace
pub type Words<'a> =
......@@ -27,6 +31,36 @@
/// Methods for Unicode string slices
pub trait UnicodeStrSlice<'a> {
/// Returns an iterator over the
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
/// of the string.
///
/// If `is_extended` is true, the iterator is over the *extended grapheme clusters*;
/// otherwise, the iterator is over the *legacy grapheme clusters*.
/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
/// recommends extended grapheme cluster boundaries for general processing.
///
/// # Example
///
/// ```rust
/// let gr1 = "a\u0310e\u0301o\u0308\u0332".graphemes(true).collect::<Vec<&str>>();
/// assert_eq!(gr1.as_slice(), &["a\u0310", "e\u0301", "o\u0308\u0332"]);
/// let gr2 = "a\r\nb🇷🇺🇸🇹".graphemes(true).collect::<Vec<&str>>();
/// assert_eq!(gr2.as_slice(), &["a", "\r\n", "b", "🇷🇺🇸🇹"]);
/// ```
fn graphemes(&self, is_extended: bool) -> Graphemes<'a>;
/// Returns an iterator over the grapheme clusters of self and their byte offsets.
/// See `graphemes()` method for more information.
///
/// # Example
///
/// ```rust
/// let gr_inds = "a̐éö̲\r\n".grapheme_indices(true).collect::<Vec<(uint, &str)>>();
/// assert_eq!(gr_inds.as_slice(), &[(0u, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]);
/// ```
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices<'a>;
/// An iterator over the words of a string (subsequences separated
/// by any sequence of whitespace). Sequences of whitespace are
/// collapsed, so empty "words" are not included.
......@@ -78,7 +112,7 @@ pub trait UnicodeStrSlice<'a> {
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// recommends that these characters be treated as 1 column (i.e.,
/// `is_cjk` = `false`) if the locale is unknown.
//fn width(&self, is_cjk: bool) -> uint;
fn width(&self, is_cjk: bool) -> uint;
/// Returns a string with leading and trailing whitespace removed.
fn trim(&self) -> &'a str;
......@@ -91,6 +125,16 @@ pub trait UnicodeStrSlice<'a> {
}
impl<'a> UnicodeStrSlice<'a> for &'a str {
#[inline]
fn graphemes(&self, is_extended: bool) -> Graphemes<'a> {
Graphemes { string: *self, extended: is_extended, cat: None, catb: None }
}
#[inline]
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices<'a> {
GraphemeIndices { start_offset: self.as_ptr() as uint, iter: self.graphemes(is_extended) }
}
#[inline]
fn words(&self) -> Words<'a> {
self.split(u_char::is_whitespace).filter(|s| !s.is_empty())
......@@ -102,6 +146,11 @@ fn is_whitespace(&self) -> bool { self.chars().all(u_char::is_whitespace) }
#[inline]
fn is_alphanumeric(&self) -> bool { self.chars().all(u_char::is_alphanumeric) }
#[inline]
fn width(&self, is_cjk: bool) -> uint {
self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
}
#[inline]
fn trim(&self) -> &'a str {
self.trim_left().trim_right()
......@@ -117,3 +166,257 @@ fn trim_right(&self) -> &'a str {
self.trim_right_chars(u_char::is_whitespace)
}
}
/// External iterator for grapheme clusters and byte offsets.
#[deriving(Clone)]
pub struct GraphemeIndices<'a> {
    // Address of the first byte of the *original* string; each yielded
    // cluster's pointer minus this value is its byte offset.
    start_offset: uint,
    // Underlying grapheme-cluster iterator that does the real work.
    iter: Graphemes<'a>,
}
impl<'a> Iterator<(uint, &'a str)> for GraphemeIndices<'a> {
    #[inline]
    fn next(&mut self) -> Option<(uint, &'a str)> {
        // Pull the next cluster from the wrapped iterator and pair it
        // with its byte offset, recovered from the slice's address.
        match self.iter.next() {
            Some(cluster) => Some((cluster.as_ptr() as uint - self.start_offset, cluster)),
            None => None
        }
    }

    #[inline]
    fn size_hint(&self) -> (uint, Option<uint>) {
        // Attaching offsets changes no element counts; the bounds are
        // exactly those of the inner iterator.
        self.iter.size_hint()
    }
}
impl<'a> DoubleEndedIterator<(uint, &'a str)> for GraphemeIndices<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<(uint, &'a str)> {
        // Same offset arithmetic as `next`, driven from the back end.
        match self.iter.next_back() {
            Some(cluster) => Some((cluster.as_ptr() as uint - self.start_offset, cluster)),
            None => None
        }
    }
}
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
#[deriving(Clone)]
pub struct Graphemes<'a> {
    // Not-yet-yielded remainder of the input; `next()` consumes from the
    // front, `next_back()` from the back.
    string: &'a str,
    // true: extended grapheme clusters (SpacingMark extends, rule GB9a);
    // false: legacy grapheme clusters.
    extended: bool,
    // Category of the first char of `string`, cached when a forward
    // boundary search had to look one char past the cluster it returned.
    cat: Option<GraphemeCat>,
    // Same one-char lookahead cache for the backward direction.
    catb: Option<GraphemeCat>,
}
// state machine for cluster boundary rules
//
// The Hangul states mean different things in the two scan directions:
// forward (`next`) they record the class of the jamo just consumed;
// backward (`next_back`) they record the class of the jamo to the
// *right* of the current char (see the comment block in `next_back`).
#[deriving(PartialEq,Eq)]
enum GraphemeState {
    // Nothing boundary-relevant seen yet for the current cluster.
    Start,
    // Only extending chars (GB9/GB9a) may continue the cluster now;
    // anything else ends it.
    FindExtend,
    // Inside a Hangul syllable (rules GB6-GB8).
    HangulL,
    HangulLV,
    HangulLVT,
    // Inside a run of Regional_Indicator chars (rule GB8a).
    Regional,
}
impl<'a> Iterator<&'a str> for Graphemes<'a> {
    #[inline]
    fn size_hint(&self) -> (uint, Option<uint>) {
        // Every cluster occupies at least one byte, so a non-empty string
        // yields between 1 and `len` clusters.
        let slen = self.string.len();
        (cmp::min(slen, 1u), Some(slen))
    }

    /// Scan forward from the front of the remaining string until a
    /// UAX#29 cluster boundary is found, then split there. `take_curr`
    /// records whether the char that stopped the scan belongs to this
    /// cluster (true) or begins the next one (false).
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        use gr = tables::grapheme;
        if self.string.len() == 0 {
            return None;
        }

        let mut take_curr = true;
        let mut idx = 0;
        let mut state = Start;
        let mut cat = gr::GC_Any;
        for (curr, ch) in self.string.char_indices() {
            idx = curr;

            // retrieve cached category, if any
            // We do this because most of the time we would end up
            // looking up each character twice.
            cat = match self.cat {
                None => gr::grapheme_category(ch),
                _ => self.cat.take_unwrap()
            };

            // Extending chars never end a cluster, regardless of state.
            if match cat {
                gr::GC_Extend => true,
                gr::GC_SpacingMark if self.extended => true,
                _ => false
            } {
                state = FindExtend;     // rule GB9/GB9a
                continue;
            }

            state = match state {
                Start if '\r' == ch => {
                    // CR absorbs an immediately following LF (GB3), then
                    // the cluster unconditionally ends (GB4).
                    let slen = self.string.len();
                    let nidx = idx + 1;
                    if nidx != slen && self.string.char_at(nidx) == '\n' {
                        idx = nidx;             // rule GB3
                    }
                    break;                      // rule GB4
                }
                Start => match cat {
                    gr::GC_Control => break,    // rule GB4: break after Control
                    gr::GC_L => HangulL,
                    gr::GC_LV | gr::GC_V => HangulLV,
                    gr::GC_LVT | gr::GC_T => HangulLVT,
                    gr::GC_RegionalIndicator => Regional,
                    _ => FindExtend
                },
                FindExtend => {         // found non-extending when looking for extending
                    take_curr = false;
                    break;
                },
                HangulL => match cat {      // rule GB6: L x (L|V|LV|LVT)
                    gr::GC_L => continue,
                    gr::GC_LV | gr::GC_V => HangulLV,
                    gr::GC_LVT => HangulLVT,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                HangulLV => match cat {     // rule GB7: (LV|V) x (V|T)
                    gr::GC_V => continue,
                    gr::GC_T => HangulLVT,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                HangulLVT => match cat {    // rule GB8: (LVT|T) x T
                    gr::GC_T => continue,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional => match cat {     // rule GB8a
                    gr::GC_RegionalIndicator => continue,
                    _ => {
                        take_curr = false;
                        break;
                    }
                }
            }
        }

        self.cat = if take_curr {
            // The char at `idx` is part of this cluster: include it and
            // clear the cache (we never looked past the boundary).
            idx = self.string.char_range_at(idx).next;
            None
        } else {
            // The char at `idx` starts the NEXT cluster; cache its
            // category so the next call skips one table lookup.
            Some(cat)
        };

        let retstr = self.string.slice_to(idx);
        self.string = self.string.slice_from(idx);
        Some(retstr)
    }
}
impl<'a> DoubleEndedIterator<&'a str> for Graphemes<'a> {
    /// Scan backwards from the end of the remaining string for the last
    /// cluster boundary, then split there. `idx` tracks the start of the
    /// char under inspection and `previdx` the start of the char to its
    /// right (examined on the previous iteration).
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        use gr = tables::grapheme;
        if self.string.len() == 0 {
            return None;
        }

        let mut take_curr = true;
        let mut idx = self.string.len();
        let mut previdx = idx;
        let mut state = Start;
        let mut cat = gr::GC_Any;
        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // cached category, if any
            cat = match self.catb {
                None => gr::grapheme_category(ch),
                _ => self.catb.take_unwrap()
            };

            // a matching state machine that runs *backwards* across an input string
            // note that this has some implications for the Hangul matching, since
            // we now need to know what the rightward letter is:
            //
            // Right to left, we have:
            //      L x L
            //      V x (L|V|LV)
            //      T x (V|T|LV|LVT)
            // HangulL means the letter to the right is L
            // HangulLV means the letter to the right is V
            // HangulLVT means the letter to the right is T
            state = match state {
                Start if '\n' == ch => {
                    // LF absorbs an immediately preceding CR (GB3), then
                    // the cluster unconditionally ends (GB4).
                    if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
                        idx -= 1;       // rule GB3
                    }
                    break;              // rule GB4
                },
                Start | FindExtend => match cat {
                    gr::GC_Extend => FindExtend,
                    gr::GC_SpacingMark if self.extended => FindExtend,
                    gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
                    gr::GC_V => HangulLV,
                    gr::GC_T => HangulLVT,
                    gr::GC_RegionalIndicator => Regional,
                    gr::GC_Control => {
                        // A Control char joins the cluster only when it is
                        // the very first char examined (state == Start).
                        take_curr = Start == state;
                        break;
                    },
                    _ => break
                },
                HangulL => match cat {      // char to right is an L
                    gr::GC_L => continue,               // L x L is the only legal match
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                HangulLV => match cat {     // char to right is a V
                    gr::GC_V => continue,               // V x V, right char is still V
                    gr::GC_L | gr::GC_LV => HangulL,    // (L|V) x V, right char is now L
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                HangulLVT => match cat {    // char to right is a T
                    gr::GC_T => continue,               // T x T, right char is still T
                    gr::GC_V => HangulLV,               // V x T, right char is now V
                    gr::GC_LV | gr::GC_LVT => HangulL,  // (LV|LVT) x T, right char is now L
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional => match cat {     // rule GB8a
                    gr::GC_RegionalIndicator => continue,
                    _ => {
                        take_curr = false;
                        break;
                    }
                }
            }
        }

        self.catb = if take_curr {
            // Every char scanned belongs to this cluster; nothing was
            // read past the boundary, so there is nothing to cache.
            None
        } else {
            // The char at `idx` belongs to the cluster to the LEFT; cut
            // at the char to its right and cache the category we read.
            idx = previdx;
            Some(cat)
        };

        let retstr = self.string.slice_from(idx);
        self.string = self.string.slice_to(idx);
        Some(retstr)
    }
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册