提交 2692ae1d 编写于 作者: B bors

auto merge of #15619 : kwantam/rust/master, r=huonw

- `width()` computes the displayed width of a string, ignoring the width of control characters.
    - arguably we might do *something* else for control characters, but the question is, what?
    - users who want to do something else can iterate over chars()

- `graphemes()` returns a `Graphemes` struct, which implements an iterator over the grapheme clusters of a &str.
    - fully compliant with [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
    - passes all [Unicode-supplied tests](http://www.unicode.org/reports/tr41/tr41-15.html#Tests29)

- added code to generate additional categories in `unicode.py`
    - `Cn` aka `Not_Assigned`
    - categories necessary for grapheme cluster breaking

- tidied up the exports from libunicode
  - all exports are exposed through a module rather than directly at crate root.
  - std::prelude imports UnicodeChar and UnicodeStrSlice from std::char and std::str rather than directly from libunicode

closes #7043
......@@ -51,6 +51,30 @@ expanded_categories = {
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
# Grapheme cluster data
# taken from UAX29, http://www.unicode.org/reports/tr29/
# these code points are excluded from the Control category
# NOTE: CR and LF are also technically excluded, but for
# the sake of convenience we leave them in the Control group
# and manually check them in the appropriate place. This is
# still compliant with the implementation requirements.
grapheme_control_exceptions = set([0x200c, 0x200d])
# the Regional_Indicator category
grapheme_regional_indicator = [(0x1f1e6, 0x1f1ff)]
# "The following ... are specifically excluded" from the SpacingMark category
# http://www.unicode.org/reports/tr29/#SpacingMark
grapheme_spacingmark_exceptions = [(0x102b, 0x102c), (0x1038, 0x1038),
(0x1062, 0x1064), (0x1067, 0x106d), (0x1083, 0x1083), (0x1087, 0x108c),
(0x108f, 0x108f), (0x109a, 0x109c), (0x19b0, 0x19b4), (0x19b8, 0x19b9),
(0x19bb, 0x19c0), (0x19c8, 0x19c9), (0x1a61, 0x1a61), (0x1a63, 0x1a64),
(0xaa7b, 0xaa7b), (0xaa7d, 0xaa7d)]
# these are included in the SpacingMark category
grapheme_spacingmark_extra = set([0xe33, 0xeb3])
def fetch(f):
if not os.path.exists(f):
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
......@@ -109,7 +133,7 @@ def load_unicode_data(f):
canon_decomp[code] = seq
# place letter in categories as appropriate
for cat in [gencat] + expanded_categories.get(gencat, []):
for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
if cat not in gencats:
gencats[cat] = []
gencats[cat].append(code)
......@@ -120,6 +144,12 @@ def load_unicode_data(f):
combines[combine] = []
combines[combine].append(code)
# generate Not_Assigned from Assigned
gencats["Cn"] = gen_unassigned(gencats["Assigned"])
# Assigned is not a real category
del(gencats["Assigned"])
# Other contains Not_Assigned
gencats["C"].extend(gencats["Cn"])
gencats = group_cats(gencats)
combines = to_combines(group_cats(combines))
......@@ -155,6 +185,11 @@ def ungroup_cat(cat):
lo += 1
return cat_out
def gen_unassigned(assigned):
    """Return every unassigned (Cn) code point in ascending order.

    A code point is "unassigned" if it is a Unicode scalar value that does
    not appear in `assigned`. The surrogate block U+D800-U+DFFF is skipped
    entirely: surrogates are not scalar values and can never be a Rust
    `char`, so they belong to no category at all.
    """
    assigned = set(assigned)
    unassigned = []
    # The two ranges below are exactly the scalar-value space:
    # everything below the surrogates, then everything above them.
    for lo, hi in ((0, 0xd800), (0xe000, 0x110000)):
        unassigned.extend(cp for cp in range(lo, hi) if cp not in assigned)
    return unassigned
def to_combines(combs):
combs_out = []
for comb in combs:
......@@ -350,6 +385,45 @@ def emit_conversions_module(f, lowerupper, upperlower):
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
f.write("}\n\n")
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
    # Emit the `grapheme` submodule of the generated Rust tables file.
    #
    # f              -- open handle to the .rs file being generated
    # grapheme_table -- list of (first_char, last_char, category) ranges,
    #                   sorted by first_char so the emitted table can be
    #                   binary-searched
    # grapheme_cats  -- names of the grapheme-cluster-break categories;
    #                   each becomes one GC_* variant of the emitted enum
    #
    # NOTE(review): the Rust source inside the triple-quoted strings is
    # written to the output verbatim; its layout here reflects the page
    # this file was captured from -- confirm against the original.
    f.write("""pub mod grapheme {
use core::option::{Some, None};
use core::slice::ImmutableVector;
#[allow(non_camel_case_types)]
#[deriving(Clone)]
pub enum GraphemeCat {
""")
    # One variant per category, plus GC_Any as the fallback for code
    # points not covered by any range in the table.
    for cat in grapheme_cats + ["Any"]:
        f.write(" GC_" + cat + ",\n")
    # Lookup helper: binary-search the range table; anything outside every
    # range maps to GC_Any.
    f.write(""" }
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
use core::cmp::{Equal, Less, Greater};
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, cat) = r[idx];
cat
}
None => GC_Any
}
}
pub fn grapheme_category(c: char) -> GraphemeCat {
bsearch_range_value_table(c, grapheme_cat_table)
}
""")
    # The static table itself; pfun renders each (lo, hi, cat) triple as a
    # Rust tuple literal.
    emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]",
               pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]),
               is_pub=False)
    f.write("}\n")
def emit_charwidth_module(f, width_table):
f.write("pub mod charwidth {\n")
f.write(" use core::option::{Option, Some, None};\n")
......@@ -388,7 +462,7 @@ def emit_charwidth_module(f, width_table):
f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
f.write("}\n")
f.write("}\n\n")
def emit_norm_module(f, canon, compat, combine):
canon_keys = canon.keys()
......@@ -473,6 +547,8 @@ def remove_from_wtable(wtable, val):
wtable_out.extend(wtable)
return wtable_out
def optimize_width_table(wtable):
wtable_out = []
w_this = wtable.pop(0)
......@@ -487,7 +563,7 @@ def optimize_width_table(wtable):
return wtable_out
if __name__ == "__main__":
r = "unicode.rs"
r = "tables.rs"
if os.path.exists(r):
os.remove(r)
with open(r, "w") as rf:
......@@ -498,12 +574,18 @@ if __name__ == "__main__":
(canon_decomp, compat_decomp, gencats, combines,
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
other_derived = ["Default_Ignorable_Code_Point"]
other_derived = ["Default_Ignorable_Code_Point", "Grapheme_Extend"]
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
# grapheme cluster category from DerivedCoreProperties
# the rest are defined below
grapheme_cats = {}
grapheme_cats["Extend"] = derived["Grapheme_Extend"]
del(derived["Grapheme_Extend"])
# bsearch_range_table is used in all the property modules below
emit_bsearch_range_table(rf)
......@@ -533,7 +615,7 @@ if __name__ == "__main__":
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
emit_conversions_module(rf, lowerupper, upperlower)
# character width module
### character width module
width_table = []
for zwcat in ["Me", "Mn", "Cf"]:
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
......@@ -555,3 +637,40 @@ if __name__ == "__main__":
# optimize the width table by collapsing adjacent entities when possible
width_table = optimize_width_table(width_table)
emit_charwidth_module(rf, width_table)
### grapheme cluster module
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
# Hangul syllable categories
# These drive break rules GB6-GB8 and come straight from the
# HangulSyllableType data file.
want_hangul = ["L", "V", "T", "LV", "LVT"]
grapheme_cats.update(load_properties("HangulSyllableType.txt", want_hangul))
# Control
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
# Unicode Scalar Values only, and surrogates are thus invalid `char`s.
grapheme_cats["Control"] = set()
for cat in ["Zl", "Zp", "Cc", "Cf"]:
    grapheme_cats["Control"] |= set(ungroup_cat(gencats[cat]))
# Control = (Zl|Zp|Cc|Cf) minus the ZWNJ/ZWJ exceptions, plus unassigned
# Default_Ignorable code points. Python precedence (`-` and `&` bind
# tighter than `|`) yields exactly that grouping here.
grapheme_cats["Control"] = group_cat(list(
    grapheme_cats["Control"]
    - grapheme_control_exceptions
    | (set(ungroup_cat(gencats["Cn"]))
       & set(ungroup_cat(derived["Default_Ignorable_Code_Point"])))))
# Regional Indicator
grapheme_cats["RegionalIndicator"] = grapheme_regional_indicator
# Prepend - "Currently there are no characters with this value"
# (from UAX#29, Unicode 7.0)
# SpacingMark
# UAX#29: SpacingMark = General_Category Mc, minus anything already in
# Extend, plus the two extras (U+0E33, U+0EB3), minus the explicitly
# excluded list from tr29.
#
# BUGFIX: the exception list must be subtracted from the whole union.
# Without the parentheses, Python precedence (`-` binds tighter than `|`)
# parses this as `(Mc - Extend) | (extra - exceptions)`, which never
# removes the excluded Mc code points (U+102B etc.) and makes the
# `- exceptions` term a no-op, since the extras do not intersect it.
grapheme_cats["SpacingMark"] = group_cat(list(
    (set(ungroup_cat(gencats["Mc"]))
     - set(ungroup_cat(grapheme_cats["Extend"]))
     | grapheme_spacingmark_extra)
    - set(ungroup_cat(grapheme_spacingmark_exceptions))))
# Flatten every category's ranges into one table; the generated Rust code
# binary-searches it at runtime, falling back to GC_Any for code points
# that no range covers.
grapheme_table = []
for cat in grapheme_cats:
    grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
# ranges must be ordered by start point for the binary search to work
grapheme_table.sort(key=lambda w: w[0])
emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys())
此差异已折叠。
......@@ -237,7 +237,7 @@ fn file_product(p: &Path) -> IoResult<u32> {
use str;
use string::String;
use uint;
use unicode::UnicodeChar;
use unicode::char::UnicodeChar;
use vec::Vec;
// Reexports
......
......@@ -24,7 +24,7 @@
use slice::{Vector, ImmutableVector};
use str::{CharSplits, Str, StrAllocating, StrVector, StrSlice};
use string::String;
use unicode::UnicodeChar;
use unicode::char::UnicodeChar;
use vec::Vec;
use super::{contains_nul, BytesContainer, GenericPath, GenericPathUnsafe};
......
......@@ -59,7 +59,7 @@
#[doc(no_inline)] pub use ascii::{Ascii, AsciiCast, OwnedAsciiCast, AsciiStr};
#[doc(no_inline)] pub use ascii::IntoBytes;
#[doc(no_inline)] pub use c_str::ToCStr;
#[doc(no_inline)] pub use char::Char;
#[doc(no_inline)] pub use char::{Char, UnicodeChar};
#[doc(no_inline)] pub use clone::Clone;
#[doc(no_inline)] pub use cmp::{PartialEq, PartialOrd, Eq, Ord};
#[doc(no_inline)] pub use cmp::{Ordering, Less, Equal, Greater, Equiv};
......@@ -77,7 +77,7 @@
#[doc(no_inline)] pub use ptr::RawPtr;
#[doc(no_inline)] pub use io::{Buffer, Writer, Reader, Seek};
#[doc(no_inline)] pub use str::{Str, StrVector, StrSlice, OwnedStr};
#[doc(no_inline)] pub use str::{IntoMaybeOwned, StrAllocating};
#[doc(no_inline)] pub use str::{IntoMaybeOwned, StrAllocating, UnicodeStrSlice};
#[doc(no_inline)] pub use to_str::{ToString, IntoStr};
#[doc(no_inline)] pub use tuple::{Tuple1, Tuple2, Tuple3, Tuple4};
#[doc(no_inline)] pub use tuple::{Tuple5, Tuple6, Tuple7, Tuple8};
......@@ -89,7 +89,6 @@
#[doc(no_inline)] pub use slice::{Vector, VectorVector};
#[doc(no_inline)] pub use slice::MutableVectorAllocating;
#[doc(no_inline)] pub use string::String;
#[doc(no_inline)] pub use unicode::{UnicodeChar, UnicodeStrSlice};
#[doc(no_inline)] pub use vec::Vec;
// Reexported runtime types
......
......@@ -21,7 +21,7 @@
use result::{Ok, Err};
use str::StrSlice;
use sync::atomics;
use unicode::UnicodeChar;
use unicode::char::UnicodeChar;
pub use self::imp::write;
......
......@@ -33,13 +33,9 @@
extern crate core;
pub use tables::normalization::canonical_combining_class;
// regex module
pub use tables::regex;
pub use u_char::UnicodeChar;
pub use u_str::UnicodeStrSlice;
pub use u_str::Words;
mod decompose;
mod tables;
mod u_char;
......@@ -66,11 +62,22 @@ pub mod char {
pub use core::char::{from_digit, escape_unicode, escape_default};
pub use core::char::{len_utf8_bytes, Char};
pub use decompose::decompose_canonical;
pub use decompose::decompose_compatible;
pub use decompose::{decompose_canonical, decompose_compatible};
pub use tables::normalization::canonical_combining_class;
pub use u_char::{is_alphabetic, is_XID_start, is_XID_continue};
pub use u_char::{is_lowercase, is_uppercase, is_whitespace};
pub use u_char::{is_alphanumeric, is_control, is_digit};
pub use u_char::{to_uppercase, to_lowercase, width, UnicodeChar};
}
pub mod str {
pub use u_str::{UnicodeStrSlice, Words, Graphemes, GraphemeIndices};
}
// this lets us use #[deriving(Clone)]
mod std {
pub use core::clone;
pub use core::cmp;
}
此差异已折叠。
......@@ -15,11 +15,15 @@
* methods provided by the UnicodeChar trait.
*/
use core::clone::Clone;
use core::cmp;
use core::collections::Collection;
use core::iter::{Filter};
use core::iter::{Filter, AdditiveIterator, Iterator, DoubleEndedIterator};
use core::option::{Option, None, Some};
use core::str::{CharSplits, StrSlice};
use core::iter::Iterator;
use u_char;
use u_char::UnicodeChar;
use tables::grapheme::GraphemeCat;
/// An iterator over the words of a string, separated by a sequence of whitespace
pub type Words<'a> =
......@@ -27,6 +31,36 @@
/// Methods for Unicode string slices
pub trait UnicodeStrSlice<'a> {
/// Returns an iterator over the
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
/// of the string.
///
/// If `is_extended` is true, the iterator is over the *extended grapheme clusters*;
/// otherwise, the iterator is over the *legacy grapheme clusters*.
/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
/// recommends extended grapheme cluster boundaries for general processing.
///
/// # Example
///
/// ```rust
/// let gr1 = "a\u0310e\u0301o\u0308\u0332".graphemes(true).collect::<Vec<&str>>();
/// assert_eq!(gr1.as_slice(), &["a\u0310", "e\u0301", "o\u0308\u0332"]);
/// let gr2 = "a\r\nb🇷🇺🇸🇹".graphemes(true).collect::<Vec<&str>>();
/// assert_eq!(gr2.as_slice(), &["a", "\r\n", "b", "🇷🇺🇸🇹"]);
/// ```
fn graphemes(&self, is_extended: bool) -> Graphemes<'a>;
/// Returns an iterator over the grapheme clusters of self and their byte offsets.
/// See `graphemes()` method for more information.
///
/// # Example
///
/// ```rust
/// let gr_inds = "a̐éö̲\r\n".grapheme_indices(true).collect::<Vec<(uint, &str)>>();
/// assert_eq!(gr_inds.as_slice(), &[(0u, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]);
/// ```
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices<'a>;
/// An iterator over the words of a string (subsequences separated
/// by any sequence of whitespace). Sequences of whitespace are
/// collapsed, so empty "words" are not included.
......@@ -78,7 +112,7 @@ pub trait UnicodeStrSlice<'a> {
/// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// recommends that these characters be treated as 1 column (i.e.,
/// `is_cjk` = `false`) if the locale is unknown.
//fn width(&self, is_cjk: bool) -> uint;
fn width(&self, is_cjk: bool) -> uint;
/// Returns a string with leading and trailing whitespace removed.
fn trim(&self) -> &'a str;
......@@ -91,6 +125,16 @@ pub trait UnicodeStrSlice<'a> {
}
impl<'a> UnicodeStrSlice<'a> for &'a str {
#[inline]
fn graphemes(&self, is_extended: bool) -> Graphemes<'a> {
Graphemes { string: *self, extended: is_extended, cat: None, catb: None }
}
#[inline]
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices<'a> {
GraphemeIndices { start_offset: self.as_ptr() as uint, iter: self.graphemes(is_extended) }
}
#[inline]
fn words(&self) -> Words<'a> {
self.split(u_char::is_whitespace).filter(|s| !s.is_empty())
......@@ -102,6 +146,11 @@ fn is_whitespace(&self) -> bool { self.chars().all(u_char::is_whitespace) }
#[inline]
fn is_alphanumeric(&self) -> bool { self.chars().all(u_char::is_alphanumeric) }
#[inline]
fn width(&self, is_cjk: bool) -> uint {
self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
}
#[inline]
fn trim(&self) -> &'a str {
self.trim_left().trim_right()
......@@ -117,3 +166,257 @@ fn trim_right(&self) -> &'a str {
self.trim_right_chars(u_char::is_whitespace)
}
}
/// External iterator for grapheme clusters and byte offsets.
#[deriving(Clone)]
pub struct GraphemeIndices<'a> {
    // Address of the first byte of the *original* string; each yielded
    // cluster's pointer minus this value is its byte offset.
    start_offset: uint,
    // Underlying grapheme-cluster iterator that does the real work.
    iter: Graphemes<'a>,
}
impl<'a> Iterator<(uint, &'a str)> for GraphemeIndices<'a> {
    #[inline]
    fn next(&mut self) -> Option<(uint, &'a str)> {
        // Pull the next cluster from the wrapped iterator and pair it
        // with its byte offset, recovered from the slice's address.
        match self.iter.next() {
            Some(cluster) => Some((cluster.as_ptr() as uint - self.start_offset, cluster)),
            None => None
        }
    }

    #[inline]
    fn size_hint(&self) -> (uint, Option<uint>) {
        // Attaching offsets changes no element counts; the bounds are
        // exactly those of the inner iterator.
        self.iter.size_hint()
    }
}
impl<'a> DoubleEndedIterator<(uint, &'a str)> for GraphemeIndices<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<(uint, &'a str)> {
        // Same offset arithmetic as `next`, driven from the back end.
        match self.iter.next_back() {
            Some(cluster) => Some((cluster.as_ptr() as uint - self.start_offset, cluster)),
            None => None
        }
    }
}
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
#[deriving(Clone)]
pub struct Graphemes<'a> {
    // Not-yet-yielded remainder of the input; `next()` consumes from the
    // front, `next_back()` from the back.
    string: &'a str,
    // true: extended grapheme clusters (SpacingMark extends, rule GB9a);
    // false: legacy grapheme clusters.
    extended: bool,
    // Category of the first char of `string`, cached when a forward
    // boundary search had to look one char past the cluster it returned.
    cat: Option<GraphemeCat>,
    // Same one-char lookahead cache for the backward direction.
    catb: Option<GraphemeCat>,
}
// state machine for cluster boundary rules
//
// The Hangul states mean different things in the two scan directions:
// forward (`next`) they record the class of the jamo just consumed;
// backward (`next_back`) they record the class of the jamo to the
// *right* of the current char (see the comment block in `next_back`).
#[deriving(PartialEq,Eq)]
enum GraphemeState {
    // Nothing boundary-relevant seen yet for the current cluster.
    Start,
    // Only extending chars (GB9/GB9a) may continue the cluster now;
    // anything else ends it.
    FindExtend,
    // Inside a Hangul syllable (rules GB6-GB8).
    HangulL,
    HangulLV,
    HangulLVT,
    // Inside a run of Regional_Indicator chars (rule GB8a).
    Regional,
}
impl<'a> Iterator<&'a str> for Graphemes<'a> {
    #[inline]
    fn size_hint(&self) -> (uint, Option<uint>) {
        // Every cluster occupies at least one byte, so a non-empty string
        // yields between 1 and `len` clusters.
        let slen = self.string.len();
        (cmp::min(slen, 1u), Some(slen))
    }

    /// Scan forward from the front of the remaining string until a
    /// UAX#29 cluster boundary is found, then split there. `take_curr`
    /// records whether the char that stopped the scan belongs to this
    /// cluster (true) or begins the next one (false).
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        use gr = tables::grapheme;
        if self.string.len() == 0 {
            return None;
        }

        let mut take_curr = true;
        let mut idx = 0;
        let mut state = Start;
        let mut cat = gr::GC_Any;
        for (curr, ch) in self.string.char_indices() {
            idx = curr;

            // retrieve cached category, if any
            // We do this because most of the time we would end up
            // looking up each character twice.
            cat = match self.cat {
                None => gr::grapheme_category(ch),
                _ => self.cat.take_unwrap()
            };

            // Extending chars never end a cluster, regardless of state.
            if match cat {
                gr::GC_Extend => true,
                gr::GC_SpacingMark if self.extended => true,
                _ => false
            } {
                state = FindExtend;     // rule GB9/GB9a
                continue;
            }

            state = match state {
                Start if '\r' == ch => {
                    // CR absorbs an immediately following LF (GB3), then
                    // the cluster unconditionally ends (GB4).
                    let slen = self.string.len();
                    let nidx = idx + 1;
                    if nidx != slen && self.string.char_at(nidx) == '\n' {
                        idx = nidx;             // rule GB3
                    }
                    break;                      // rule GB4
                }
                Start => match cat {
                    gr::GC_Control => break,    // rule GB4: break after Control
                    gr::GC_L => HangulL,
                    gr::GC_LV | gr::GC_V => HangulLV,
                    gr::GC_LVT | gr::GC_T => HangulLVT,
                    gr::GC_RegionalIndicator => Regional,
                    _ => FindExtend
                },
                FindExtend => {         // found non-extending when looking for extending
                    take_curr = false;
                    break;
                },
                HangulL => match cat {      // rule GB6: L x (L|V|LV|LVT)
                    gr::GC_L => continue,
                    gr::GC_LV | gr::GC_V => HangulLV,
                    gr::GC_LVT => HangulLVT,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                HangulLV => match cat {     // rule GB7: (LV|V) x (V|T)
                    gr::GC_V => continue,
                    gr::GC_T => HangulLVT,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                HangulLVT => match cat {    // rule GB8: (LVT|T) x T
                    gr::GC_T => continue,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional => match cat {     // rule GB8a
                    gr::GC_RegionalIndicator => continue,
                    _ => {
                        take_curr = false;
                        break;
                    }
                }
            }
        }

        self.cat = if take_curr {
            // The char at `idx` is part of this cluster: include it and
            // clear the cache (we never looked past the boundary).
            idx = self.string.char_range_at(idx).next;
            None
        } else {
            // The char at `idx` starts the NEXT cluster; cache its
            // category so the next call skips one table lookup.
            Some(cat)
        };

        let retstr = self.string.slice_to(idx);
        self.string = self.string.slice_from(idx);
        Some(retstr)
    }
}
impl<'a> DoubleEndedIterator<&'a str> for Graphemes<'a> {
    /// Scan backwards from the end of the remaining string for the last
    /// cluster boundary, then split there. `idx` tracks the start of the
    /// char under inspection and `previdx` the start of the char to its
    /// right (examined on the previous iteration).
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        use gr = tables::grapheme;
        if self.string.len() == 0 {
            return None;
        }

        let mut take_curr = true;
        let mut idx = self.string.len();
        let mut previdx = idx;
        let mut state = Start;
        let mut cat = gr::GC_Any;
        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // cached category, if any
            cat = match self.catb {
                None => gr::grapheme_category(ch),
                _ => self.catb.take_unwrap()
            };

            // a matching state machine that runs *backwards* across an input string
            // note that this has some implications for the Hangul matching, since
            // we now need to know what the rightward letter is:
            //
            // Right to left, we have:
            //      L x L
            //      V x (L|V|LV)
            //      T x (V|T|LV|LVT)
            // HangulL means the letter to the right is L
            // HangulLV means the letter to the right is V
            // HangulLVT means the letter to the right is T
            state = match state {
                Start if '\n' == ch => {
                    // LF absorbs an immediately preceding CR (GB3), then
                    // the cluster unconditionally ends (GB4).
                    if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
                        idx -= 1;       // rule GB3
                    }
                    break;              // rule GB4
                },
                Start | FindExtend => match cat {
                    gr::GC_Extend => FindExtend,
                    gr::GC_SpacingMark if self.extended => FindExtend,
                    gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
                    gr::GC_V => HangulLV,
                    gr::GC_T => HangulLVT,
                    gr::GC_RegionalIndicator => Regional,
                    gr::GC_Control => {
                        // A Control char joins the cluster only when it is
                        // the very first char examined (state == Start).
                        take_curr = Start == state;
                        break;
                    },
                    _ => break
                },
                HangulL => match cat {      // char to right is an L
                    gr::GC_L => continue,               // L x L is the only legal match
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                HangulLV => match cat {     // char to right is a V
                    gr::GC_V => continue,               // V x V, right char is still V
                    gr::GC_L | gr::GC_LV => HangulL,    // (L|V) x V, right char is now L
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                HangulLVT => match cat {    // char to right is a T
                    gr::GC_T => continue,               // T x T, right char is still T
                    gr::GC_V => HangulLV,               // V x T, right char is now V
                    gr::GC_LV | gr::GC_LVT => HangulL,  // (LV|LVT) x T, right char is now L
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional => match cat {     // rule GB8a
                    gr::GC_RegionalIndicator => continue,
                    _ => {
                        take_curr = false;
                        break;
                    }
                }
            }
        }

        self.catb = if take_curr {
            // Every char scanned belongs to this cluster; nothing was
            // read past the boundary, so there is nothing to cache.
            None
        } else {
            // The char at `idx` belongs to the cluster to the LEFT; cut
            // at the char to its right and cache the category we read.
            idx = previdx;
            Some(cat)
        };

        let retstr = self.string.slice_from(idx);
        self.string = self.string.slice_to(idx);
        Some(retstr)
    }
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册