提交 94d55255 编写于 作者: R Raph Levien 提交者: Android (Google) Code Review

Merge "Suppress grapheme cluster breaks in emoji with modifiers" into nyc-dev

......@@ -77,6 +77,48 @@ bool isZwjEmoji(uint32_t c) {
|| c == 0x1F5E8); // LEFT SPEECH BUBBLE
}
// Reports whether |c| falls in the emoji modifier range U+1F3FB..U+1F3FF.
// The range is taken from the Modifiers list in
// http://www.unicode.org/L2/L2016/16011-data-file.txt
bool isEmojiModifier(uint32_t c) {
    constexpr uint32_t kFirstModifier = 0x1F3FB;
    constexpr uint32_t kLastModifier = 0x1F3FF;
    if (c < kFirstModifier) {
        return false;
    }
    return c <= kLastModifier;
}
// Returns true if |c| is an emoji modifier base, i.e. a code point that may be
// followed by an emoji modifier to form a single grapheme cluster.
// Based on Emoji_Modifier_Base from
// http://www.unicode.org/Public/emoji/3.0/emoji-data.txt
//
// The outer range checks are a fast reject: only code points inside one of the
// two covering intervals are compared against the individual entries.
bool isEmojiBase(uint32_t c) {
    if (0x261D <= c && c <= 0x270D) {
        return (c == 0x261D || c == 0x26F9 || (0x270A <= c && c <= 0x270D));
    } else if (0x1F385 <= c && c <= 0x1F93E) {
        // Each sub-range must be bounded on both sides (&&). Using || here would
        // make every test trivially true and accept the whole covering interval.
        return (c == 0x1F385
                || (0x1F3C3 <= c && c <= 0x1F3C4)
                || (0x1F3CA <= c && c <= 0x1F3CB)
                || (0x1F442 <= c && c <= 0x1F443)
                || (0x1F446 <= c && c <= 0x1F450)
                || (0x1F466 <= c && c <= 0x1F469)
                || c == 0x1F46E
                || (0x1F470 <= c && c <= 0x1F478)
                || c == 0x1F47C
                || (0x1F481 <= c && c <= 0x1F483)
                || (0x1F485 <= c && c <= 0x1F487)
                || c == 0x1F4AA
                || c == 0x1F575
                || c == 0x1F57A
                || c == 0x1F590
                || (0x1F595 <= c && c <= 0x1F596)
                || (0x1F645 <= c && c <= 0x1F647)
                || (0x1F64B <= c && c <= 0x1F64F)
                || c == 0x1F6A3
                || (0x1F6B4 <= c && c <= 0x1F6B6)
                || c == 0x1F6C0
                || (0x1F918 <= c && c <= 0x1F91E)
                || c == 0x1F926
                || c == 0x1F930
                || (0x1F933 <= c && c <= 0x1F939)
                || (0x1F93B <= c && c <= 0x1F93E));
    } else {
        return false;
    }
}
bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
size_t offset) {
// This implementation closely follows Unicode Standard Annex #29 on
......@@ -165,6 +207,17 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co
return false;
}
}
// Proposed Rule GB9c from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
// E_Base x E_Modifier
if (isEmojiModifier(c2)) {
if (c1 == 0xFE0F && offset_back > start) {
// skip over emoji variation selector
U16_PREV(buf, start, offset_back, c1);
}
if (isEmojiBase(c1)) {
return false;
}
}
// Rule GB10, Any ÷ Any
return true;
}
......
......@@ -136,6 +136,30 @@ TEST(GraphemeBreak, tailoring) {
EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764"));
}
// Exercises the E_Base x E_Modifier handling: an emoji modifier directly after
// an emoji modifier base must not start a new grapheme cluster, while any other
// pairing must still break.
// NOTE(review): '|' in the IsBreak strings presumably marks the offset under
// test - confirm against the IsBreak helper.
TEST(GraphemeBreak, emojiModifiers) {
EXPECT_FALSE(IsBreak("U+261D | U+1F3FB")); // white up pointing index + modifier
EXPECT_FALSE(IsBreak("U+270C | U+1F3FB")); // victory hand + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF")); // sign of the horns + modifier
EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF")); // selfie (Unicode 9) + modifier
// adding the emoji style variation selector (U+FE0F) doesn't affect the grapheme
// cluster; the text style selector (U+FE0E) is expected to break
EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier
EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB")); // victory hand + emoji style + modifier
// heart is not an emoji base
EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB")); // heart + modifier
EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB")); // heart + text style + modifier
EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB")); // heart + emoji style + modifier
// rat is not an emoji modifier
EXPECT_TRUE(IsBreak("U+1F466 | U+1F400")); // boy + rat
}
TEST(GraphemeBreak, offsets) {
uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 };
EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 2));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册