提交 adfa580f 编写于 作者: R Raph Levien

Suppress grapheme cluster breaks in emoji with modifiers

An emoji with a modifier should be treated as a single grapheme, i.e.
it should not be possible to place the cursor between the base and
modifier.  This patch implements the proposed Rule GB9c from Mark
Davis's proposal entitled "Fixing breaking properties for emoji",
L2/16-011R3.

The patch also skips over variation sequences attached to the
preceding character, for computing grapheme cluster boundaries.

Bug: 26829153
Change-Id: Iff5bc2bb8e5246223a017c7cf33acfbf63817f16
上级 30bf8a7c
......@@ -77,6 +77,48 @@ bool isZwjEmoji(uint32_t c) {
|| c == 0x1F5E8); // LEFT SPEECH BUBBLE
}
// Emoji modifier test, following the Modifiers list in
// http://www.unicode.org/L2/L2016/16011-data-file.txt
// Returns true exactly when c lies in the inclusive range U+1F3FB..U+1F3FF.
bool isEmojiModifier(uint32_t c) {
    const uint32_t kFirstModifier = 0x1F3FB;
    const uint32_t kLastModifier = 0x1F3FF;
    if (c < kFirstModifier) {
        return false;
    }
    return c <= kLastModifier;
}
// Based on Emoji_Modifier_Base from
// http://www.unicode.org/Public/emoji/3.0/emoji-data.txt
//
// Returns true if c is one of the code points listed below, i.e. a character
// that may be followed by an emoji modifier to form a single grapheme cluster.
// The two outer range checks are only a fast path that rejects most code
// points early; membership is decided by the explicit list inside each branch.
bool isEmojiBase(uint32_t c) {
    if (0x261D <= c && c <= 0x270D) {
        return (c == 0x261D || c == 0x26F9 || (0x270A <= c && c <= 0x270D));
    } else if (0x1F385 <= c && c <= 0x1F93E) {
        // Each sub-range must be bounded on BOTH sides (&&). Using || here
        // would make every sub-range test trivially true and classify the
        // whole U+1F385..U+1F93E span as emoji bases.
        return (c == 0x1F385
                || (0x1F3C3 <= c && c <= 0x1F3C4)
                || (0x1F3CA <= c && c <= 0x1F3CB)
                || (0x1F442 <= c && c <= 0x1F443)
                || (0x1F446 <= c && c <= 0x1F450)
                || (0x1F466 <= c && c <= 0x1F469)
                || c == 0x1F46E
                || (0x1F470 <= c && c <= 0x1F478)
                || c == 0x1F47C
                || (0x1F481 <= c && c <= 0x1F483)
                || (0x1F485 <= c && c <= 0x1F487)
                || c == 0x1F4AA
                || c == 0x1F575
                || c == 0x1F57A
                || c == 0x1F590
                || (0x1F595 <= c && c <= 0x1F596)
                || (0x1F645 <= c && c <= 0x1F647)
                || (0x1F64B <= c && c <= 0x1F64F)
                || c == 0x1F6A3
                || (0x1F6B4 <= c && c <= 0x1F6B6)
                || c == 0x1F6C0
                || (0x1F918 <= c && c <= 0x1F91E)
                || c == 0x1F926
                || c == 0x1F930
                || (0x1F933 <= c && c <= 0x1F939)
                || (0x1F93B <= c && c <= 0x1F93E));
    } else {
        return false;
    }
}
bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
size_t offset) {
// This implementation closely follows Unicode Standard Annex #29 on
......@@ -165,6 +207,17 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co
return false;
}
}
// Proposed Rule GB9c from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
// E_Base x E_Modifier
if (isEmojiModifier(c2)) {
if (c1 == 0xFE0F && offset_back > start) {
// skip over emoji variation selector
U16_PREV(buf, start, offset_back, c1);
}
if (isEmojiBase(c1)) {
return false;
}
}
// Rule GB10, Any ÷ Any
return true;
}
......
......@@ -136,6 +136,30 @@ TEST(GraphemeBreak, tailoring) {
EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764"));
}
TEST(GraphemeBreak, emojiModifiers) {
// Checks grapheme breaking between an emoji and a following emoji modifier
// (U+1F3FB..U+1F3FF). NOTE(review): the '|' presumably marks the offset that
// IsBreak tests; IsBreak is defined elsewhere in this test file - confirm.
// An emoji base directly followed by a modifier must not be split.
EXPECT_FALSE(IsBreak("U+261D | U+1F3FB")); // white up pointing index + modifier
EXPECT_FALSE(IsBreak("U+270C | U+1F3FB")); // victory hand + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF")); // boy + modifier
EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF")); // sign of the horns + modifier
EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF")); // selfie (Unicode 9) + modifier
// an emoji style variation selector (U+FE0F) after the base does not affect the
// grapheme cluster, but a text style selector (U+FE0E) does allow a break
EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier
EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB")); // victory hand + emoji style + modifier
// heart is not an emoji base
EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB")); // heart + modifier
EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB")); // heart + text style + modifier
EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB")); // heart + emoji style + modifier
// rat is not an emoji modifier
EXPECT_TRUE(IsBreak("U+1F466 | U+1F400")); // boy + rat
}
TEST(GraphemeBreak, offsets) {
uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 };
EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 2));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册