提交 6638e05a 编写于 作者: R Raph Levien

Tailor grapheme boundaries so sequence emoji are one grapheme

Make it so it's not possible to position the cursor inside an emoji
formed by a sequence including zero-width joiners.

Bug: 25368653
Change-Id: I67ec0874cd1505f3c82ab91492ffc3d39a52fae6
上级 73abbd59
......@@ -64,6 +64,19 @@ bool isPureKiller(uint32_t c) {
|| c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B);
}
// Reports whether code point |c| is one of the emoji that may sit immediately before or
// after a zero-width joiner inside an emoji zwj sequence. The accepted set is taken from
// http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html
bool isZwjEmoji(uint32_t c) {
    switch (c) {
        case 0x2764:   // HEAVY BLACK HEART
        case 0x1F441:  // EYE
        case 0x1F466:  // BOY
        case 0x1F467:  // GIRL
        case 0x1F468:  // MAN
        case 0x1F469:  // WOMAN
        case 0x1F48B:  // KISS MARK
        case 0x1F5E8:  // LEFT SPEECH BUBBLE
            return true;
        default:
            return false;
    }
}
bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
size_t offset) {
// This implementation closely follows Unicode Standard Annex #29 on
......@@ -139,6 +152,19 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co
&& u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
return false;
}
// Tailoring: make emoji sequences with ZWJ a single grapheme cluster.
// The break is suppressed only when c1 is ZERO WIDTH JOINER (U+200D), c2 is an emoji that can
// take part in a zwj sequence, and there is still text before offset_back to inspect.
// NOTE(review): c1, c2 and offset_back are set outside this hunk; presumably c1/c2 are the code
// points just before/after the candidate break and offset_back points at the start of c1 — confirm.
if (c1 == 0x200D && isZwjEmoji(c2) && offset_back > start) {
// look at character before ZWJ to see that both can participate in an emoji zwj sequence
uint32_t c0 = 0;
// U16_PREV (ICU macro) moves offset_back back over one code point and stores it in c0
U16_PREV(buf, start, offset_back, c0);
if (c0 == 0xFE0F && offset_back > start) {
// skip over emoji variation selector (U+FE0F) and read the code point before it instead
U16_PREV(buf, start, offset_back, c0);
}
if (isZwjEmoji(c0)) {
// zwj-sequence emoji on both sides of the ZWJ: do not report a grapheme break
return false;
}
}
// Rule GB10, Any ÷ Any
return true;
}
......
......@@ -119,6 +119,21 @@ TEST(GraphemeBreak, tailoring) {
EXPECT_FALSE(IsBreak("U+0915 U+094D | U+0915")); // Devanagari ka+virama+ka
EXPECT_FALSE(IsBreak("U+0E01 | U+0E3A U+0E01")); // thai phinthu = pure killer
EXPECT_TRUE(IsBreak("U+0E01 U+0E3A | U+0E01")); // thai phinthu = pure killer
// suppress grapheme breaks in zwj emoji sequences, see
// http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html
EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468"));
EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468"));
EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468"));
EXPECT_FALSE(IsBreak("U+1F468 U+200D | U+1F469 U+200D U+1F466"));
EXPECT_FALSE(IsBreak("U+1F468 U+200D U+1F469 U+200D | U+1F466"));
EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F469 U+200D U+1F467 U+200D U+1F466"));
EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D | U+1F467 U+200D U+1F466"));
EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466"));
EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8"));
// ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break
EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764"));
}
TEST(GraphemeBreak, offsets) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册