diff --git a/libs/minikin/GraphemeBreak.cpp b/libs/minikin/GraphemeBreak.cpp index eca74b178ce5982a99a79b1b0c84b6b164925577..7865d1d04586e3c4a92f07c976e9a66ff4166a5c 100644 --- a/libs/minikin/GraphemeBreak.cpp +++ b/libs/minikin/GraphemeBreak.cpp @@ -64,6 +64,19 @@ bool isPureKiller(uint32_t c) { || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B); } +// Returns true if the character appears before or after zwj in a zwj emoji sequence. See +// http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html +bool isZwjEmoji(uint32_t c) { + return (c == 0x2764 // HEAVY BLACK HEART + || c == 0x1F468 // MAN + || c == 0x1F469 // WOMAN + || c == 0x1F48B // KISS MARK + || c == 0x1F466 // BOY + || c == 0x1F467 // GIRL + || c == 0x1F441 // EYE + || c == 0x1F5E8); // LEFT SPEECH BUBBLE +} + bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count, size_t offset) { // This implementation closely follows Unicode Standard Annex #29 on @@ -139,6 +152,19 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) { return false; } + // Tailoring: make emoji sequences with ZWJ a single grapheme cluster + if (c1 == 0x200D && isZwjEmoji(c2) && offset_back > start) { + // look at character before ZWJ to see that both can participate in an emoji zwj sequence + uint32_t c0 = 0; + U16_PREV(buf, start, offset_back, c0); + if (c0 == 0xFE0F && offset_back > start) { + // skip over emoji variation selector + U16_PREV(buf, start, offset_back, c0); + } + if (isZwjEmoji(c0)) { + return false; + } + } // Rule GB10, Any ÷ Any return true; } diff --git a/tests/GraphemeBreakTests.cpp b/tests/GraphemeBreakTests.cpp index 6eda4da9fdb00a04829ba38500559d95f97ac9c9..d6746bc2b093d64c988640e5f100fdc752d45dc5 100644 --- a/tests/GraphemeBreakTests.cpp +++ b/tests/GraphemeBreakTests.cpp @@ -119,6 +119,21 @@ TEST(GraphemeBreak, tailoring) { 
EXPECT_FALSE(IsBreak("U+0915 U+094D | U+0915")); // Devanagari ka+virama+ka EXPECT_FALSE(IsBreak("U+0E01 | U+0E3A U+0E01")); // thai phinthu = pure killer EXPECT_TRUE(IsBreak("U+0E01 U+0E3A | U+0E01")); // thai phinthu = pure killer + + // suppress grapheme breaks in zwj emoji sequences, see + // http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html + EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468")); + EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468")); + EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468")); + EXPECT_FALSE(IsBreak("U+1F468 U+200D | U+1F469 U+200D U+1F466")); + EXPECT_FALSE(IsBreak("U+1F468 U+200D U+1F469 U+200D | U+1F466")); + EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F469 U+200D U+1F467 U+200D U+1F466")); + EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D | U+1F467 U+200D U+1F466")); + EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466")); + EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8")); + + // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break + EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764")); } TEST(GraphemeBreak, offsets) {