diff --git a/libs/minikin/WordBreaker.cpp b/libs/minikin/WordBreaker.cpp index ca69a50307ce52767ca5893ffbde0a83dfb600e0..ec84c39f9a7e3b605f3398e2a7102113535e7dd8 100644 --- a/libs/minikin/WordBreaker.cpp +++ b/libs/minikin/WordBreaker.cpp @@ -25,6 +25,7 @@ namespace android { const uint32_t CHAR_SOFT_HYPHEN = 0x00AD; +const uint16_t CHAR_ZWJ = 0x200D; void WordBreaker::setLocale(const icu::Locale& locale) { UErrorCode status = U_ZERO_ERROR; @@ -62,6 +63,32 @@ enum ScanState { SAW_COLON_SLASH_SLASH, }; +/** + * Determine whether a line break at position i within the buffer buf is valid, where codeUnit is the code unit the caller read immediately before i (buf[i - 1] at the call site) and bufEnd is the length of buf. This + * represents customization beyond the ICU behavior, because plain ICU provides some + * line break opportunities that we don't want: a break right after a soft hyphen, and a break right after a ZWJ that is followed by one of the emoji code points listed below. Returns false for those cases and true otherwise. The code point at buf[i] is decoded without a bounds check on i, so this assumes i != bufEnd, which the call site in WordBreaker::next() tests first. + **/ +static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) { + if (codeUnit == CHAR_SOFT_HYPHEN) { + return false; + } + if (codeUnit == CHAR_ZWJ) { + // Possible emoji ZWJ sequence: decode the code point that follows the ZWJ (i is a by-value parameter, so advancing it here is not visible to the caller) + uint32_t next_codepoint; + U16_NEXT(buf, i, bufEnd, next_codepoint); + if (next_codepoint == 0x2764 || // HEAVY BLACK HEART + next_codepoint == 0x1F466 || // BOY + next_codepoint == 0x1F467 || // GIRL + next_codepoint == 0x1F468 || // MAN + next_codepoint == 0x1F469 || // WOMAN + next_codepoint == 0x1F48B || // KISS MARK + next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE + return false; + } + } + return true; +} + // Chicago Manual of Style recommends breaking after these characters in URLs and email addresses static bool breakAfter(uint16_t c) { return c == ':' || c == '=' || c == '&'; @@ -149,7 +176,7 @@ ssize_t WordBreaker::next() { result = mBreakIterator->next(); } } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize - && mText[result - 1] == CHAR_SOFT_HYPHEN); + && !isBreakValid(mText[result - 1], mText, mTextSize, result)); mCurrent = (ssize_t)result; return mCurrent; } diff --git a/tests/WordBreakerTests.cpp b/tests/WordBreakerTests.cpp index 
9662b2f7a4f5cf03f690e41dde13c959bef875c6..6c5e4795c89ac060161785a21c1f8f8ad62dc80c 100644 --- a/tests/WordBreakerTests.cpp +++ b/tests/WordBreakerTests.cpp @@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) { EXPECT_EQ(0, breaker.breakBadness()); } +TEST_F(WordBreakerTest, zwjEmojiSequences) { + uint16_t buf[] = { + // man + zwj + heart + zwj + man (7 code units) + 0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68, + // woman + zwj + heart + zwj + kiss mark + zwj + woman (10 code units) + 0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69, + // eye + zwj + left speech bubble (5 code units) + 0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8, + }; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man + EXPECT_EQ(0, breaker.wordStart()); + EXPECT_EQ(7, breaker.wordEnd()); + EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + kiss mark + zwj + woman + EXPECT_EQ(7, breaker.wordStart()); + EXPECT_EQ(17, breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of text, after eye + zwj + left speech bubble + EXPECT_EQ(17, breaker.wordStart()); + EXPECT_EQ(22, breaker.wordEnd()); +} + TEST_F(WordBreakerTest, punct) { uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', '!', '!'};