提交 30bf8a7c 编写于 作者: R Raph Levien 提交者: Android (Google) Code Review

Merge "Suppress linebreaks in emoji ZWJ sequences" into nyc-dev

......@@ -25,6 +25,7 @@
namespace android {
// U+00AD SOFT HYPHEN. isBreakValid() rejects a break whose preceding code unit is this value.
// NOTE(review): declared uint32_t while CHAR_ZWJ is uint16_t; both are compared against
// uint16_t code units in isBreakValid() — confirm whether the width difference is intended.
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
// U+200D ZERO WIDTH JOINER. isBreakValid() inspects the code point that follows it to
// detect emoji ZWJ sequences.
const uint16_t CHAR_ZWJ = 0x200D;
void WordBreaker::setLocale(const icu::Locale& locale) {
UErrorCode status = U_ZERO_ERROR;
......@@ -62,6 +63,32 @@ enum ScanState {
SAW_COLON_SLASH_SLASH,
};
/**
 * Determine whether a line break at position i within the buffer buf is valid. This
 * represents customization beyond the ICU behavior, because plain ICU provides some
 * line break opportunities that we don't want.
 *
 * Two kinds of break are rejected:
 *  - a break directly after a soft hyphen (U+00AD);
 *  - a break directly after a zero width joiner (U+200D) when the following code point is
 *    one of the emoji listed below, i.e. the break would split an emoji ZWJ sequence.
 *
 * @param codeUnit the UTF-16 code unit that precedes the candidate break (the caller in
 *                 this file passes buf[i - 1])
 * @param buf      the UTF-16 text being broken
 * @param bufEnd   number of code units in buf (one past the last valid index)
 * @param i        offset of the candidate break within buf
 * @return false if the break must be suppressed, true otherwise
 **/
static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
    if (codeUnit == CHAR_SOFT_HYPHEN) {
        return false;
    }
    // U16_NEXT reads buf[i] unconditionally, so only look ahead when a code unit actually
    // follows the joiner; a ZWJ at the very end of the buffer cannot start a sequence.
    if (codeUnit == CHAR_ZWJ && i < bufEnd) {
        // Possible emoji ZWJ sequence
        uint32_t next_codepoint;
        U16_NEXT(buf, i, bufEnd, next_codepoint);  // advances the local copy of i
        if (next_codepoint == 0x2764 ||   // HEAVY BLACK HEART
                next_codepoint == 0x1F466 ||  // BOY
                next_codepoint == 0x1F467 ||  // GIRL
                next_codepoint == 0x1F468 ||  // MAN
                next_codepoint == 0x1F469 ||  // WOMAN
                next_codepoint == 0x1F48B ||  // KISS MARK
                next_codepoint == 0x1F5E8) {  // LEFT SPEECH BUBBLE
            return false;
        }
    }
    return true;
}
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
static bool breakAfter(uint16_t c) {
return c == ':' || c == '=' || c == '&';
......@@ -149,7 +176,7 @@ ssize_t WordBreaker::next() {
result = mBreakIterator->next();
}
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
&& mText[result - 1] == CHAR_SOFT_HYPHEN);
&& !isBreakValid(mText[result - 1], mText, mTextSize, result));
mCurrent = (ssize_t)result;
return mCurrent;
}
......
......@@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) {
EXPECT_EQ(0, breaker.breakBadness());
}
// Checks that the breaker reports no break opportunity inside emoji ZWJ sequences: the only
// breaks expected are the boundaries between the three sequences and the end of the text.
TEST_F(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
        // U+1F468 MAN, ZWJ, U+2764 HEAVY BLACK HEART, ZWJ, U+1F468 MAN (7 code units)
        0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
        // U+1F469 WOMAN, ZWJ, U+2764 HEAVY BLACK HEART, ZWJ, U+1F48B KISS MARK, ZWJ,
        // U+1F469 WOMAN (10 code units)
        0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
        // U+1F441 EYE, ZWJ, U+1F5E8 LEFT SPEECH BUBBLE (5 code units)
        0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
    };

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // First break: after the man-heart-man sequence.
    EXPECT_EQ(7, wordBreaker.next());
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(7, wordBreaker.wordEnd());

    // Second break: after the woman-heart-kiss-woman sequence.
    EXPECT_EQ(17, wordBreaker.next());
    EXPECT_EQ(7, wordBreaker.wordStart());
    EXPECT_EQ(17, wordBreaker.wordEnd());

    // Final break: end of the text, after the eye-speech-bubble sequence.
    EXPECT_EQ((ssize_t)NELEM(text), wordBreaker.next());
    EXPECT_EQ(17, wordBreaker.wordStart());
    EXPECT_EQ(22, wordBreaker.wordEnd());
}
TEST_F(WordBreakerTest, punct) {
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
'!', '!'};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册