提交 8bdd9b94 编写于 作者: R Roozbeh Pournader

Refactor WordBreaker

Refactor WordBreaker to make it ready for more complex behavior.

Test: existing unit tests continue to pass
Change-Id: Ife758f3e2cf48922ab56109e6c5d3cffa3673feb
上级 f698f16c
......@@ -55,6 +55,10 @@ public:
void finish();
private:
int32_t iteratorNext();
void detectEmailOrUrl();
ssize_t findNextBreakInEmailOrUrl();
std::unique_ptr<icu::BreakIterator> mBreakIterator;
UText mUText = UTEXT_INITIALIZER;
const uint16_t* mText = nullptr;
......
......@@ -58,14 +58,6 @@ ssize_t WordBreaker::current() const {
return mCurrent;
}
// States of the forward scan that looks for an email address or a URL.
// Kept as an unscoped enum because the scanning code names the enumerators directly.
enum ScanState {
    START = 0,                  // initial state of a scan
    SAW_AT = 1,                 // one of the two states that mark the text as email/URL
    SAW_COLON = 2,
    SAW_COLON_SLASH = 3,
    SAW_COLON_SLASH_SLASH = 4,  // the other state that marks the text as email/URL
};
/**
* Determine whether a line break at position i within the buffer buf is valid. This
* represents customization beyond the ICU behavior, because plain ICU provides some
......@@ -120,6 +112,22 @@ static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
return true;
}
// Advances the ICU break iterator to the next break we are willing to report.
//
// If the iterator was reset since the last call, the search resumes with
// following(mCurrent) and the reset flag is cleared; otherwise it simply
// continues with next(). Candidates are skipped until one of these holds:
//   - the iterator reports icu::BreakIterator::DONE,
//   - the candidate equals mTextSize,
//   - isBreakValid() accepts the candidate.
// Returns that candidate (which may be icu::BreakIterator::DONE).
int32_t WordBreaker::iteratorNext() {
    for (;;) {
        int32_t candidate;
        if (mIteratorWasReset) {
            candidate = mBreakIterator->following(mCurrent);
            mIteratorWasReset = false;
        } else {
            candidate = mBreakIterator->next();
        }

        // The checks stay in this order so isBreakValid() is only consulted
        // for offsets that are neither DONE nor the end of the text.
        if (candidate == icu::BreakIterator::DONE) {
            return candidate;
        }
        if (static_cast<size_t>(candidate) == mTextSize) {
            return candidate;
        }
        if (isBreakValid(mText, mTextSize, candidate)) {
            return candidate;
        }
    }
}
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
static bool breakAfter(uint16_t c) {
return c == ':' || c == '=' || c == '&';
......@@ -131,9 +139,15 @@ static bool breakBefore(uint16_t c) {
|| c == '%' || c == '=' || c == '&';
}
ssize_t WordBreaker::next() {
mLast = mCurrent;
// States of the forward scan that detectEmailOrUrl() runs over the text.
// Kept as an unscoped enum because the scanning code names the enumerators directly.
enum ScanState {
    START = 0,                  // initial state of a scan
    SAW_AT = 1,                 // one of the two states that set mInEmailOrUrl
    SAW_COLON = 2,
    SAW_COLON_SLASH = 3,
    SAW_COLON_SLASH_SLASH = 4,  // the other state that sets mInEmailOrUrl
};
void WordBreaker::detectEmailOrUrl() {
// scan forward from current ICU position for email address or URL
if (mLast >= mScanOffset) {
ScanState state = START;
......@@ -158,6 +172,9 @@ ssize_t WordBreaker::next() {
}
if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
if (!mBreakIterator->isBoundary(i)) {
// If there are combining marks or such at the end of the URL or the email address,
// consider them a part of the URL or the email, and skip to the next actual
// boundary.
i = mBreakIterator->following(i);
}
mInEmailOrUrl = true;
......@@ -167,48 +184,46 @@ ssize_t WordBreaker::next() {
}
mScanOffset = i;
}
}
if (mInEmailOrUrl) {
// special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
uint16_t lastChar = mText[mLast];
ssize_t i;
for (i = mLast + 1; i < mScanOffset; i++) {
if (breakAfter(lastChar)) {
ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
// special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
uint16_t lastChar = mText[mLast];
ssize_t i;
for (i = mLast + 1; i < mScanOffset; i++) {
if (breakAfter(lastChar)) {
break;
}
// break after double slash
if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
break;
}
const uint16_t thisChar = mText[i];
// never break after hyphen
if (lastChar != '-') {
if (breakBefore(thisChar)) {
break;
}
// break after double slash
if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
// break before single slash
if (thisChar == '/' && lastChar != '/' &&
!(i + 1 < mScanOffset && mText[i + 1] == '/')) {
break;
}
uint16_t thisChar = mText[i];
// never break after hyphen
if (lastChar != '-') {
if (breakBefore(thisChar)) {
break;
}
// break before single slash
if (thisChar == '/' && lastChar != '/' &&
!(i + 1 < mScanOffset && mText[i + 1] == '/')) {
break;
}
}
lastChar = thisChar;
}
mCurrent = i;
return mCurrent;
lastChar = thisChar;
}
return i;
}
int32_t result;
do {
if (mIteratorWasReset) {
result = mBreakIterator->following(mCurrent);
mIteratorWasReset = false;
} else {
result = mBreakIterator->next();
}
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
&& !isBreakValid(mText, mTextSize, result));
mCurrent = (ssize_t)result;
// Moves to the next break opportunity and returns its offset.
//
// The previous position is remembered in mLast, then detectEmailOrUrl() is
// given a chance to update mInEmailOrUrl. Inside an email address or URL the
// break comes from findNextBreakInEmailOrUrl(); everywhere else it comes from
// the customized ICU iteration in iteratorNext().
ssize_t WordBreaker::next() {
    mLast = mCurrent;
    detectEmailOrUrl();

    mCurrent = mInEmailOrUrl ? findNextBreakInEmailOrUrl()
                             : static_cast<ssize_t>(iteratorNext());  // business as usual
    return mCurrent;
}
......@@ -221,7 +236,7 @@ ssize_t WordBreaker::wordStart() const {
UChar32 c;
ssize_t ix = result;
U16_NEXT(mText, ix, mCurrent, c);
int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
// strip leading punctuation, defined as OP and QU line breaking classes,
// see UAX #14
if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
......@@ -241,7 +256,7 @@ ssize_t WordBreaker::wordEnd() const {
UChar32 c;
ssize_t ix = result;
U16_PREV(mText, mLast, ix, c);
int32_t gc_mask = U_GET_GC_MASK(c);
const int32_t gc_mask = U_GET_GC_MASK(c);
// strip trailing space and punctuation
if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
break;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册