提交 57b6dae9 编写于 作者: R Raph Levien

Refine hyphenation around punctuation

Implement a WordBreaker that defines our concept of valid word
boundaries, customizing the ICU behavior. Currently, we suppress line
breaks at soft hyphens (these are handled specially). Also, the
new WordBreaker class has methods that determine the start and end
of the word (punctuation stripped) for the purpose of hyphenation.

This patch, in its current form, doesn't handle email addresses and
URLs specially, but the WordBreaker class is the correct place to do
so. Also, special case handling of hyphens and dashes is still done
in LineBreaker, but all of that should be moved to WordBreaker.

Bug: 20126487
Bug: 20566159
Change-Id: I492cbad963f9b74a2915f010dad46bb91f97b2fe
上级 070633ad
......@@ -27,6 +27,7 @@
#include <cmath>
#include <vector>
#include "minikin/Hyphenator.h"
#include "minikin/WordBreaker.h"
namespace android {
......@@ -102,11 +103,6 @@ class LineBreaker {
public:
const static int kTab_Shift = 29; // keep synchronized with TAB_MASK in StaticLayout.java
~LineBreaker() {
utext_close(&mUText);
delete mBreakIterator;
}
// Note: Locale persists across multiple invocations (it is not cleaned up by finish()),
// explicitly to avoid the cost of creating ICU BreakIterator objects. It should always
// be set on the first invocation, but callers are encouraged not to call again unless
......@@ -214,8 +210,7 @@ class LineBreaker {
void finishBreaksOptimal();
icu::BreakIterator* mBreakIterator = nullptr;
UText mUText = UTEXT_INITIALIZER;
WordBreaker mWordBreaker;
std::vector<uint16_t>mTextBuf;
std::vector<float>mCharWidths;
......
/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A wrapper around ICU's line break iterator, that gives customized line
* break opportunities, as well as identifying words for the purpose of
* hyphenation.
*/
#ifndef MINIKIN_WORD_BREAKER_H
#define MINIKIN_WORD_BREAKER_H
#include "unicode/brkiter.h"
#include <memory>
namespace android {
class WordBreaker {
public:
~WordBreaker() {
finish();
}
void setLocale(const icu::Locale& locale);
void setText(const uint16_t* data, size_t size);
// Advance iterator to next word break. Return offset, or -1 if EOT
ssize_t next();
// Current offset of iterator, equal to 0 at BOT or last return from next()
ssize_t current() const;
// After calling next(), wordStart() and wordEnd() are offsets defining the previous
// word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
ssize_t wordStart() const;
ssize_t wordEnd() const;
void finish();
private:
std::unique_ptr<icu::BreakIterator> mBreakIterator;
UText mUText = UTEXT_INITIALIZER;
const uint16_t* mText = nullptr;
size_t mTextSize;
ssize_t mLast;
ssize_t mCurrent;
bool mIteratorWasReset;
};
} // namespace
#endif // MINIKIN_WORD_BREAKER_H
......@@ -33,7 +33,8 @@ minikin_src_files := \
MinikinInternal.cpp \
MinikinRefCounted.cpp \
MinikinFontFreeType.cpp \
SparseBitSet.cpp
SparseBitSet.cpp \
WordBreaker.cpp
minikin_c_includes := \
external/harfbuzz_ng/src \
......
......@@ -29,7 +29,6 @@ using std::vector;
namespace android {
const int CHAR_TAB = 0x0009;
const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
// constants are larger than any reasonable actual width score.
......@@ -55,23 +54,16 @@ const size_t LONGEST_HYPHENATED_WORD = 45;
const size_t MAX_TEXT_BUF_RETAIN = 32678;
void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
delete mBreakIterator;
UErrorCode status = U_ZERO_ERROR;
mBreakIterator = icu::BreakIterator::createLineInstance(locale, status);
// TODO: check status
mWordBreaker.setLocale(locale);
// TODO: load actual resource dependent on locale; letting Minikin do it is a hack
mHyphenator = hyphenator;
}
void LineBreaker::setText() {
UErrorCode status = U_ZERO_ERROR;
utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status);
mBreakIterator->setText(&mUText, status);
mBreakIterator->first();
mWordBreaker.setText(mTextBuf.data(), mTextBuf.size());
// handle initial break here because addStyleRun may never be called
mBreakIterator->next();
mWordBreaker.next();
mCandidates.clear();
Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
mCandidates.push_back(cand);
......@@ -151,8 +143,8 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
}
size_t current = (size_t)mBreakIterator->current();
size_t wordEnd = start;
size_t current = (size_t)mWordBreaker.current();
size_t afterWord = start;
size_t lastBreak = start;
ParaWidth lastBreakWidth = mWidth;
ParaWidth postBreak = mWidth;
......@@ -170,58 +162,56 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
mWidth += mCharWidths[i];
if (!isLineEndSpace(c)) {
postBreak = mWidth;
wordEnd = i + 1;
afterWord = i + 1;
}
}
if (i + 1 == current) {
// Override ICU's treatment of soft hyphen as a break opportunity, because we want it
// to be a hyphen break, with penalty and drawing behavior.
if (c != CHAR_SOFT_HYPHEN) {
// TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
// we can pass the whole word down to Hyphenator like the soft hyphen case.
bool wordEndsInHyphen = isLineBreakingHyphen(c);
if (paint != nullptr && mHyphenator != nullptr &&
mHyphenationFrequency != kHyphenationFrequency_None &&
!wordEndsInHyphen && !temporarilySkipHyphenation &&
wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) {
mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak);
#if VERBOSE_DEBUG
std::string hyphenatedString;
for (size_t j = lastBreak; j < wordEnd; j++) {
if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-');
// Note: only works with ASCII, should do UTF-8 conversion here
hyphenatedString.push_back(buffer()[j]);
}
ALOGD("hyphenated string: %s", hyphenatedString.c_str());
#endif
// measure hyphenated substrings
for (size_t j = lastBreak; j < wordEnd; j++) {
uint8_t hyph = mHyphBuf[j - lastBreak];
if (hyph) {
paint->hyphenEdit = hyph;
layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
mTextBuf.size(), bidiFlags, style, *paint);
ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
paint->hyphenEdit = 0;
layout.doLayout(mTextBuf.data(), j, wordEnd - j,
mTextBuf.size(), bidiFlags, style, *paint);
ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
}
}
// TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
// we can pass the whole word down to Hyphenator like the soft hyphen case.
bool wordEndsInHyphen = isLineBreakingHyphen(c);
size_t wordStart = mWordBreaker.wordStart();
size_t wordEnd = mWordBreaker.wordEnd();
if (paint != nullptr && mHyphenator != nullptr &&
mHyphenationFrequency != kHyphenationFrequency_None &&
!wordEndsInHyphen && !temporarilySkipHyphenation &&
wordEnd > wordStart && wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) {
mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart);
#if VERBOSE_DEBUG
std::string hyphenatedString;
for (size_t j = wordStart; j < wordEnd; j++) {
if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-');
// Note: only works with ASCII, should do UTF-8 conversion here
hyphenatedString.push_back(buffer()[j]);
}
// Skip hyphenating the next word if and only if the present word ends in a hyphen
temporarilySkipHyphenation = wordEndsInHyphen;
ALOGD("hyphenated string: %s", hyphenatedString.c_str());
#endif
// Skip break for zero-width characters inside replacement span
if (paint != nullptr || current == end || mCharWidths[current] > 0) {
addWordBreak(current, mWidth, postBreak, 0.0, 0);
// measure hyphenated substrings
for (size_t j = wordStart; j < wordEnd; j++) {
uint8_t hyph = mHyphBuf[j - wordStart];
if (hyph) {
paint->hyphenEdit = hyph;
layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
mTextBuf.size(), bidiFlags, style, *paint);
ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
paint->hyphenEdit = 0;
layout.doLayout(mTextBuf.data(), j, afterWord - j,
mTextBuf.size(), bidiFlags, style, *paint);
ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
}
}
lastBreak = current;
lastBreakWidth = mWidth;
}
current = (size_t)mBreakIterator->next();
// Skip hyphenating the next word if and only if the present word ends in a hyphen
temporarilySkipHyphenation = wordEndsInHyphen;
// Skip break for zero-width characters inside replacement span
if (paint != nullptr || current == end || mCharWidths[current] > 0) {
addWordBreak(current, mWidth, postBreak, 0.0, 0);
}
lastBreak = current;
lastBreakWidth = mWidth;
current = (size_t)mWordBreaker.next();
}
}
......@@ -425,6 +415,7 @@ size_t LineBreaker::computeBreaks() {
}
void LineBreaker::finish() {
mWordBreaker.finish();
mWidth = 0;
mCandidates.clear();
mBreaks.clear();
......
/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define LOG_TAG "Minikin"
#include <cutils/log.h>
#include "minikin/WordBreaker.h"
#include <unicode/uchar.h>
#include <unicode/utf16.h>
namespace android {
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
void WordBreaker::setLocale(const icu::Locale& locale) {
UErrorCode status = U_ZERO_ERROR;
mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
// TODO: handle failure status
if (mText != nullptr) {
mBreakIterator->setText(&mUText, status);
}
mIteratorWasReset = true;
}
void WordBreaker::setText(const uint16_t* data, size_t size) {
mText = data;
mTextSize = size;
mIteratorWasReset = false;
mLast = 0;
mCurrent = 0;
UErrorCode status = U_ZERO_ERROR;
utext_openUChars(&mUText, data, size, &status);
mBreakIterator->setText(&mUText, status);
mBreakIterator->first();
}
ssize_t WordBreaker::current() const {
return mCurrent;
}
ssize_t WordBreaker::next() {
int32_t result;
mLast = mCurrent;
do {
if (mIteratorWasReset) {
result = mBreakIterator->following(mCurrent);
mIteratorWasReset = false;
} else {
result = mBreakIterator->next();
}
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
&& mText[result - 1] == CHAR_SOFT_HYPHEN);
mCurrent = (ssize_t)result;
return mCurrent;
}
ssize_t WordBreaker::wordStart() const {
ssize_t result = mLast;
while (result < mCurrent) {
UChar32 c;
ssize_t ix = result;
U16_NEXT(mText, ix, mCurrent, c);
int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
// strip leading punctuation, defined as OP and QU line breaking classes,
// see UAX #14
if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
break;
}
result = ix;
}
return result;
}
ssize_t WordBreaker::wordEnd() const {
ssize_t result = mCurrent;
while (result > mLast) {
UChar32 c;
ssize_t ix = result;
U16_PREV(mText, mLast, ix, c);
int32_t gc_mask = U_GET_GC_MASK(c);
// strip trailing space and punctuation
if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
break;
}
result = ix;
}
return result;
}
void WordBreaker::finish() {
mText = nullptr;
// Note: calling utext_close multiply is safe
utext_close(&mUText);
}
} // namespace android
......@@ -79,7 +79,8 @@ LOCAL_SRC_FILES += \
MinikinFontForTest.cpp \
GraphemeBreakTests.cpp \
LayoutUtilsTest.cpp \
UnicodeUtils.cpp
UnicodeUtils.cpp \
WordBreakerTests.cpp
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/../libs/minikin/ \
......
/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include "ICUTestBase.h"
#include "UnicodeUtils.h"
#include <minikin/WordBreaker.h>
#include <unicode/locid.h>
#include <unicode/uclean.h>
#include <unicode/udata.h>
#define LOG_TAG "Minikin"
#include <cutils/log.h>
#ifndef NELEM
#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
#endif
using namespace android;
typedef ICUTestBase WordBreakerTest;
TEST_F(WordBreakerTest, basic) {
uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
WordBreaker breaker;
breaker.setLocale(icu::Locale::getEnglish());
breaker.setText(buf, NELEM(buf));
EXPECT_EQ(0, breaker.current());
EXPECT_EQ(6, breaker.next()); // after "hello "
EXPECT_EQ(0, breaker.wordStart()); // "hello"
EXPECT_EQ(5, breaker.wordEnd());
EXPECT_EQ(6, breaker.current());
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
EXPECT_EQ(6, breaker.wordStart()); // "world"
EXPECT_EQ(11, breaker.wordEnd());
EXPECT_EQ(11, breaker.current());
}
TEST_F(WordBreakerTest, softHyphen) {
uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
WordBreaker breaker;
breaker.setLocale(icu::Locale::getEnglish());
breaker.setText(buf, NELEM(buf));
EXPECT_EQ(0, breaker.current());
EXPECT_EQ(7, breaker.next()); // after "hel{SOFT HYPHEN}lo "
EXPECT_EQ(0, breaker.wordStart()); // "hel{SOFT HYPHEN}lo"
EXPECT_EQ(6, breaker.wordEnd());
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
EXPECT_EQ(7, breaker.wordStart()); // "world"
EXPECT_EQ(12, breaker.wordEnd());
}
TEST_F(WordBreakerTest, punct) {
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
'!', '!'};
WordBreaker breaker;
breaker.setLocale(icu::Locale::getEnglish());
breaker.setText(buf, NELEM(buf));
EXPECT_EQ(0, breaker.current());
EXPECT_EQ(9, breaker.next()); // after "¡¡hello, "
EXPECT_EQ(2, breaker.wordStart()); // "hello"
EXPECT_EQ(7, breaker.wordEnd());
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
EXPECT_EQ(9, breaker.wordStart()); // "world"
EXPECT_EQ(14, breaker.wordEnd());
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册