Refine hyphenation around punctuation

Implement a WordBreaker that defines our concept of valid word boundaries, customizing the ICU behavior. Currently, we suppress line breaks at soft hyphens (these are handled specially). Also, the new WordBreaker class has methods that determine the start and end of the word (punctuation stripped) for the purpose of hyphenation. This patch, in its current form, doesn't handle email addresses and URLs specially, but the WordBreaker class is the correct place to do so. Also, special case handling of hyphens and dashes is still done in LineBreaker, but all of that should be moved to WordBreaker. Bug: 20126487 Bug: 20566159 Change-Id: I492cbad963f9b74a2915f010dad46bb91f97b2fe

Refine hyphenation around punctuation
Implement a WordBreaker that defines our concept of valid word boundaries, customizing the ICU behavior. Currently, we suppress line breaks at soft hyphens (these are handled specially). Also, the new WordBreaker class has methods that determine the start and end of the word (punctuation stripped) for the purpose of hyphenation. This patch, in its current form, doesn't handle email addresses and URLs specially, but the WordBreaker class is the correct place to do so. Also, special case handling of hyphens and dashes is still done in LineBreaker, but all of that should be moved to WordBreaker. Bug: 20126487 Bug: 20566159 Change-Id: I492cbad963f9b74a2915f010dad46bb91f97b2fe
57b6dae9 · Raph Levien · 070633ad · 57b6dae9 · 57b6dae9 · 57b6dae9
7 changed files
--- a/include/minikin/LineBreaker.h
+++ b/include/minikin/LineBreaker.h
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <vector>
 #include "minikin/Hyphenator.h"
+#include "minikin/WordBreaker.h"

 namespace android {

@@ -102,11 +103,6 @@ class LineBreaker {
    public:
        const static int kTab_Shift = 29;  // keep synchronized with TAB_MASK in StaticLayout.java

-        ~LineBreaker() {
-            utext_close(&mUText);
-            delete mBreakIterator;
-        }
-
        // Note: Locale persists across multiple invocations (it is not cleaned up by finish()),
        // explicitly to avoid the cost of creating ICU BreakIterator objects. It should always
        // be set on the first invocation, but callers are encouraged not to call again unless
@@ -214,8 +210,7 @@ class LineBreaker {

        void finishBreaksOptimal();

-        icu::BreakIterator* mBreakIterator = nullptr;
-        UText mUText = UTEXT_INITIALIZER;
+        WordBreaker mWordBreaker;
        std::vector<uint16_t>mTextBuf;
        std::vector<float>mCharWidths;


--- a/include/minikin/WordBreaker.h
+++ b/include/minikin/WordBreaker.h
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A wrapper around ICU's line break iterator, that gives customized line
+ * break opportunities, as well as identifying words for the purpose of
+ * hyphenation.
+ */
+
+#ifndef MINIKIN_WORD_BREAKER_H
+#define MINIKIN_WORD_BREAKER_H
+
+#include "unicode/brkiter.h"
+#include <memory>
+
+namespace android {
+
+class WordBreaker {
+public:
+    // Closes the UText and drops the text pointer (same as calling finish()).
+    ~WordBreaker() {
+        finish();
+    }
+
+    // Create the ICU line break iterator for the given locale. If text has already been set,
+    // the new iterator is attached to it and iteration resumes after the current offset.
+    void setLocale(const icu::Locale& locale);
+
+    // Start iterating over a UTF-16 buffer of `size` code units, beginning at offset 0.
+    // Only the pointer is stored, so the buffer must stay valid until finish() or the next
+    // call to setText(). setLocale() must have been called first.
+    void setText(const uint16_t* data, size_t size);
+
+    // Advance iterator to next word break. Return offset, or -1 if EOT. Break opportunities
+    // that ICU reports directly after a soft hyphen (U+00AD) are skipped, except at the very
+    // end of the text.
+    ssize_t next();
+
+    // Current offset of iterator, equal to 0 at BOT or last return from next()
+    ssize_t current() const;
+
+    // After calling next(), wordStart() and wordEnd() are offsets defining the previous
+    // word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
+    // wordStart() skips leading characters in the OP and QU line breaking classes.
+    ssize_t wordStart() const;
+
+    // wordEnd() skips trailing space separators and punctuation.
+    ssize_t wordEnd() const;
+
+    // Release the text. The locale's break iterator is kept for reuse.
+    void finish();
+
+private:
+    std::unique_ptr<icu::BreakIterator> mBreakIterator;
+    UText mUText = UTEXT_INITIALIZER;
+    const uint16_t* mText = nullptr;
+    // The scalar state below is given defined values up front so that current(),
+    // wordStart() and wordEnd() return 0 rather than indeterminate values when they are
+    // called before setText().
+    size_t mTextSize = 0;
+    ssize_t mLast = 0;
+    ssize_t mCurrent = 0;
+    bool mIteratorWasReset = false;
+};
+
+}  // namespace android
+
+#endif  // MINIKIN_WORD_BREAKER_H
--- a/libs/minikin/Android.mk
+++ b/libs/minikin/Android.mk
@@ -33,7 +33,8 @@ minikin_src_files := \
    MinikinInternal.cpp \
    MinikinRefCounted.cpp \
    MinikinFontFreeType.cpp \
-    SparseBitSet.cpp
+    SparseBitSet.cpp \
+    WordBreaker.cpp

 minikin_c_includes := \
    external/harfbuzz_ng/src \

--- a/libs/minikin/LineBreaker.cpp
+++ b/libs/minikin/LineBreaker.cpp
@@ -29,7 +29,6 @@ using std::vector;
 namespace android {

 const int CHAR_TAB = 0x0009;
-const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;

 // Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
 // constants are larger than any reasonable actual width score.
@@ -55,23 +54,16 @@ const size_t LONGEST_HYPHENATED_WORD = 45;
 const size_t MAX_TEXT_BUF_RETAIN = 32678;

 void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
-    delete mBreakIterator;
-    UErrorCode status = U_ZERO_ERROR;
-    mBreakIterator = icu::BreakIterator::createLineInstance(locale, status);
-    // TODO: check status
+    mWordBreaker.setLocale(locale);

-    // TODO: load actual resource dependent on locale; letting Minikin do it is a hack
    mHyphenator = hyphenator;
 }

 void LineBreaker::setText() {
-    UErrorCode status = U_ZERO_ERROR;
-    utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status);
-    mBreakIterator->setText(&mUText, status);
-    mBreakIterator->first();
+    mWordBreaker.setText(mTextBuf.data(), mTextBuf.size());

    // handle initial break here because addStyleRun may never be called
-    mBreakIterator->next();
+    mWordBreaker.next();
    mCandidates.clear();
    Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
    mCandidates.push_back(cand);
@@ -151,8 +143,8 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
        mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
    }

-    size_t current = (size_t)mBreakIterator->current();
-    size_t wordEnd = start;
+    size_t current = (size_t)mWordBreaker.current();
+    size_t afterWord = start;
    size_t lastBreak = start;
    ParaWidth lastBreakWidth = mWidth;
    ParaWidth postBreak = mWidth;
@@ -170,58 +162,56 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
            mWidth += mCharWidths[i];
            if (!isLineEndSpace(c)) {
                postBreak = mWidth;
-                wordEnd = i + 1;
+                afterWord = i + 1;
            }
        }
        if (i + 1 == current) {
-            // Override ICU's treatment of soft hyphen as a break opportunity, because we want it
-            // to be a hyphen break, with penalty and drawing behavior.
-            if (c != CHAR_SOFT_HYPHEN) {
-                // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
-                // we can pass the whole word down to Hyphenator like the soft hyphen case.
-                bool wordEndsInHyphen = isLineBreakingHyphen(c);
-                if (paint != nullptr && mHyphenator != nullptr &&
-                        mHyphenationFrequency != kHyphenationFrequency_None &&
-                        !wordEndsInHyphen && !temporarilySkipHyphenation &&
-                        wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) {
-                    mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak);
-    #if VERBOSE_DEBUG
-                    std::string hyphenatedString;
-                    for (size_t j = lastBreak; j < wordEnd; j++) {
-                        if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-');
-                        // Note: only works with ASCII, should do UTF-8 conversion here
-                        hyphenatedString.push_back(buffer()[j]);
-                    }
-                    ALOGD("hyphenated string: %s", hyphenatedString.c_str());
-    #endif
-
-                    // measure hyphenated substrings
-                    for (size_t j = lastBreak; j < wordEnd; j++) {
-                        uint8_t hyph = mHyphBuf[j - lastBreak];
-                        if (hyph) {
-                            paint->hyphenEdit = hyph;
-                            layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
-                                    mTextBuf.size(), bidiFlags, style, *paint);
-                            ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
-                            paint->hyphenEdit = 0;
-                            layout.doLayout(mTextBuf.data(), j, wordEnd - j,
-                                    mTextBuf.size(), bidiFlags, style, *paint);
-                            ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
-                            addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
-                        }
-                    }
+            // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
+            // we can pass the whole word down to Hyphenator like the soft hyphen case.
+            bool wordEndsInHyphen = isLineBreakingHyphen(c);
+            size_t wordStart = mWordBreaker.wordStart();
+            size_t wordEnd = mWordBreaker.wordEnd();
+            if (paint != nullptr && mHyphenator != nullptr &&
+                    mHyphenationFrequency != kHyphenationFrequency_None &&
+                    !wordEndsInHyphen && !temporarilySkipHyphenation &&
+                    wordEnd > wordStart && wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) {
+                mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart);
+#if VERBOSE_DEBUG
+                std::string hyphenatedString;
+                for (size_t j = wordStart; j < wordEnd; j++) {
+                    if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-');
+                    // Note: only works with ASCII, should do UTF-8 conversion here
+                    hyphenatedString.push_back(buffer()[j]);
                }
-                // Skip hyphenating the next word if and only if the present word ends in a hyphen
-                temporarilySkipHyphenation = wordEndsInHyphen;
+                ALOGD("hyphenated string: %s", hyphenatedString.c_str());
+#endif

-                // Skip break for zero-width characters inside replacement span
-                if (paint != nullptr || current == end || mCharWidths[current] > 0) {
-                    addWordBreak(current, mWidth, postBreak, 0.0, 0);
+                // measure hyphenated substrings
+                for (size_t j = wordStart; j < wordEnd; j++) {
+                    uint8_t hyph = mHyphBuf[j - wordStart];
+                    if (hyph) {
+                        paint->hyphenEdit = hyph;
+                        layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
+                                mTextBuf.size(), bidiFlags, style, *paint);
+                        ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
+                        paint->hyphenEdit = 0;
+                        layout.doLayout(mTextBuf.data(), j, afterWord - j,
+                                mTextBuf.size(), bidiFlags, style, *paint);
+                        ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
+                        addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
+                    }
                }
-                lastBreak = current;
-                lastBreakWidth = mWidth;
            }
-            current = (size_t)mBreakIterator->next();
+            // Skip hyphenating the next word if and only if the present word ends in a hyphen
+            temporarilySkipHyphenation = wordEndsInHyphen;
+
+            // Skip break for zero-width characters inside replacement span
+            if (paint != nullptr || current == end || mCharWidths[current] > 0) {
+                addWordBreak(current, mWidth, postBreak, 0.0, 0);
+            }
+            lastBreak = current;
+            lastBreakWidth = mWidth;
+            current = (size_t)mWordBreaker.next();
        }
    }

@@ -425,6 +415,7 @@ size_t LineBreaker::computeBreaks() {
 }

 void LineBreaker::finish() {
+    mWordBreaker.finish();
    mWidth = 0;
    mCandidates.clear();
    mBreaks.clear();

--- a/libs/minikin/WordBreaker.cpp
+++ b/libs/minikin/WordBreaker.cpp
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "Minikin"
+#include <cutils/log.h>
+
+#include "minikin/WordBreaker.h"
+
+#include <unicode/uchar.h>
+#include <unicode/utf16.h>
+
+namespace android {
+
+const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
+
+// Install a line break iterator for `locale`. If ICU cannot create one, the failure is logged
+// and the previously installed iterator (if any) is left untouched, instead of replacing it
+// with a null pointer that would be dereferenced below and in setText()/next().
+void WordBreaker::setLocale(const icu::Locale& locale) {
+    UErrorCode status = U_ZERO_ERROR;
+    std::unique_ptr<icu::BreakIterator> iterator(
+            icu::BreakIterator::createLineInstance(locale, status));
+    if (U_FAILURE(status) || iterator == nullptr) {
+        ALOGE("Failed to create line break iterator for locale %s: %s",
+                locale.getName(), u_errorName(status));
+        return;
+    }
+    mBreakIterator.reset(iterator.release());
+    if (mText != nullptr) {
+        // Text is already active: attach it to the fresh iterator
+        mBreakIterator->setText(&mUText, status);
+        if (U_FAILURE(status)) {
+            ALOGE("Failed to set text on line break iterator: %s", u_errorName(status));
+        }
+    }
+    // The new iterator is positioned at the start, not at mCurrent; next() uses this flag to
+    // resynchronize with following(mCurrent).
+    mIteratorWasReset = true;
+}
+
+// Begin iterating over a new UTF-16 buffer, starting from offset 0.
+void WordBreaker::setText(const uint16_t* data, size_t size) {
+    // Rewind our own bookkeeping to the beginning of the new text
+    mCurrent = 0;
+    mLast = 0;
+    mIteratorWasReset = false;
+    mTextSize = size;
+    mText = data;
+
+    // Open the UText over the buffer and point the ICU iterator at its start
+    UErrorCode errorCode = U_ZERO_ERROR;
+    utext_openUChars(&mUText, data, size, &errorCode);
+    mBreakIterator->setText(&mUText, errorCode);
+    mBreakIterator->first();
+}
+
+// Offset of the most recent break: 0 right after setText(), otherwise whatever the last
+// call to next() returned (including -1 once the end of text has been passed).
+ssize_t WordBreaker::current() const {
+    return mCurrent;
+}
+
+// Move to the next break opportunity and return its offset (ICU's DONE value at end of text).
+// The previous offset is remembered in mLast so wordStart()/wordEnd() can inspect the span.
+ssize_t WordBreaker::next() {
+    mLast = mCurrent;
+    int32_t breakOffset;
+    for (;;) {
+        if (mIteratorWasReset) {
+            // The iterator was replaced by setLocale(); resume from our own position
+            breakOffset = mBreakIterator->following(mCurrent);
+            mIteratorWasReset = false;
+        } else {
+            breakOffset = mBreakIterator->next();
+        }
+        const bool atEndOfText = breakOffset == icu::BreakIterator::DONE
+                || (size_t)breakOffset == mTextSize;
+        // Accept the break unless it sits directly after a soft hyphen inside the text
+        if (atEndOfText || mText[breakOffset - 1] != CHAR_SOFT_HYPHEN) {
+            break;
+        }
+    }
+    mCurrent = (ssize_t)breakOffset;
+    return mCurrent;
+}
+
+// Offset of the first code unit of the previous word, after skipping leading punctuation.
+// Leading punctuation is defined as the OP and QU line breaking classes, see UAX #14.
+ssize_t WordBreaker::wordStart() const {
+    ssize_t pos = mLast;
+    while (pos < mCurrent) {
+        // Decode one code point starting at pos without committing to the advance yet
+        ssize_t afterChar = pos;
+        UChar32 codePoint;
+        U16_NEXT(mText, afterChar, mCurrent, codePoint);
+        const int32_t lineBreakClass = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK);
+        const bool isLeadingPunct = lineBreakClass == U_LB_OPEN_PUNCTUATION
+                || lineBreakClass == U_LB_QUOTATION;
+        if (!isLeadingPunct) {
+            return pos;
+        }
+        pos = afterChar;
+    }
+    return pos;
+}
+
+// Offset just past the last code unit of the previous word, after dropping trailing
+// space separators and punctuation.
+ssize_t WordBreaker::wordEnd() const {
+    ssize_t pos = mCurrent;
+    while (pos > mLast) {
+        // Decode the code point that ends at pos without committing to the step back yet
+        ssize_t beforeChar = pos;
+        UChar32 codePoint;
+        U16_PREV(mText, mLast, beforeChar, codePoint);
+        const int32_t categoryMask = U_GET_GC_MASK(codePoint);
+        const bool isSpaceOrPunct = (categoryMask & (U_GC_ZS_MASK | U_GC_P_MASK)) != 0;
+        if (!isSpaceOrPunct) {
+            return pos;
+        }
+        pos = beforeChar;
+    }
+    return pos;
+}
+
+// Release the current text. Also invoked from the destructor, so it may run more than once.
+void WordBreaker::finish() {
+    // Note: calling utext_close multiple times is safe
+    utext_close(&mUText);
+    mText = nullptr;
+}
+
+}  // namespace android
--- a/tests/Android.mk
+++ b/tests/Android.mk
@@ -79,7 +79,8 @@ LOCAL_SRC_FILES += \
    MinikinFontForTest.cpp \
    GraphemeBreakTests.cpp \
    LayoutUtilsTest.cpp \
-    UnicodeUtils.cpp
+    UnicodeUtils.cpp \
+    WordBreakerTests.cpp

 LOCAL_C_INCLUDES := \
    $(LOCAL_PATH)/../libs/minikin/ \

--- a/tests/WordBreakerTests.cpp
+++ b/tests/WordBreakerTests.cpp
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include "ICUTestBase.h"
+#include "UnicodeUtils.h"
+#include <minikin/WordBreaker.h>
+#include <unicode/locid.h>
+#include <unicode/uclean.h>
+#include <unicode/udata.h>
+
+#define LOG_TAG "Minikin"
+#include <cutils/log.h>
+
+#ifndef NELEM
+#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
+#endif
+
+using namespace android;
+
+typedef ICUTestBase WordBreakerTest;
+
+// Two plain words separated by a space: breaks after "hello " and at end of text.
+TEST_F(WordBreakerTest, basic) {
+    uint16_t text[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
+    const ssize_t textEnd = NELEM(text);
+    WordBreaker wb;
+    wb.setLocale(icu::Locale::getEnglish());
+    wb.setText(text, NELEM(text));
+    EXPECT_EQ(0, wb.current());
+
+    // First break: after "hello ", word is "hello"
+    EXPECT_EQ(6, wb.next());
+    EXPECT_EQ(0, wb.wordStart());
+    EXPECT_EQ(5, wb.wordEnd());
+    EXPECT_EQ(6, wb.current());
+
+    // Second break: end of text, word is "world"
+    EXPECT_EQ(textEnd, wb.next());
+    EXPECT_EQ(6, wb.wordStart());
+    EXPECT_EQ(11, wb.wordEnd());
+    EXPECT_EQ(11, wb.current());
+}
+
+// A soft hyphen inside a word must not be reported as a break opportunity.
+TEST_F(WordBreakerTest, softHyphen) {
+    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
+    const ssize_t textEnd = NELEM(text);
+    WordBreaker wb;
+    wb.setLocale(icu::Locale::getEnglish());
+    wb.setText(text, NELEM(text));
+    EXPECT_EQ(0, wb.current());
+
+    // First break: after "hel{SOFT HYPHEN}lo ", word is "hel{SOFT HYPHEN}lo"
+    EXPECT_EQ(7, wb.next());
+    EXPECT_EQ(0, wb.wordStart());
+    EXPECT_EQ(6, wb.wordEnd());
+
+    // Second break: end of text, word is "world"
+    EXPECT_EQ(textEnd, wb.next());
+    EXPECT_EQ(7, wb.wordStart());
+    EXPECT_EQ(12, wb.wordEnd());
+}
+
+// Leading and trailing punctuation is excluded from the word extents.
+TEST_F(WordBreakerTest, punct) {
+    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
+        '!', '!'};
+    const ssize_t textEnd = NELEM(text);
+    WordBreaker wb;
+    wb.setLocale(icu::Locale::getEnglish());
+    wb.setText(text, NELEM(text));
+    EXPECT_EQ(0, wb.current());
+
+    // First break: after "¡¡hello, ", word is "hello"
+    EXPECT_EQ(9, wb.next());
+    EXPECT_EQ(2, wb.wordStart());
+    EXPECT_EQ(7, wb.wordEnd());
+
+    // Second break: end of text, word is "world"
+    EXPECT_EQ(textEnd, wb.next());
+    EXPECT_EQ(9, wb.wordStart());
+    EXPECT_EQ(14, wb.wordEnd());
+}