/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include "ICUTestBase.h"
#include "UnicodeUtils.h"
#include <minikin/WordBreaker.h>
#include <unicode/locid.h>
#include <unicode/uclean.h>
#include <unicode/udata.h>

#define LOG_TAG "Minikin"
#include <cutils/log.h>

// Element count of a statically sized array. Only defined here when no earlier
// header has already provided NELEM.
#if !defined(NELEM)
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif

// Expands to two comma-separated values, U16_LEAD(codepoint) and U16_TRAIL(codepoint),
// so that a code point outside the BMP can be written as one entry inside a uint16_t
// array initializer. The expansion is deliberately not parenthesized.
#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)

34 35 36 37 38 39 40 41 42 43 44 45 46
using namespace android;

typedef ICUTestBase WordBreakerTest;

// Two ASCII words separated by a space. Each next() is expected to land after the word's
// trailing space (or at the end of the text), while wordStart()/wordEnd() bracket the word
// itself and breakBadness() stays 0. current() must track the last returned break.
TEST_F(WordBreakerTest, basic) {
    uint16_t buf[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(6, breaker.next());  // after "hello "
    EXPECT_EQ(0, breaker.wordStart());  // "hello"
    EXPECT_EQ(5, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(6, breaker.current());
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(6, breaker.wordStart());  // "world"
    EXPECT_EQ(11, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(11, breaker.current());
}

// A SOFT HYPHEN (U+00AD) inside "hello" must not produce a break of its own: the first
// break is expected after the following space, and the reported word range spans the
// whole "hel{SOFT HYPHEN}lo".
TEST_F(WordBreakerTest, softHyphen) {
    uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(7, breaker.next());  // after "hel{SOFT HYPHEN}lo "
    EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
    EXPECT_EQ(6, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(7, breaker.wordStart());  // "world"
    EXPECT_EQ(12, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}

72 73 74
TEST_F(WordBreakerTest, zwjEmojiSequences) {
    uint16_t buf[] = {
        // man + zwj + heart + zwj + man
75 76 77
        UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
        // woman + zwj + heart + zwj + kiss mark + zwj + woman
        UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
78
        // eye + zwj + left speech bubble
79
        UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
    };
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(7, breaker.wordEnd());
    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + woman
    EXPECT_EQ(7, breaker.wordStart());
    EXPECT_EQ(17, breaker.wordEnd());
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(17, breaker.wordStart());
    EXPECT_EQ(22, breaker.wordEnd());
}

96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
TEST_F(WordBreakerTest, emojiWithModifier) {
    uint16_t buf[] = {
        UTF16(0x1F466), UTF16(0x1F3FB),  // boy + type 1-2 fitzpatrick modifier
        0x270C, 0xFE0F, UTF16(0x1F3FF)  // victory hand + emoji style + type 6 fitzpatrick modifier
    };
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(4, breaker.next());  // after man + type 6 fitzpatrick modifier
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(4, breaker.wordEnd());
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(4, breaker.wordStart());
    EXPECT_EQ(8, breaker.wordEnd());
}

113 114 115 116 117 118 119 120 121 122
TEST_F(WordBreakerTest, punct) {
    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
        '!', '!'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(9, breaker.next());  // after "¡¡hello, "
    EXPECT_EQ(2, breaker.wordStart());  // "hello"
    EXPECT_EQ(7, breaker.wordEnd());
123
    EXPECT_EQ(0, breaker.breakBadness());
124 125 126
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(9, breaker.wordStart());  // "world"
    EXPECT_EQ(14, breaker.wordEnd());
127
    EXPECT_EQ(0, breaker.breakBadness());
128
}
129 130 131 132 133 134 135 136

// An email address followed by a space and a plain word. Inside the address the test
// expects a single break (after "foo@example") with breakBadness() 1 and no usable word
// range (wordStart() >= wordEnd()); the break after the address has badness 0, and the
// following "x" is reported as an ordinary word.
TEST_F(WordBreakerTest, email) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
        ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(11, breaker.next());  // after "foo@example"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(16, breaker.next());  // after ".com "
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(16, breaker.wordStart());  // "x"
    EXPECT_EQ(17, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}

// A "mailto:" URL followed by a space and a plain word. Breaks inside the URL (after
// "mailto:" and after "foo@example") are expected to carry breakBadness() 1 and no usable
// word range; the break after the URL has badness 0, and "x" is an ordinary word.
TEST_F(WordBreakerTest, mailto) {
    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
        'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(7, breaker.next());  // after "mailto:"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(18, breaker.next());  // after "foo@example"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(23, breaker.next());  // after ".com "
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(23, breaker.wordStart());  // "x"
    EXPECT_EQ(24, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}

171 172
// The current logic always places a line break after a detected email address or URL
// and an immediately following non-ASCII character.
173 174 175 176 177 178 179
TEST_F(WordBreakerTest, emailNonAscii) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
        0x4E00};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
180 181
    EXPECT_EQ(11, breaker.next());  // after "foo@example"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
182
    EXPECT_EQ(1, breaker.breakBadness());
183
    EXPECT_EQ(15, breaker.next());  // after ".com"
184
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
185
    EXPECT_EQ(0, breaker.breakBadness());
186 187 188
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(15, breaker.wordStart());  // "一"
    EXPECT_EQ(16, breaker.wordEnd());
189
    EXPECT_EQ(0, breaker.breakBadness());
190 191 192 193 194 195 196 197 198
}

// An email address whose last letter carries a combining mark (U+0303). No break is
// expected between ".com" and the combining mark: the break after the address lands at
// offset 17, after the mark and the following space.
TEST_F(WordBreakerTest, emailCombining) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
        0x0303, ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(11, breaker.next());  // after "foo@example"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(17, breaker.next());  // after ".com̃ "
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(17, breaker.wordStart());  // "x"
    EXPECT_EQ(18, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}

211 212 213 214 215 216 217 218 219
TEST_F(WordBreakerTest, lonelyAt) {
    uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(2, breaker.next());  // after "a "
    EXPECT_EQ(0, breaker.wordStart());  // "a"
    EXPECT_EQ(1, breaker.wordEnd());
220
    EXPECT_EQ(0, breaker.breakBadness());
221 222
    EXPECT_EQ(4, breaker.next());  // after "@ "
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
223
    EXPECT_EQ(0, breaker.breakBadness());
224 225 226
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(4, breaker.wordStart());  // "b"
    EXPECT_EQ(5, breaker.wordEnd());
227
    EXPECT_EQ(0, breaker.breakBadness());
228 229
}

230 231 232 233 234 235 236
TEST_F(WordBreakerTest, url) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
        '.', 'c', 'o', 'm', ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
237 238
    EXPECT_EQ(5, breaker.next());  // after "http:"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
239
    EXPECT_EQ(1, breaker.breakBadness());
240 241
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
242
    EXPECT_EQ(1, breaker.breakBadness());
243 244
    EXPECT_EQ(14, breaker.next());  // after "example"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
245
    EXPECT_EQ(1, breaker.breakBadness());
246
    EXPECT_EQ(19, breaker.next());  // after ".com "
247
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
248
    EXPECT_EQ(0, breaker.breakBadness());
249 250 251
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_EQ(19, breaker.wordStart());  // "x"
    EXPECT_EQ(20, breaker.wordEnd());
252
    EXPECT_EQ(0, breaker.breakBadness());
253
}
254 255 256 257 258 259 260 261 262 263 264

// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
// The URL below contains one of each special character exercised by the test. Every break
// inside the URL is expected to have breakBadness() 1 and no usable word range; only the
// final break at the end of the text has badness 0.
TEST_F(WordBreakerTest, urlBreakChars) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
        '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(5, breaker.next());  // after "http:"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(8, breaker.next());  // after "a"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(10, breaker.next());  // after ".b"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(11, breaker.next());  // after "/"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(13, breaker.next());  // after "~c"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(15, breaker.next());  // after ",d"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(17, breaker.next());  // after "-e"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(19, breaker.next());  // after "?f"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(20, breaker.next());  // after "="
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(21, breaker.next());  // after "g"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(22, breaker.next());  // after "&"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(23, breaker.next());  // after "h"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(25, breaker.next());  // after "#i"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(27, breaker.next());  // after "%j"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(29, breaker.next());  // after "_k"
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}

// URL containing "a-/b": after the breaks following "http:", "//" and "a", the next break
// expected is the end of the text, i.e. none directly after the hyphen. After every break
// the test requires wordStart() >= wordEnd().
TEST_F(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t url[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const ssize_t expectedBreaks[] = {
        5,                                 // after "http:"
        7,                                 // after "//"
        8,                                 // after "a"
        static_cast<ssize_t>(NELEM(url)),  // end
    };

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(url, NELEM(url));
    EXPECT_EQ(0, wb.current());

    for (size_t i = 0; i < NELEM(expectedBreaks); ++i) {
        EXPECT_EQ(expectedBreaks[i], wb.next());
        EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    }
}

// URL whose last character is '/': breaks are expected after "http:", "//" and "a", and
// then only at the end of the text. After every break the test requires
// wordStart() >= wordEnd().
TEST_F(WordBreakerTest, urlEndsWithSlash) {
    uint16_t url[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const ssize_t expectedBreaks[] = {
        5,                                 // after "http:"
        7,                                 // after "//"
        8,                                 // after "a"
        static_cast<ssize_t>(NELEM(url)),  // end
    };

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(url, NELEM(url));
    EXPECT_EQ(0, wb.current());

    for (size_t i = 0; i < NELEM(expectedBreaks); ++i) {
        EXPECT_EQ(expectedBreaks[i], wb.next());
        EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    }
}

// Text "/a@b": the very first next() is expected to return the end of the text, with
// wordStart() >= wordEnd() afterwards.
TEST_F(WordBreakerTest, emailStartsWithSlash) {
    uint16_t text[] = {'/', 'a', '@', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));

    EXPECT_EQ(0, wb.current());
    EXPECT_EQ(textEnd, wb.next());  // end
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
}