From 378d279bbf692195c4654e312dae854ab3be04cf Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Tue, 31 Jul 2012 21:36:16 -0400 Subject: [PATCH] Implement Unicode compatibility decompositions Based on patch from Philip Withnall. https://bugs.freedesktop.org/show_bug.cgi?id=41095 --- src/hb-glib.cc | 30 ++++++++++++++++++++ src/hb-icu.cc | 36 +++++++++++++++++++++++- src/hb-ot-shape-normalize.cc | 53 ++++++++++++++++++++++++------------ src/hb-unicode-private.hh | 1 + src/hb-unicode.cc | 27 +++++++++++++++++- src/hb-unicode.h | 37 ++++++++++++++++++++++++- test/api/hb-test.h | 1 + test/api/test-unicode.c | 50 ++++++++++++++++++++++++++++++++++ 8 files changed, 214 insertions(+), 21 deletions(-) diff --git a/src/hb-glib.cc b/src/hb-glib.cc index 6b655dd1..52463636 100644 --- a/src/hb-glib.cc +++ b/src/hb-glib.cc @@ -336,6 +336,36 @@ hb_glib_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, return ret; } +static unsigned int +hb_glib_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, + hb_codepoint_t u, + hb_codepoint_t *decomposed, + void *user_data HB_UNUSED) +{ +#if GLIB_CHECK_VERSION(2,29,12) + return g_unichar_fully_decompose (u, TRUE, decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN); +#endif + + /* If the user doesn't have GLib >= 2.29.12 we have to perform + * a round trip to UTF-8 and the associated memory management dance. */ + gchar utf8[6]; + gchar *utf8_decomposed, *c; + gsize utf8_len, utf8_decomposed_len, i; + + /* Convert @u to UTF-8 and normalise it in NFKD mode. This performs the compatibility decomposition. 
*/ + utf8_len = g_unichar_to_utf8 (u, utf8); + utf8_decomposed = g_utf8_normalize (utf8, utf8_len, G_NORMALIZE_NFKD); + utf8_decomposed_len = g_utf8_strlen (utf8_decomposed, -1); + + assert (utf8_decomposed_len <= HB_UNICODE_MAX_DECOMPOSITION_LEN); + + for (i = 0, c = utf8_decomposed; i < utf8_decomposed_len; i++, c = g_utf8_next_char (c)) + *decomposed++ = g_utf8_get_char (c); + + g_free (utf8_decomposed); + + return utf8_decomposed_len; +} extern HB_INTERNAL const hb_unicode_funcs_t _hb_glib_unicode_funcs; const hb_unicode_funcs_t _hb_glib_unicode_funcs = { diff --git a/src/hb-icu.cc b/src/hb-icu.cc index 491c1c87..dce6103c 100644 --- a/src/hb-icu.cc +++ b/src/hb-icu.cc @@ -207,7 +207,7 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, hb_codepoint_t *b, void *user_data HB_UNUSED) { - UChar utf16[2], normalized[20]; + UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; int len; hb_bool_t ret, err; UErrorCode icu_err; @@ -271,6 +271,40 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, return ret; } +static unsigned int +hb_icu_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs HB_UNUSED, + hb_codepoint_t u, + hb_codepoint_t *decomposed, + void *user_data HB_UNUSED) +{ + UChar utf16[2], normalized[2 * HB_UNICODE_MAX_DECOMPOSITION_LEN + 1]; + int len; + int32_t utf32_len; + hb_bool_t err; + UErrorCode icu_err; + + /* Copy @u into a UTF-16 array to be passed to ICU. */ + len = 0; + err = FALSE; + U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), u, err); + if (err) + return 0; + + /* Normalise the codepoint using NFKD mode. */ + icu_err = U_ZERO_ERROR; + len = unorm_normalize (utf16, len, UNORM_NFKD, 0, normalized, ARRAY_LENGTH (normalized), &icu_err); + if (icu_err) + return 0; + + /* Convert the decomposed form from UTF-16 to UTF-32. 
*/ + icu_err = U_ZERO_ERROR; + u_strToUTF32 ((UChar32*) decomposed, HB_UNICODE_MAX_DECOMPOSITION_LEN, &utf32_len, normalized, len, &icu_err); + if (icu_err) + return 0; + + return utf32_len; +} + extern HB_INTERNAL const hb_unicode_funcs_t _hb_icu_unicode_funcs; const hb_unicode_funcs_t _hb_icu_unicode_funcs = { diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc index d4b0b279..46c89ec1 100644 --- a/src/hb-ot-shape-normalize.cc +++ b/src/hb-ot-shape-normalize.cc @@ -62,7 +62,8 @@ * knowledge too. We need to provide assistance to the itemizer. * * - When a font does not support a character but supports its decomposition, - * well, use the decomposition. + * well, use the decomposition (preferring the canonical decomposition, but + * falling back to the compatibility decomposition if necessary). * * - The Indic shaper requests decomposed output. This will handle splitting * matra for the Indic shaper. @@ -111,29 +112,45 @@ decompose (hb_font_t *font, hb_buffer_t *buffer, return false; } -static void -decompose_current_glyph (hb_font_t *font, hb_buffer_t *buffer, - bool shortest) +static bool +decompose_compatibility (hb_font_t *font, hb_buffer_t *buffer, + hb_codepoint_t u) { - if (decompose (font, buffer, shortest, buffer->cur().codepoint)) - buffer->skip_glyph (); - else - buffer->next_glyph (); + unsigned int len, i; + hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN]; + + len = hb_unicode_decompose_compatibility (buffer->unicode, u, decomposed); + if (!len) + return false; + + hb_codepoint_t glyph; + for (i = 0; i < len; i++) + if (!hb_font_get_glyph (font, decomposed[i], 0, &glyph)) + return false; + + for (i = 0; i < len; i++) + output_glyph (buffer, decomposed[i]); + + return true; } static void -decompose_single_char_cluster (hb_font_t *font, hb_buffer_t *buffer, - bool will_recompose) +decompose_current_character (hb_font_t *font, hb_buffer_t *buffer, + bool shortest) { hb_codepoint_t glyph; - /* If recomposing and font 
supports this, we're good to go */ - if (will_recompose && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph)) { + /* Kind of a cute waterfall here... */ + if (shortest && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph)) + buffer->next_glyph (); + else if (decompose (font, buffer, shortest, buffer->cur().codepoint)) + buffer->skip_glyph (); + else if (!shortest && hb_font_get_glyph (font, buffer->cur().codepoint, 0, &glyph)) + buffer->next_glyph (); + else if (decompose_compatibility (font, buffer, buffer->cur().codepoint)) + buffer->skip_glyph (); + else buffer->next_glyph (); - return; - } - - decompose_current_glyph (font, buffer, will_recompose); } static void @@ -149,7 +166,7 @@ decompose_multi_char_cluster (hb_font_t *font, hb_buffer_t *buffer, } while (buffer->idx < end) - decompose_current_glyph (font, buffer, false); + decompose_current_character (font, buffer, false); } static int @@ -188,7 +205,7 @@ _hb_ot_shape_normalize (hb_font_t *font, hb_buffer_t *buffer, break; if (buffer->idx + 1 == end) - decompose_single_char_cluster (font, buffer, recompose); + decompose_current_character (font, buffer, recompose); else { decompose_multi_char_cluster (font, buffer, end); has_multichar_clusters = true; diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh index 1ce5adc6..ba791eb7 100644 --- a/src/hb-unicode-private.hh +++ b/src/hb-unicode-private.hh @@ -50,6 +50,7 @@ HB_UNICODE_FUNC_IMPLEMENT (script) \ HB_UNICODE_FUNC_IMPLEMENT (compose) \ HB_UNICODE_FUNC_IMPLEMENT (decompose) \ + HB_UNICODE_FUNC_IMPLEMENT (decompose_compatibility) \ /* ^--- Add new callbacks here */ /* Simple callbacks are those taking a hb_codepoint_t and returning a hb_codepoint_t */ diff --git a/src/hb-unicode.cc b/src/hb-unicode.cc index b05b290c..f300fedc 100644 --- a/src/hb-unicode.cc +++ b/src/hb-unicode.cc @@ -99,6 +99,15 @@ hb_unicode_decompose_nil (hb_unicode_funcs_t *ufuncs HB_UNUSED, } +static unsigned int +hb_unicode_decompose_compatibility_nil 
(hb_unicode_funcs_t *ufuncs HB_UNUSED, + hb_codepoint_t u HB_UNUSED, + hb_codepoint_t *decomposed HB_UNUSED, + void *user_data HB_UNUSED) +{ + return 0; +} + hb_unicode_funcs_t * hb_unicode_funcs_get_default (void) @@ -312,6 +321,23 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs, return ufuncs->func.decompose (ufuncs, ab, a, b, ufuncs->user_data.decompose); } +unsigned int +hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t u, + hb_codepoint_t *decomposed) +{ + unsigned int ret = ufuncs->func.decompose_compatibility (ufuncs, u, + decomposed, + ufuncs->user_data.decompose_compatibility); + if (ret == 1 && u == decomposed[0]) { + decomposed[0] = 0; + return 0; + } + + decomposed[ret] = 0; + + return ret; +} unsigned int @@ -380,4 +406,3 @@ _hb_unicode_modified_combining_class (hb_unicode_funcs_t *ufuncs, return c; } - diff --git a/src/hb-unicode.h b/src/hb-unicode.h index 808c6e16..2af2d674 100644 --- a/src/hb-unicode.h +++ b/src/hb-unicode.h @@ -1,7 +1,7 @@ /* * Copyright © 2009 Red Hat, Inc. * Copyright © 2011 Codethink Limited - * Copyright © 2011 Google, Inc. + * Copyright © 2011,2012 Google, Inc. * * This is part of HarfBuzz, a text shaping library. * @@ -122,6 +122,32 @@ typedef hb_bool_t (*hb_unicode_decompose_func_t) (hb_unicode_funcs_t *ufuncs, hb_codepoint_t *b, void *user_data); +/** + * hb_unicode_decompose_compatibility_func_t: + * @ufuncs: Unicode function structure + * @u: codepoint to decompose + * @decomposed: address of codepoint array (of length %HB_UNICODE_MAX_DECOMPOSITION_LEN) to write decomposition into + * @user_data: user data pointer as passed to hb_unicode_funcs_set_decompose_compatibility_func() + * + * Fully decompose @u to its Unicode compatibility decomposition. The codepoints of the decomposition will be written to @decomposed. + * The complete length of the decomposition will be returned. + * + * If @u has no compatibility decomposition, zero should be returned. 
+ * + * The Unicode standard guarantees that a buffer of length %HB_UNICODE_MAX_DECOMPOSITION_LEN codepoints will always be sufficient for any + * compatibility decomposition plus a terminating value of 0. Consequently, @decomposed must be allocated by the caller to be at least this length. Implementations + * of this function type must ensure that they do not write past the provided array. + * + * Return value: number of codepoints in the full compatibility decomposition of @u, or 0 if no decomposition available. + */ +typedef unsigned int (*hb_unicode_decompose_compatibility_func_t) (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t u, + hb_codepoint_t *decomposed, + void *user_data); + +/* See Unicode 6.1 for details on the maximum decomposition length. */ +#define HB_UNICODE_MAX_DECOMPOSITION_LEN (18+1) /* codepoints */ + /* setters */ void @@ -159,6 +185,10 @@ hb_unicode_funcs_set_decompose_func (hb_unicode_funcs_t *ufuncs, hb_unicode_decompose_func_t decompose_func, void *user_data, hb_destroy_func_t destroy); +void +hb_unicode_funcs_set_decompose_compatibility_func (hb_unicode_funcs_t *ufuncs, + hb_unicode_decompose_compatibility_func_t decompose_compatibility_func, + void *user_data, hb_destroy_func_t destroy); /* accessors */ @@ -193,6 +223,11 @@ hb_unicode_decompose (hb_unicode_funcs_t *ufuncs, hb_codepoint_t *a, hb_codepoint_t *b); +unsigned int +hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t u, + hb_codepoint_t *decomposed); + HB_END_DECLS #endif /* HB_UNICODE_H */ diff --git a/test/api/hb-test.h b/test/api/hb-test.h index d569757a..8655f41f 100644 --- a/test/api/hb-test.h +++ b/test/api/hb-test.h @@ -33,6 +33,7 @@ #include #include +#include HB_BEGIN_DECLS diff --git a/test/api/test-unicode.c b/test/api/test-unicode.c index a420bf3b..96c61dd0 100644 --- a/test/api/test-unicode.c +++ b/test/api/test-unicode.c @@ -786,6 +786,7 @@ test_unicode_normalization (gconstpointer user_data) { hb_unicode_funcs_t *uf = 
(hb_unicode_funcs_t *) user_data; gunichar a, b, ab; + hb_codepoint_t decomposed[HB_UNICODE_MAX_DECOMPOSITION_LEN]; /* Test compose() */ @@ -849,6 +850,55 @@ test_unicode_normalization (gconstpointer user_data) g_assert (hb_unicode_decompose (uf, 0xCE31, &a, &b) && a == 0xCE20 && b == 0x11B8); g_assert (hb_unicode_decompose (uf, 0xCE20, &a, &b) && a == 0x110E && b == 0x1173); + + /* Test decompose_compatibility() */ + + /* Not decomposable */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x0041, decomposed) == 0); + g_assert (hb_unicode_decompose_compatibility (uf, 0x1F632, decomposed) == 0); + + /* Singletons */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x00B5, decomposed) == 1 && decomposed[0] == 0x03BC); + g_assert (hb_unicode_decompose_compatibility (uf, 0x03D6, decomposed) == 1 && decomposed[0] == 0x03C0); + + /* Arabic compatibility */ + g_assert (hb_unicode_decompose_compatibility (uf, 0xFB54, decomposed) == 1 && decomposed[0] == 0x067B); + + /* Longest decomposition ever */ + g_assert (18 <= HB_UNICODE_MAX_DECOMPOSITION_LEN); + g_assert (hb_unicode_decompose_compatibility (uf, 0xFDFA, decomposed) == 18 && decomposed[17] == 0x0645); + + /* Note: we deliberately don't test characters that have canonical decompositions but no + * compatibility decomposition against the decompose_compatibility() function as that we + * leave up to implementations (for now). 
*/ + + /* Spaces */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x2002, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2003, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2004, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2005, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2006, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2008, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2009, decomposed) == 1 && decomposed[0] == 0x0020); + g_assert (hb_unicode_decompose_compatibility (uf, 0x200A, decomposed) == 1 && decomposed[0] == 0x0020); + + /* Pairs */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x0587, decomposed) == 2 && + decomposed[0] == 0x0565 && decomposed[1] == 0x0582); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2017, decomposed) == 2 && + decomposed[0] == 0x0020 && decomposed[1] == 0x0333); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2025, decomposed) == 2 && + decomposed[0] == 0x002E && decomposed[1] == 0x002E); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2033, decomposed) == 2 && + decomposed[0] == 0x2032 && decomposed[1] == 0x2032); + + /* Triples */ + g_assert (hb_unicode_decompose_compatibility (uf, 0x2026, decomposed) == 3 && + decomposed[0] == 0x002E && decomposed[1] == 0x002E && decomposed[2] == 0x002E); + g_assert (hb_unicode_decompose_compatibility (uf, 0x2034, decomposed) == 3 && + decomposed[0] == 0x2032 && decomposed[1] == 0x2032 && decomposed[2] == 0x2032); + g_assert (hb_unicode_decompose_compatibility (uf, 0x213B, decomposed) == 3 && + decomposed[0] == 0x0046 && decomposed[1] == 0x0041 && decomposed[2] == 0x0058); } -- GitLab