diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7a5d83084b27722fe9acbe6cd0b5086613e4c96..660da5a1fbe917593e111275e69908d840341164 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -809,7 +809,7 @@ endif ()
 
 ## src/ executables
 if (NOT HB_DISABLE_TEST_PROGS)
-  foreach (prog main test test-would-substitute test-size-params test-buffer-serialize hb-ot-tag)
+  foreach (prog main test test-would-substitute test-size-params test-buffer-serialize hb-ot-tag test-unicode-ranges)
     set (prog_name ${prog})
     if (${prog_name} STREQUAL "test")
       # test can not be used as a valid executable name on cmake, lets special case it
diff --git a/src/Makefile.am b/src/Makefile.am
index 3f98e1db632096e32f1c557b57c3b960daf40adf..2871f30f40704dce6b99014807ea7c50b46d0cbb 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -381,12 +381,18 @@ dump_use_data_SOURCES = dump-use-data.cc hb-ot-shape-complex-use-table.cc
 dump_use_data_CPPFLAGS = $(HBCFLAGS)
 dump_use_data_LDADD = libharfbuzz.la $(HBLIBS)
 
-check_PROGRAMS += test-ot-tag
-TESTS += test-ot-tag
+check_PROGRAMS += test-ot-tag test-unicode-ranges
+TESTS += test-ot-tag test-unicode-ranges
+
 test_ot_tag_SOURCES = hb-ot-tag.cc
 test_ot_tag_CPPFLAGS = $(HBCFLAGS) -DMAIN
 test_ot_tag_LDADD = libharfbuzz.la $(HBLIBS)
 
+# Test for the OS/2 ulUnicodeRange bit lookup (hb-ot-os2-unicode-ranges.hh).
+test_unicode_ranges_SOURCES = test-unicode-ranges.cc
+test_unicode_ranges_CPPFLAGS = $(HBCFLAGS)
+test_unicode_ranges_LDADD = libharfbuzz.la $(HBLIBS)
+
 TESTS_ENVIRONMENT = \
 	srcdir="$(srcdir)" \
 	MAKE="$(MAKE) $(AM_MAKEFLAGS)" \
diff --git a/src/Makefile.sources b/src/Makefile.sources
index 7883412ca983e6fb511b206a24495a20566c3fe1..e114a5013f069ce147973ea2d9c8444fbc3bcdea 100644
--- a/src/Makefile.sources
+++ b/src/Makefile.sources
@@ -28,6 +28,7 @@ HB_BASE_sources = \
 	hb-ot-maxp-table.hh \
 	hb-ot-name-table.hh \
 	hb-ot-os2-table.hh \
+	hb-ot-os2-unicode-ranges.hh \
 	hb-ot-post-macroman.hh \
 	hb-ot-post-table.hh \
 	hb-ot-tag.cc \
diff --git a/src/gen-unicode-ranges.py b/src/gen-unicode-ranges.py
new file mode 100644
index 
0000000000000000000000000000000000000000..3b59cd862df181b74e1980815e272508861afc47
--- /dev/null
+++ b/src/gen-unicode-ranges.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+
+# Generates the code for a sorted unicode range array as used in hb-ot-os2-unicode-ranges.hh
+# Input is a tab separated list of unicode ranges from the otspec
+# (https://docs.microsoft.com/en-us/typography/opentype/spec/os2#ulunicoderange1).
+#
+# Usage: gen-unicode-ranges.py INPUT_FILE   (the array is printed to stdout)
+# Each input line is "BIT<TAB>NAME<TAB>START-END" or, to reuse the previous
+# bit, "NAME<TAB>START-END" (START/END in hex). Reading stops at the first
+# blank line or at end of file.
+
+import io
+import re
+import sys
+
+if sys.version_info[0] < 3:
+  # Python 2 only: make utf-8 the default codec for implicit str/unicode conversions.
+  reload(sys)
+  sys.setdefaultencoding('utf-8')
+
+print (u"""static Range os2UnicodeRangesSorted[] =
+{""")
+
+args = sys.argv[1:]
+input_file = args[0]
+
+with io.open(input_file, mode="r", encoding="utf-8") as f:
+
+  all_ranges = []
+  current_bit = 0
+  while True:
+    line = f.readline().strip()
+    if not line:
+      break
+    fields = re.split(r'\t+', line)
+    if len(fields) == 3:
+      # A leading bit number applies to this and to following two-field lines.
+      current_bit = fields[0]
+      fields = fields[1:]
+    elif len(fields) != 2:
+      raise ValueError("bad input :(.")
+
+    name = fields[0]
+    ranges = re.split("-", fields[1])
+    if len(ranges) != 2:
+      raise ValueError("bad input :(.")
+
+    v = tuple((int(ranges[0], 16), int(ranges[1], 16), int(current_bit), name))
+    all_ranges.append(v)
+
+all_ranges = sorted(all_ranges, key=lambda t: t[0])
+
+for ranges in all_ranges:
+  start = ("0x%X" % ranges[0]).rjust(8)
+  end = ("0x%X" % ranges[1]).rjust(8)
+  bit = ("%s" % ranges[2]).rjust(3)
+
+  print (" {%s, %s, %s}, // %s" % (start, end, bit, ranges[3]))
+
+print (u"""};""")
diff --git a/src/hb-ot-os2-table.hh b/src/hb-ot-os2-table.hh
index 2d9d214959cc87f4b8fe6f0868608884e0bc898a..6cb8d49495849ffeabf73360d73d966a5851b349 100644
--- a/src/hb-ot-os2-table.hh
+++ b/src/hb-ot-os2-table.hh
@@ -28,7 +28,7 @@
 #define HB_OT_OS2_TABLE_HH
 
 #include "hb-open-type-private.hh"
-
+#include "hb-ot-os2-unicode-ranges.hh"
 
 namespace OT {
 
@@ -67,11 +67,40 @@ struct os2
     os2_prime->usFirstCharIndex.set (min_cp);
     os2_prime->usLastCharIndex.set (max_cp);
 
+    _update_unicode_ranges (plan->codepoints, os2_prime->ulUnicodeRange);
     bool result = 
hb_subset_plan_add_table(plan, HB_OT_TAG_os2, os2_prime_blob);
+
     hb_blob_destroy (os2_prime_blob);
     return result;
   }
 
+  /* Recomputes ulUnicodeRange (128 bits stored as four 32-bit words) from
+   * codepoints: all words are cleared, then for every codepoint the bit
+   * reported by hb_get_unicode_range_bit () is set, if it is below 128. */
+  inline void _update_unicode_ranges (const hb_prealloced_array_t &codepoints,
+                                      HBUINT32 ulUnicodeRange[4]) const
+  {
+    for (unsigned int i = 0; i < 4; i++)
+      ulUnicodeRange[i].set (0);
+
+    for (unsigned int i = 0; i < codepoints.len; i++)
+    {
+      hb_codepoint_t cp = codepoints[i];
+      unsigned int bit = hb_get_unicode_range_bit (cp);
+      if (bit < 128)
+      {
+        unsigned int block = bit / 32;
+        unsigned int bit_in_block = bit % 32;
+        unsigned int mask = 1u << bit_in_block; /* unsigned: index 31 is valid */
+        ulUnicodeRange[block].set (ulUnicodeRange[block] | mask);
+      }
+      /* The spec says bit 57 ("Non Plane 0") implies at least one codepoint
+       * beyond the BMP, so every codepoint in 0x10000..0x10FFFF sets it too. */
+      if (cp >= 0x10000 && cp <= 0x10FFFF)
+        ulUnicodeRange[1].set (ulUnicodeRange[1] | (1u << 25));
+    }
+  }
+
   static inline void find_min_and_max_codepoint (const hb_prealloced_array_t &codepoints,
                                                  uint16_t *min_cp, /* OUT */
                                                  uint16_t *max_cp /* OUT */)
diff --git a/src/hb-ot-os2-unicode-ranges.hh b/src/hb-ot-os2-unicode-ranges.hh
new file mode 100644
index 0000000000000000000000000000000000000000..2cf168f9c55ec8581f69ba8be051d264ac9c9484
--- /dev/null
+++ b/src/hb-ot-os2-unicode-ranges.hh
@@ -0,0 +1,247 @@
+/*
+ * Copyright © 2018 Google, Inc.
+ *
+ * This is part of HarfBuzz, a text shaping library.
+ *
+ * Permission is hereby granted, without written agreement and without
+ * license or royalty fees, to use, copy, modify, and distribute this
+ * software and its documentation for any purpose, provided that the
+ * above copyright notice and the following two paragraphs appear in
+ * all copies of this software.
+ *
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
+ * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
+ * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
+ * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
+ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+ *
+ * Google Author(s): Garret Rieger
+ */
+
+#ifndef HB_OT_OS2_UNICODE_RANGES_HH
+#define HB_OT_OS2_UNICODE_RANGES_HH
+
+#include "hb-private.hh"
+#include "hb-dsalgs.hh"
+
+namespace OT {
+/* One entry of the codepoint-range lookup table defined in this header. */
+struct Range {
+  hb_codepoint_t start; /* first codepoint of the range */
+  hb_codepoint_t end;   /* last codepoint of the range (inclusive) */
+  unsigned int bit;     /* value hb_get_unicode_range_bit () returns for the range */
+};
+
+/* Note: The contents of this array were generated using src/gen-unicode-ranges.py.
+ * Entries are sorted by start so that hb_bsearch_r () can search them. */
+static const Range os2UnicodeRangesSorted[] =
+{
+  {     0x0,     0x7F,   0}, // Basic Latin
+  {    0x80,     0xFF,   1}, // Latin-1 Supplement
+  {   0x100,    0x17F,   2}, // Latin Extended-A
+  {   0x180,    0x24F,   3}, // Latin Extended-B
+  {   0x250,    0x2AF,   4}, // IPA Extensions
+  {   0x2B0,    0x2FF,   5}, // Spacing Modifier Letters
+  {   0x300,    0x36F,   6}, // Combining Diacritical Marks
+  {   0x370,    0x3FF,   7}, // Greek and Coptic
+  {   0x400,    0x4FF,   9}, // Cyrillic
+  {   0x500,    0x52F,   9}, // Cyrillic Supplement
+  {   0x530,    0x58F,  10}, // Armenian
+  {   0x590,    0x5FF,  11}, // Hebrew
+  {   0x600,    0x6FF,  13}, // Arabic
+  {   0x700,    0x74F,  71}, // Syriac
+  {   0x750,    0x77F,  13}, // Arabic Supplement
+  {   0x780,    0x7BF,  72}, // Thaana
+  {   0x7C0,    0x7FF,  14}, // NKo
+  {   0x900,    0x97F,  15}, // Devanagari
+  {   0x980,    0x9FF,  16}, // Bengali
+  {   0xA00,    0xA7F,  17}, // Gurmukhi
+  {   0xA80,    0xAFF,  18}, // Gujarati
+  {   0xB00,    0xB7F,  19}, // Oriya
+  {   0xB80,    0xBFF,  20}, // Tamil
+  {   0xC00,    0xC7F,  21}, // Telugu
+  {   0xC80,    0xCFF,  22}, // Kannada
+  {   0xD00,    0xD7F,  23}, // Malayalam
+  {   0xD80,    0xDFF,  73}, // Sinhala
+  {   0xE00,    0xE7F,  24}, // Thai
+  {   0xE80,    0xEFF,  25}, // Lao
+  {   0xF00,    0xFFF,  70}, // Tibetan
+  {  0x1000,   0x109F,  74}, // Myanmar
+  {  0x10A0,   0x10FF,  26}, // Georgian
+  {  0x1100,   0x11FF,  28}, // Hangul Jamo
+  {  0x1200,   0x137F,  75}, // Ethiopic
+  {  0x1380,   0x139F,  75}, // Ethiopic Supplement
+  {  0x13A0,   0x13FF,  76}, // Cherokee
+  {  0x1400,   0x167F,  77}, // Unified Canadian Aboriginal Syllabics
+  {  0x1680,   0x169F,  78}, // Ogham
+  {  0x16A0,   0x16FF,  79}, // Runic
+  {  0x1700,   0x171F,  84}, // Tagalog
+  {  0x1720,   0x173F,  84}, // Hanunoo
+  {  0x1740,   0x175F,  84}, // Buhid
+  {  0x1760,   0x177F,  84}, // Tagbanwa
+  {  0x1780,   0x17FF,  80}, // Khmer
+  {  0x1800,   0x18AF,  81}, // Mongolian
+  {  0x1900,   0x194F,  93}, // Limbu
+  {  0x1950,   0x197F,  94}, // Tai Le
+  {  0x1980,   0x19DF,  95}, // New Tai Lue
+  {  0x19E0,   0x19FF,  80}, // Khmer Symbols
+  {  0x1A00,   0x1A1F,  96}, // Buginese
+  {  0x1B00,   0x1B7F,  27}, // Balinese
+  {  0x1B80,   0x1BBF, 112}, // Sundanese
+  {  0x1C00,   0x1C4F, 113}, // Lepcha
+  {  0x1C50,   0x1C7F, 114}, // Ol Chiki
+  {  0x1D00,   0x1D7F,   4}, // Phonetic Extensions
+  {  0x1D80,   0x1DBF,   4}, // Phonetic Extensions Supplement
+  {  0x1DC0,   0x1DFF,   6}, // Combining Diacritical Marks Supplement
+  {  0x1E00,   0x1EFF,  29}, // Latin Extended Additional
+  {  0x1F00,   0x1FFF,  30}, // Greek Extended
+  {  0x2000,   0x206F,  31}, // General Punctuation
+  {  0x2070,   0x209F,  32}, // Superscripts And Subscripts
+  {  0x20A0,   0x20CF,  33}, // Currency Symbols
+  {  0x20D0,   0x20FF,  34}, // Combining Diacritical Marks For Symbols
+  {  0x2100,   0x214F,  35}, // Letterlike Symbols
+  {  0x2150,   0x218F,  36}, // Number Forms
+  {  0x2190,   0x21FF,  37}, // Arrows
+  {  0x2200,   0x22FF,  38}, // Mathematical Operators
+  {  0x2300,   0x23FF,  39}, // Miscellaneous Technical
+  {  0x2400,   0x243F,  40}, // Control Pictures
+  {  0x2440,   0x245F,  41}, // Optical Character Recognition
+  {  0x2460,   0x24FF,  42}, // Enclosed Alphanumerics
+  {  0x2500,   0x257F,  43}, // Box Drawing
+  {  0x2580,   0x259F,  44}, // Block Elements
+  {  0x25A0,   0x25FF,  45}, // Geometric Shapes
+  {  0x2600,   0x26FF,  46}, // Miscellaneous Symbols
+  {  0x2700,   0x27BF,  47}, // Dingbats
+  {  0x27C0,   0x27EF,  38}, // Miscellaneous Mathematical Symbols-A
+  {  0x27F0,   0x27FF,  37}, // Supplemental Arrows-A
+  {  0x2800,   0x28FF,  82}, // Braille Patterns
+  {  0x2900,   0x297F,  37}, // Supplemental Arrows-B
+  {  0x2980,   0x29FF,  38}, // Miscellaneous Mathematical Symbols-B
+  {  0x2A00,   0x2AFF,  38}, // Supplemental Mathematical Operators
+  {  0x2B00,   0x2BFF,  37}, // Miscellaneous Symbols and Arrows
+  {  0x2C00,   0x2C5F,  97}, // Glagolitic
+  {  0x2C60,   0x2C7F,  29}, // Latin Extended-C
+  {  0x2C80,   0x2CFF,   8}, // Coptic
+  {  0x2D00,   0x2D2F,  26}, // Georgian Supplement
+  {  0x2D30,   0x2D7F,  98}, // Tifinagh
+  {  0x2D80,   0x2DDF,  75}, // Ethiopic Extended
+  {  0x2DE0,   0x2DFF,   9}, // Cyrillic Extended-A
+  {  0x2E00,   0x2E7F,  31}, // Supplemental Punctuation
+  {  0x2E80,   0x2EFF,  59}, // CJK Radicals Supplement
+  {  0x2F00,   0x2FDF,  59}, // Kangxi Radicals
+  {  0x2FF0,   0x2FFF,  59}, // Ideographic Description Characters
+  {  0x3000,   0x303F,  48}, // CJK Symbols And Punctuation
+  {  0x3040,   0x309F,  49}, // Hiragana
+  {  0x30A0,   0x30FF,  50}, // Katakana
+  {  0x3100,   0x312F,  51}, // Bopomofo
+  {  0x3130,   0x318F,  52}, // Hangul Compatibility Jamo
+  {  0x3190,   0x319F,  59}, // Kanbun
+  {  0x31A0,   0x31BF,  51}, // Bopomofo Extended
+  {  0x31C0,   0x31EF,  61}, // CJK Strokes
+  {  0x31F0,   0x31FF,  50}, // Katakana Phonetic Extensions
+  {  0x3200,   0x32FF,  54}, // Enclosed CJK Letters And Months
+  {  0x3300,   0x33FF,  55}, // CJK Compatibility
+  {  0x3400,   0x4DBF,  59}, // CJK Unified Ideographs Extension A
+  {  0x4DC0,   0x4DFF,  99}, // Yijing Hexagram Symbols
+  {  0x4E00,   0x9FFF,  59}, // CJK Unified Ideographs
+  {  0xA000,   0xA48F,  83}, // Yi Syllables
+  {  0xA490,   0xA4CF,  83}, // Yi Radicals
+  {  0xA500,   0xA63F,  12}, // Vai
+  {  0xA640,   0xA69F,   9}, // Cyrillic Extended-B
+  {  0xA700,   0xA71F,   5}, // Modifier Tone Letters
+  {  0xA720,   0xA7FF,  29}, // Latin Extended-D
+  {  0xA800,   0xA82F, 100}, // Syloti Nagri
+  {  0xA840,   0xA87F,  53}, // Phags-pa
+  {  0xA880,   0xA8DF, 115}, // Saurashtra
+  {  0xA900,   0xA92F, 116}, // Kayah Li
+  {  0xA930,   0xA95F, 117}, // Rejang
+  {  0xAA00,   0xAA5F, 118}, // Cham
+  {  0xAC00,   0xD7AF,  56}, // Hangul Syllables
+  {  0xD800,   0xDFFF,  57}, // Non-Plane 0 *
+  {  0xE000,   0xF8FF,  60}, // Private Use Area (plane 0)
+  {  0xF900,   0xFAFF,  61}, // CJK Compatibility Ideographs
+  {  0xFB00,   0xFB4F,  62}, // Alphabetic Presentation Forms
+  {  0xFB50,   0xFDFF,  63}, // Arabic Presentation Forms-A
+  {  0xFE00,   0xFE0F,  91}, // Variation Selectors
+  {  0xFE10,   0xFE1F,  65}, // Vertical Forms
+  {  0xFE20,   0xFE2F,  64}, // Combining Half Marks
+  {  0xFE30,   0xFE4F,  65}, // CJK Compatibility Forms
+  {  0xFE50,   0xFE6F,  66}, // Small Form Variants
+  {  0xFE70,   0xFEFF,  67}, // Arabic Presentation Forms-B
+  {  0xFF00,   0xFFEF,  68}, // Halfwidth And Fullwidth Forms
+  {  0xFFF0,   0xFFFF,  69}, // Specials
+  { 0x10000,  0x1007F, 101}, // Linear B Syllabary
+  { 0x10080,  0x100FF, 101}, // Linear B Ideograms
+  { 0x10100,  0x1013F, 101}, // Aegean Numbers
+  { 0x10140,  0x1018F, 102}, // Ancient Greek Numbers
+  { 0x10190,  0x101CF, 119}, // Ancient Symbols
+  { 0x101D0,  0x101FF, 120}, // Phaistos Disc
+  { 0x10280,  0x1029F, 121}, // Lycian
+  { 0x102A0,  0x102DF, 121}, // Carian
+  { 0x10300,  0x1032F,  85}, // Old Italic
+  { 0x10330,  0x1034F,  86}, // Gothic
+  { 0x10380,  0x1039F, 103}, // Ugaritic
+  { 0x103A0,  0x103DF, 104}, // Old Persian
+  { 0x10400,  0x1044F,  87}, // Deseret
+  { 0x10450,  0x1047F, 105}, // Shavian
+  { 0x10480,  0x104AF, 106}, // Osmanya
+  { 0x10800,  0x1083F, 107}, // Cypriot Syllabary
+  { 0x10900,  0x1091F,  58}, // Phoenician
+  { 0x10920,  0x1093F, 121}, // Lydian
+  { 0x10A00,  0x10A5F, 108}, // Kharoshthi
+  { 0x12000,  0x123FF, 110}, // Cuneiform
+  { 0x12400,  0x1247F, 110}, // Cuneiform Numbers and Punctuation
+  { 0x1D000,  0x1D0FF,  88}, // Byzantine Musical Symbols
+  { 0x1D100,  0x1D1FF,  88}, // Musical Symbols
+  { 0x1D200,  0x1D24F,  88}, // Ancient Greek Musical Notation
+  { 0x1D300,  0x1D35F, 109}, // Tai Xuan Jing Symbols
+  { 0x1D360,  0x1D37F, 111}, // Counting Rod Numerals
+  { 0x1D400,  0x1D7FF,  89}, // Mathematical Alphanumeric Symbols
+  { 0x1F000,  0x1F02F, 122}, // Mahjong Tiles
+  { 0x1F030,  0x1F09F, 122}, // Domino Tiles
+  { 0x20000,  0x2A6DF,  59}, // CJK Unified Ideographs Extension B
+  { 0x2F800,  0x2FA1F,  61}, // CJK Compatibility Ideographs Supplement
+  { 0xE0000,  0xE007F,  92}, // Tags
+  { 0xE0100,  0xE01EF,  91}, // Variation Selectors Supplement
+  { 0xF0000,  0xFFFFD,  90}, // Private Use (plane 15)
+  {0x100000, 0x10FFFD,  90}, // Private Use (plane 16)
+};
+
+/* Search comparator: returns -1, 0 or 1 as codepoint *_key lies before, inside
+ * or after the inclusive [start, end] span of Range *_item. _arg is unused. */
+static int
+_compare_range (const void *_key, const void *_item, void *_arg)
+{
+  const hb_codepoint_t cp = *(const hb_codepoint_t *) _key;
+  const Range *range = (const Range *) _item;
+  if (cp < range->start) return -1;
+  if (cp > range->end) return 1;
+  return 0;
+}
+
+/**
+ * hb_get_unicode_range_bit:
+ * Returns the bit to be set in os/2 ulUnicodeRange for a given codepoint.
+ * Returns (unsigned int) -1 when no range in the table contains cp.
+ **/
+static unsigned int
+hb_get_unicode_range_bit (hb_codepoint_t cp)
+{
+  const Range *range = (const Range *) hb_bsearch_r (&cp, os2UnicodeRangesSorted,
+    sizeof (os2UnicodeRangesSorted) / sizeof (Range), sizeof (Range),
+    _compare_range, nullptr);
+  if (range == nullptr)
+    return (unsigned int) -1;
+  return range->bit;
+}
+
+} /* namespace OT */
+
+#endif /* HB_OT_OS2_UNICODE_RANGES_HH */
diff --git a/src/test-unicode-ranges.cc b/src/test-unicode-ranges.cc
new file mode 100644
index 0000000000000000000000000000000000000000..16d01ef2096700819313fe94fde6104101c5f034
--- /dev/null
+++ b/src/test-unicode-ranges.cc
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2018 Google, Inc.
+ *
+ * This is part of HarfBuzz, a text shaping library.
+ *
+ * Permission is hereby granted, without written agreement and without
+ * license or royalty fees, to use, copy, modify, and distribute this
+ * software and its documentation for any purpose, provided that the
+ * above copyright notice and the following two paragraphs appear in
+ * all copies of this software.
+ *
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
+ * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
+ * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
+ * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
+ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+ *
+ * Google Author(s): Garret Rieger
+ */
+
+#include "hb-private.hh"
+
+#include "hb-ot-os2-unicode-ranges.hh"
+
+/* Aborts with a diagnostic on stderr unless hb_get_unicode_range_bit (cp)
+ * equals bit; pass -1 as bit to expect "no range contains cp". */
+void
+test (hb_codepoint_t cp, int bit)
+{
+  unsigned int actual = OT::hb_get_unicode_range_bit (cp);
+  if (actual != (unsigned int) bit)
+  {
+    fprintf (stderr, "got incorrect bit (%d) for cp 0x%X. Should have been %d.\n",
+             (int) actual,
+             cp,
+             bit);
+    abort();
+  }
+}
+
+/* Checks range boundaries, interior codepoints and codepoints in no range. */
+void
+test_get_unicode_range_bit (void)
+{
+  test (0x0000, 0);
+  test (0x0042, 0);
+  test (0x007F, 0);
+  test (0x0080, 1);
+
+  test (0x30A0, 50);
+  test (0x30B1, 50);
+  test (0x30FF, 50);
+
+  test (0x10FFFD, 90);
+
+  /* Codepoints that fall in no range of the table. */
+  test (0x0800, -1);
+  test (0x30000, -1);
+  test (0x10FFFE, -1);
+  test (0x110000, -1);
+}
+
+int
+main (void)
+{
+  test_get_unicode_range_bit ();
+  return 0;
+}