diff --git a/fluid/neural_machine_translation/transformer/util.py b/fluid/neural_machine_translation/transformer/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..77dc3868d894d485b59b67d5d75f5a08a53dbb3c
--- /dev/null
+++ b/fluid/neural_machine_translation/transformer/util.py
@@ -0,0 +1,78 @@
+import sys
+import re
+import six
+import unicodedata
+
+# Regular expression for unescaping token strings.
+# '\u' is converted to '_'
+# '\\' is converted to '\'
+# '\213;' is converted to unichr(213)
+# Inverse of escaping.
+_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
+
+# This set contains all letter and number characters.
+_ALPHANUMERIC_CHAR_SET = set(
+    six.unichr(i) for i in range(sys.maxunicode)
+    if (unicodedata.category(six.unichr(i)).startswith("L") or
+        unicodedata.category(six.unichr(i)).startswith("N")))
+
+
+def tokens_to_ustr(tokens):
+    """
+    Convert a list of tokens to a unicode string, inserting a space
+    between adjacent tokens that both start with an alphanumeric
+    character.
+    """
+    token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
+    ret = []
+    for i, token in enumerate(tokens):
+        if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
+            ret.append(u" ")
+        ret.append(token)
+    return u"".join(ret)
+
+
+def subtoken_ids_to_tokens(subtoken_ids, vocabs):
+    """
+    Convert a list of subtoken (wordpiece) ids to a list of tokens.
+    """
+    concatenated = u"".join(
+        [vocabs.get(subtoken_id, u"") for subtoken_id in subtoken_ids])
+    split = concatenated.split(u"_")
+    ret = []
+    for t in split:
+        if t:
+            unescaped = unescape_token(t + u"_")
+            if unescaped:
+                ret.append(unescaped)
+    return ret
+
+
+def unescape_token(escaped_token):
+    """
+    Inverse of the escaping applied when encoding tokens.
+    """
+
+    def match(m):
+        if m.group(1) is None:
+            return u"_" if m.group(0) == u"\\u" else u"\\"
+
+        try:
+            return six.unichr(int(m.group(1)))
+        except (ValueError, OverflowError):
+            return u"\u3013"  # Unicode for undefined character.
+
+    trimmed = escaped_token[:-1] if escaped_token.endswith(
+        u"_") else escaped_token
+    return _UNESCAPE_REGEX.sub(match, trimmed)
+
+
+def subword_ids_to_str(ids, vocabs):
+    """
+    Convert a list of subtoken (wordpiece) ids to a native string.
+    Refer to SubwordTextEncoder in Tensor2Tensor.
+    """
+    ustr = tokens_to_ustr(subtoken_ids_to_tokens(ids, vocabs))
+    # A unicode object is already the native string type in Python 3;
+    # in Python 2 the native string is bytes, so encode to utf-8 there.
+    return ustr.encode("utf-8") if six.PY2 else ustr
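
Usage note: a minimal sketch of how these helpers compose (the toy vocabulary below is hypothetical; in the real pipeline the id-to-subword mapping is loaded from the trained wordpiece vocabulary file):

    from util import subword_ids_to_str

    # Hypothetical id -> subword-string mapping; a trailing "_" marks
    # the end of an escaped token in the wordpiece scheme above.
    vocabs = {0: u"Hel", 1: u"lo_", 2: u"world_", 3: u"\\92;_"}

    # "Hel" + "lo_" concatenate into the token "Hello"; "world_" is a
    # second token. tokens_to_ustr inserts a space between the two
    # alphanumeric-initial tokens.
    print(subword_ids_to_str([0, 1, 2], vocabs))  # Hello world

    # unescape_token inverts the escape scheme: "\92;" -> unichr(92),
    # i.e. a literal backslash; no space is added before non-alphanumerics.
    print(subword_ids_to_str([0, 1, 3], vocabs))  # Hello\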