diff --git a/fluid/neural_machine_translation/transformer/util.py b/fluid/neural_machine_translation/transformer/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..77dc3868d894d485b59b67d5d75f5a08a53dbb3c
--- /dev/null
+++ b/fluid/neural_machine_translation/transformer/util.py
@@ -0,0 +1,78 @@
+import sys
+import re
+import six
+import unicodedata
+
+# Regular expression for unescaping token strings.
+# '\u' is converted to '_'
+# '\\' is converted to '\'
+# '\213;' is converted to unichr(213)
+# Inverse of escaping.
+_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
+
+# This set contains all letter and number characters.
+_ALPHANUMERIC_CHAR_SET = set(
+    six.unichr(i) for i in range(sys.maxunicode)
+    if (unicodedata.category(six.unichr(i)).startswith("L") or
+        unicodedata.category(six.unichr(i)).startswith("N")))
+
+
+def tokens_to_ustr(tokens):
+    """
+    Convert a list of tokens to a unicode string, inserting a space
+    between adjacent tokens that both start with an alphanumeric
+    character.
+    """
+    token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
+    ret = []
+    for i, token in enumerate(tokens):
+        if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
+            ret.append(u" ")
+        ret.append(token)
+    return u"".join(ret)
+
+
+def subtoken_ids_to_tokens(subtoken_ids, vocabs):
+    """
+    Convert a list of subtoken (wordpiece) ids to a list of tokens.
+    """
+    concatenated = u"".join(
+        [vocabs.get(subtoken_id, u"") for subtoken_id in subtoken_ids])
+    split = concatenated.split(u"_")
+    ret = []
+    for t in split:
+        if t:
+            unescaped = unescape_token(t + u"_")
+            if unescaped:
+                ret.append(unescaped)
+    return ret
+
+
+def unescape_token(escaped_token):
+    """
+    Inverse of the escaping applied when encoding tokens.
+    """
+
+    def match(m):
+        if m.group(1) is None:
+            return u"_" if m.group(0) == u"\\u" else u"\\"
+
+        try:
+            return six.unichr(int(m.group(1)))
+        except (ValueError, OverflowError):
+            return u"\u3013"  # Unicode for undefined character.
+
+    trimmed = escaped_token[:-1] if escaped_token.endswith(
+        u"_") else escaped_token
+    return _UNESCAPE_REGEX.sub(match, trimmed)
+
+
+def subword_ids_to_str(ids, vocabs):
+    """
+    Convert a list of subtoken (wordpiece) ids to a native string.
+    Refer to SubwordTextEncoder in Tensor2Tensor.
+    """
+    ustr = tokens_to_ustr(subtoken_ids_to_tokens(ids, vocabs))
+    # A unicode object is already the native string type in Python 3;
+    # in Python 2 the native string is bytes, so encode to utf-8 there.
+    return ustr.encode("utf-8") if six.PY2 else ustr
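
Usage note: a minimal sketch of how these helpers compose (the toy vocabulary below is hypothetical; in the real pipeline the id-to-subword mapping is loaded from the trained wordpiece vocabulary file):

    from util import subword_ids_to_str

    # Hypothetical id -> subword-string mapping; a trailing "_" marks
    # the end of an escaped token in the wordpiece scheme above.
    vocabs = {0: u"Hel", 1: u"lo_", 2: u"world_", 3: u"\\92;_"}

    # "Hel" + "lo_" concatenate into the token "Hello"; "world_" is a
    # second token. tokens_to_ustr inserts a space between the two
    # alphanumeric-initial tokens.
    print(subword_ids_to_str([0, 1, 2], vocabs))  # Hello world

    # unescape_token inverts the escape scheme: "\92;" -> unichr(92),
    # i.e. a literal backslash; no space is added before non-alphanumerics.
    print(subword_ids_to_str([0, 1, 3], vocabs))  # Hello\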