Unverified commit f3afe346 authored by G Guo Sheng and committed by GitHub

Merge pull request #1190 from guoshengCS/support-py3-transformer

Support python3 in Transformer
......@@ -115,11 +115,11 @@ seq_len = ModelHyperParams.max_length
# compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size * max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, 1L), "int64", 2],
# [batch_size, max_src_len_in_batch, 1]
"src_word": [(batch_size, seq_len, 1), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size * max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, 1L), "int64"],
# [batch_size, max_src_len_in_batch, 1]
"src_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
......@@ -127,12 +127,12 @@ input_descs = {
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# The actual data shape of trg_word is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, 1L), "int64",
# [batch_size, max_trg_len_in_batch, 1]
"trg_word": [(batch_size, seq_len, 1), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, 1L), "int64"],
# [batch_size, max_trg_len_in_batch, 1]
"trg_pos": [(batch_size, seq_len, 1), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
......@@ -151,15 +151,13 @@ input_descs = {
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(batch_size * seq_len, 1L), "int64"],
"lbl_word": [(batch_size * seq_len, 1), "int64"],
# This input is used to mask out the loss of padding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(batch_size * seq_len, 1L), "float32"],
# These inputs are used to change the shape tensor in beam-search decoder.
"trg_slf_attn_pre_softmax_shape_delta": [(2L, ), "int32"],
"trg_slf_attn_post_softmax_shape_delta": [(4L, ), "int32"],
"init_score": [(batch_size, 1L), "float32"],
"lbl_weight": [(batch_size * seq_len, 1), "float32"],
# This input is used in beam-search decoder.
"init_score": [(batch_size, 1), "float32"],
}
# Names of word embedding table which might be reused for weight sharing.
......
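A note on the 1L → 1 edits above: the L long-integer suffix is Python 2-only syntax and a SyntaxError under Python 3, where int is already arbitrary-precision, so a plain literal is the portable spelling. A tiny illustration with placeholder dimension values, not the real config:

batch_size, seq_len = -1, 256                # placeholder values for illustration only
src_word_shape = (batch_size, seq_len, 1)    # (batch_size, seq_len, 1L) would not even parse on Python 3
assert all(isinstance(d, int) for d in src_word_shape)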
......@@ -59,8 +59,7 @@ def parse_args():
"provided in util.py to do this.")
parser.add_argument(
"--token_delimiter",
type=partial(
str.decode, encoding="string-escape"),
type=lambda x: str(x.encode().decode("unicode-escape")),
default=" ",
help="The delimiter used to split tokens in source or target sentences. "
"For EN-DE BPE data we provided, use spaces as token delimiter.; "
......@@ -99,11 +98,11 @@ def post_process_seq(seq,
if idx == eos_idx:
eos_pos = i
break
seq = seq[:eos_pos + 1]
return filter(
lambda idx: (output_bos or idx != bos_idx) and \
(output_eos or idx != eos_idx),
seq)
seq = [
idx for idx in seq[:eos_pos + 1]
if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx)
]
return seq
def prepare_batch_input(insts, data_input_names, src_pad_idx, bos_idx, n_head,
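The rewrite of post_process_seq above matters because filter() returns a lazy iterator under Python 3 rather than a list, so code that later indexes or re-iterates the result breaks; a list comprehension yields a concrete list on both interpreters. A small self-contained sketch with made-up token ids, simplified in that it always drops the special tokens instead of honouring the output_bos/output_eos flags:

bos_idx, eos_idx = 0, 1              # illustrative special-token ids
seq = [0, 7, 8, 9, 1, 0, 0]          # <bos> ... <eos> followed by padding

# Truncate at the first <eos>, then drop the special tokens, keeping a real list.
eos_pos = seq.index(eos_idx)
cleaned = [idx for idx in seq[:eos_pos + 1]
           if idx != bos_idx and idx != eos_idx]
assert cleaned == [7, 8, 9]
assert isinstance(cleaned, list)     # filter(...) would give a lazy iterator on Python 3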
......@@ -164,8 +163,10 @@ def fast_infer(test_data, trg_idx2word, use_wordpiece):
fluid.io.load_vars(
exe,
InferTaskConfig.model_path,
vars=filter(lambda var: isinstance(var, fluid.framework.Parameter),
fluid.default_main_program().list_vars()))
vars=[
var for var in fluid.default_main_program().list_vars()
if isinstance(var, fluid.framework.Parameter)
])
# This is used here to set dropout to the test mode.
infer_program = fluid.default_main_program().inference_optimize()
......@@ -203,7 +204,7 @@ def fast_infer(test_data, trg_idx2word, use_wordpiece):
post_process_seq(np.array(seq_ids)[sub_start:sub_end]),
trg_idx2word))
scores[i].append(np.array(seq_scores)[sub_end - 1])
print hyps[i][-1]
print(hyps[i][-1])
if len(hyps[i]) >= InferTaskConfig.n_best:
break
......
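The print statements converted throughout this pull request rely on the function form print(...). As a general compatibility note (not something this commit necessarily adds), the same files keep working under Python 2 if the future import appears at the top of each module:

from __future__ import print_function   # makes print(...) a function on Python 2 as well

print("beam hypothesis:", 0)             # identical behaviour on Python 2 and Python 3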
......@@ -12,7 +12,7 @@ def position_encoding_init(n_position, d_pos_vec):
Generate the initial values for the sinusoid position encoding table.
"""
position_enc = np.array([[
pos / np.power(10000, 2 * (j // 2) / d_pos_vec)
pos / np.power(10000, 2. * (j // 2) / d_pos_vec)
for j in range(d_pos_vec)
] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
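The one-character change in position_encoding_init above (2 * ... to 2. * ...) keeps the exponent a float: without it, Python 2 would floor-divide the plain-int expression and flatten the frequency term, while Python 3 already performs true division. A compact sketch of the sinusoid table under that convention; the function name is made up, and the cosine line for odd dimensions mirrors the sine line shown in the hunk:

import numpy as np

def sinusoid_table(n_position, d_pos_vec):
    # PE(pos, 2i) = sin(pos / 10000^(2i/d)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d));
    # row 0 stays zero for the padding position.
    table = np.array([[
        pos / np.power(10000, 2. * (j // 2) / d_pos_vec)  # 2. forces float division on Python 2 too
        for j in range(d_pos_vec)
    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
    table[1:, 0::2] = np.sin(table[1:, 0::2])  # even dimensions
    table[1:, 1::2] = np.cos(table[1:, 1::2])  # odd dimensions
    return table

table = sinusoid_table(8, 4)   # tiny illustrative sizes
assert table.shape == (8, 4)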
......@@ -90,8 +90,7 @@ def multi_head_attention(queries,
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(
x=trans_x,
shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))
x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]])
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
"""
......
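Dropping map(int, ...) from the reshape call above reflects another Python 3 change: map() now returns a lazy iterator, while a shape argument needs concrete integers, and a plain list already contains them. A quick pure-Python illustration with made-up tensor sizes:

trans_x_shape = (16, 8, 4, 64)   # illustrative [batch, n_head, seq_len, head_dim] sizes

# map() is a lazy iterator on Python 3; a plain list of ints is the portable spelling.
shape_arg = [0, 0, trans_x_shape[2] * trans_x_shape[3]]
assert shape_arg == [0, 0, 256]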
import glob
import os
import random
import tarfile
import cPickle
import numpy as np
class SortType(object):
......@@ -204,7 +204,8 @@ class DataReader(object):
self._token_delimiter = token_delimiter
self.load_src_trg_ids(end_mark, fpattern, start_mark, tar_fname,
unk_mark)
self._random = random.Random(x=seed)
self._random = np.random
self._random.seed(seed)
def load_src_trg_ids(self, end_mark, fpattern, start_mark, tar_fname,
unk_mark):
......
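The reader now seeds NumPy's random module instead of constructing random.Random(x=seed); the observable effect is the same kind of reproducible shuffling. A minimal sketch of the pattern, with placeholder data:

import numpy as np

rng = np.random                # the module used directly as the RNG handle, as in DataReader above
rng.seed(1024)                 # fixed seed, so the shuffle order repeats across runs

sample_ids = list(range(10))   # placeholder sample indices
rng.shuffle(sample_ids)
print(sample_ids)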
......@@ -2,8 +2,8 @@ import argparse
import ast
import multiprocessing
import os
import six
import time
from functools import partial
import numpy as np
import paddle.fluid as fluid
......@@ -78,8 +78,7 @@ def parse_args():
help="The <bos>, <eos> and <unk> tokens in the dictionary.")
parser.add_argument(
"--token_delimiter",
type=partial(
str.decode, encoding="string-escape"),
type=lambda x: str(x.encode().decode("unicode-escape")),
default=" ",
help="The delimiter used to split tokens in source or target sentences. "
"For EN-DE BPE data we provided, use spaces as token delimiter. "
......@@ -138,8 +137,6 @@ def pad_batch_data(insts,
"""
return_list = []
max_len = max(len(inst) for inst in insts)
num_token = reduce(lambda x, y: x + y,
[len(inst) for inst in insts]) if return_num_token else 0
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and have no effect on parameter gradients.
inst_data = np.array(
......@@ -151,7 +148,7 @@ def pad_batch_data(insts,
return_list += [inst_weight.astype("float32").reshape([-1, 1])]
else: # position data
inst_pos = np.array([
range(1, len(inst) + 1) + [0] * (max_len - len(inst))
list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, 1])]
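The list(range(...)) wrapper above is needed because Python 3's range is a lazy sequence object and cannot be concatenated to a list with +, whereas Python 2's range returned a real list. A tiny sketch of the padded-position construction with made-up lengths:

max_len = 5
inst = [11, 12, 13]      # illustrative token ids for one instance

# Positions 1..len(inst), padded with 0 up to max_len; range() must be
# materialised with list() before it can be concatenated on Python 3.
inst_pos = list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
assert inst_pos == [1, 2, 3, 0, 0]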
......@@ -176,6 +173,9 @@ def pad_batch_data(insts,
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
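The removed reduce(...) call above would need functools.reduce on Python 3, where the builtin was dropped; the patch counts tokens with an explicit loop instead, and the builtin sum() is an equally portable one-liner. A short sketch with a placeholder batch:

insts = [[4, 5, 6], [7, 8], [9]]   # placeholder batch of token-id lists

# Explicit accumulation, as in pad_batch_data above; no reduce() needed.
num_token = 0
for inst in insts:
    num_token += len(inst)

assert num_token == sum(len(inst) for inst in insts) == 6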
......@@ -323,7 +323,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
lr_scheduler.current_steps = TrainTaskConfig.start_step
else:
print "init fluid.framework.default_startup_program"
print("init fluid.framework.default_startup_program")
exe.run(fluid.framework.default_startup_program())
train_data = reader.DataReader(
......@@ -371,8 +371,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
)) + TrainTaskConfig.label_smooth_eps *
np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20))
step_idx = 0
inst_num = 0
init = False
for pass_id in xrange(TrainTaskConfig.pass_num):
for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time()
for batch_id, data in enumerate(train_data()):
feed_list = []
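Switching to six.moves.xrange above keeps the lazy-range behaviour on both interpreters: Python 3 has no xrange builtin (range itself is lazy there), so six.moves resolves to xrange on Python 2 and to range on Python 3. A minimal sketch with a placeholder pass count:

import six

pass_num = 3   # placeholder for TrainTaskConfig.pass_num

for pass_id in six.moves.xrange(pass_num):   # xrange on Python 2, range on Python 3
    print("starting pass %d" % pass_id)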
......@@ -387,11 +390,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model)
total_num_token += num_token
feed_kv_pairs = data_input_dict.items()
inst_num += len(data_buffer)
feed_kv_pairs = list(data_input_dict.items())
if args.local:
feed_kv_pairs += {
feed_kv_pairs += list({
lr_scheduler.learning_rate.name: lr_rate
}.items()
}.items())
feed_list.append(dict(feed_kv_pairs))
if not init:
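Wrapping .items() in list(...) above is required because Python 3 returns a dict view rather than a list, and a view cannot be extended in place with +=; materialising it restores the Python 2 behaviour. A small sketch with placeholder feed entries:

data_input_dict = {"src_word": 0, "src_pos": 1}   # placeholder feed entries
lr_feed = {"learning_rate": 0.001}                # placeholder scheduler entry

feed_kv_pairs = list(data_input_dict.items())     # a real list, so += below works on Python 3
feed_kv_pairs += list(lr_feed.items())

feed = dict(feed_kv_pairs)
assert set(feed) == {"src_word", "src_pos", "learning_rate"}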
......@@ -409,14 +413,17 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
) # sum the cost from multi-devices
total_token_num = token_num_val.sum()
total_avg_cost = total_sum_cost / total_token_num
print("epoch: %d, batch: %d, avg loss: %f, normalized loss: %f,"
" ppl: %f" % (pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)])))
print(
"step_idx: %d, total samples: %d, epoch: %d, batch: %d, avg loss: %f, "
"normalized loss: %f, ppl: %f" %
(step_idx, inst_num, pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)])))
if batch_id > 0 and batch_id % 1000 == 0:
fluid.io.save_persistables(
exe,
os.path.join(TrainTaskConfig.ckpt_dir, "latest.checkpoint"))
step_idx += 1
init = True
time_consumed = time.time() - pass_start_time
......@@ -449,7 +456,7 @@ def train(args):
is_local = os.getenv("PADDLE_IS_LOCAL", "1")
if is_local == '0':
args.local = False
print args
print(args)
if args.device == 'CPU':
TrainTaskConfig.use_gpu = False
......@@ -530,7 +537,7 @@ def train(args):
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
print "psserver begin run"
print("psserver begin run")
with open('pserver_startup.desc', 'w') as f:
f.write(str(pserver_startup))
with open('pserver_prog.desc', 'w') as f:
......
......@@ -17,6 +17,35 @@ _ALPHANUMERIC_CHAR_SET = set(
unicodedata.category(six.unichr(i)).startswith("N")))
# Unicode utility functions that work with Python 2 and 3
def native_to_unicode(s):
return s if is_unicode(s) else to_unicode(s)
def unicode_to_native(s):
if six.PY2:
return s.encode("utf-8") if is_unicode(s) else s
else:
return s
def is_unicode(s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def to_unicode(s, ignore_errors=False):
if is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def unescape_token(escaped_token):
"""
Inverse of encoding escaping.
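The helpers added above centralise the str/unicode juggling: native_to_unicode always yields the text type, and unicode_to_native converts back to the interpreter's native str (UTF-8 bytes on Python 2, plain str on Python 3). A small round-trip sketch, assuming the helpers above are in scope:

token = native_to_unicode("machine_translation")   # always the text type
assert is_unicode(token)

native = unicode_to_native(token)   # UTF-8 bytes on Python 2, unchanged str on Python 3
assert isinstance(native, str)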
......@@ -44,9 +73,7 @@ def subtoken_ids_to_str(subtoken_ids, vocabs):
subtokens = [vocabs.get(subtoken_id, u"") for subtoken_id in subtoken_ids]
# Convert a list of subtokens to a list of tokens.
concatenated = "".join([
t if isinstance(t, unicode) else t.decode("utf-8") for t in subtokens
])
concatenated = "".join([native_to_unicode(t) for t in subtokens])
split = concatenated.split("_")
tokens = []
for t in split:
......@@ -65,4 +92,4 @@ def subtoken_ids_to_str(subtoken_ids, vocabs):
ret.append(token)
seq = "".join(ret)
return seq.encode("utf-8")
return unicode_to_native(seq)