From b7b0b3595bcffe3f4d339dd80f96267f4509dd7a Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Thu, 2 Apr 2020 12:32:17 +0800
Subject: [PATCH] Add unittest for transformer prediction in dygraph_to_static
 (#23207)

* Add unittest for transformer prediction in dygraph_to_static.

* fix bug in fill_constant api.

* Make transpose support size 0. test=develop
---
 paddle/fluid/operators/transpose_op.cu        |   6 +
 paddle/fluid/operators/transpose_op.h         |   8 +-
 python/paddle/fluid/layers/tensor.py          |   5 +-
 .../dygraph_to_static/test_transformer.py     | 143 ++++++++++++++-
 .../transformer_dygraph_model.py              | 170 +++++++++++++++++-
 .../dygraph_to_static/transformer_util.py     |  35 +++-
 6 files changed, 344 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu
index 3152c902b0..f2d39a35c3 100644
--- a/paddle/fluid/operators/transpose_op.cu
+++ b/paddle/fluid/operators/transpose_op.cu
@@ -667,6 +667,9 @@ class TransposeGPUKernel : public framework::OpKernel<T> {
     auto* x = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
     out->mutable_data<T>(context.GetPlace());
+    if (out->numel() == 0) {
+      return;
+    }
 
     std::vector<int> axis = context.Attr<std::vector<int>>("axis");
     int ndims = axis.size();
@@ -688,6 +691,9 @@ class TransposeGradGPUKernel : public framework::OpKernel<T> {
     if (!x_grad) return;
 
     x_grad->mutable_data<T>(context.GetPlace());
+    if (x_grad->numel() == 0) {
+      return;
+    }
     std::vector<int> axis = context.Attr<std::vector<int>>("axis");
     std::vector<int> reversed_axis(axis);
 
diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
index 9ed76d066f..557cb408ee 100644
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -64,7 +64,9 @@ class TransposeKernel : public framework::OpKernel<T> {
     auto* x = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
     out->mutable_data<T>(context.GetPlace());
-
+    if (out->numel() == 0) {
+      return;
+    }
     std::vector<int> axis = context.Attr<std::vector<int>>("axis");
     int ndims = axis.size();
     auto& dev_ctx = context.template device_context<DeviceContext>();
@@ -83,6 +85,10 @@ class TransposeGradKernel : public framework::OpKernel<T> {
     if (!x_grad) return;
 
     x_grad->mutable_data<T>(context.GetPlace());
+    if (x_grad->numel() == 0) {
+      return;
+    }
+
     std::vector<int> axis = context.Attr<std::vector<int>>("axis");
     std::vector<int> reversed_axis(axis);
 
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index c5b77a43ff..9efa313cf0 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -594,13 +594,12 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
                     shape))
         else:
             shape = list(shape.numpy().astype(int))
-        dtype = convert_np_dtype_to_dtype_(dtype)
         if out is None:
             out = _varbase_creator(dtype=dtype)
         core.ops.fill_constant(out, 'value',
                                float(value), 'force_cpu', force_cpu, 'dtype',
-                               dtype, 'str_value', attrs['str_value'], 'shape',
-                               shape)
+                               out.dtype, 'str_value', attrs['str_value'],
+                               'shape', shape)
         out.stop_gradient = True
         return out
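The kernel changes above make a zero-size transpose a no-op on both CPU and GPU, and the fill_constant change reads the dtype from the created variable itself, so a VarBase passed via `out` keeps a consistent dtype. The beam-search decoder added below depends on this: its key/value caches start with a 0-length time dimension. A minimal sketch of the now-supported pattern (assuming a Paddle 1.x build with this patch applied; the shapes are illustrative):

    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        # a cache with an empty time dimension, as built in beam_search below
        cache_k = fluid.layers.fill_constant(
            shape=[2, 4, 8, 0, 64], dtype="float32", value=0)
        # before this patch, transposing a 0-size tensor touched the (empty)
        # data buffer; with the numel() == 0 early return it is a no-op
        out = fluid.layers.transpose(cache_k, perm=[3, 0, 1, 2, 4])
        print(out.shape)  # [0, 2, 4, 8, 64]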
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
index 604b24439a..7d4ccc45ae 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
@@ -21,6 +21,7 @@ import unittest
 import paddle.fluid as fluid
 
 import transformer_util as util
+from transformer_dygraph_model import position_encoding_init
 from transformer_dygraph_model import Transformer
 from transformer_dygraph_model import CrossEntropyCriterion
 
@@ -31,8 +32,8 @@ SEED = 10
 
 
 def train_static(args, batch_generator):
-    train_prog = fluid.default_main_program()
-    startup_prog = fluid.default_startup_program()
+    train_prog = fluid.Program()
+    startup_prog = fluid.Program()
     train_prog.random_seed = SEED
     startup_prog.random_seed = SEED
     with fluid.program_guard(train_prog, startup_prog):
@@ -117,9 +118,9 @@ def train_static(args, batch_generator):
             step_idx += 1
             total_batch_num = total_batch_num + 1
             if step_idx == 10:
-                if args.save_model:
-                    model_path = os.path.join(
-                        args.save_model, "step_" + str(step_idx), "transformer")
+                if args.save_static_model_path:
+                    model_path = os.path.join(args.save_static_model_path,
+                                              "transformer")
                     fluid.save(train_prog, model_path)
                 break
     return np.array(avg_loss)
@@ -201,9 +202,8 @@ def train_dygraph(args, batch_generator):
             batch_id += 1
             step_idx += 1
             if step_idx == 10:
-                if args.save_model:
-                    model_dir = os.path.join(args.save_model + '_dygraph',
-                                             "step_" + str(step_idx))
+                if args.save_dygraph_model_path:
+                    model_dir = os.path.join(args.save_dygraph_model_path)
                     if not os.path.exists(model_dir):
                         os.makedirs(model_dir)
                     fluid.save_dygraph(
@@ -218,18 +218,143 @@ def train_dygraph(args, batch_generator):
     return np.array(avg_loss)
 
 
+def predict_dygraph(args, batch_generator):
+    with fluid.dygraph.guard(place):
+        fluid.default_main_program().random_seed = SEED
+
+        # define data loader
+        test_loader = fluid.io.DataLoader.from_generator(capacity=10)
+        test_loader.set_batch_generator(batch_generator, places=place)
+
+        # define model
+        transformer = Transformer(
+            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
+            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
+            args.d_inner_hid, args.prepostprocess_dropout,
+            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
+            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
+            args.eos_idx)
+
+        # load the trained model
+        model_dict, _ = util.load_dygraph(
+            os.path.join(args.save_dygraph_model_path, "transformer"))
+        # to avoid a longer length than training, reset the size of position
+        # encoding to max_length
+        model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
+            args.max_length + 1, args.d_model)
+        model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
+            args.max_length + 1, args.d_model)
+        transformer.load_dict(model_dict)
+
+        # set evaluate mode
+        transformer.eval()
+
+        step_idx = 0
+        for input_data in test_loader():
+            (src_word, src_pos, src_slf_attn_bias, trg_word,
+             trg_src_attn_bias) = input_data
+            finished_seq, finished_scores = transformer.beam_search(
+                src_word,
+                src_pos,
+                src_slf_attn_bias,
+                trg_word,
+                trg_src_attn_bias,
+                bos_id=args.bos_idx,
+                eos_id=args.eos_idx,
+                beam_size=args.beam_size,
+                max_len=args.max_out_len)
+            finished_seq = finished_seq.numpy()
+            finished_scores = finished_scores.numpy()
+            step_idx += 1
+            if step_idx == 10:
+                break
+    return finished_seq
+
+
+def predict_static(args, batch_generator):
+    test_prog = fluid.Program()
+    with fluid.program_guard(test_prog):
+        test_prog.random_seed = SEED
+
+        # define input and reader
+        input_field_names = util.encoder_data_input_fields + \
+            util.fast_decoder_data_input_fields
+        input_descs = util.get_input_descs(args, 'test')
+        input_slots = [{
+            "name": name,
+            "shape": input_descs[name][0],
+            "dtype": input_descs[name][1]
+        } for name in input_field_names]
+
+        input_field = util.InputField(input_slots)
+        feed_list = input_field.feed_list
+        loader = fluid.io.DataLoader.from_generator(
+            feed_list=feed_list, capacity=10)
+
+        # define model
+        transformer = Transformer(
+            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
+            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
+            args.d_inner_hid, args.prepostprocess_dropout,
+            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
+            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
+            args.eos_idx)
+
+        out_ids, out_scores = transformer.beam_search(
+            *feed_list,
+            bos_id=args.bos_idx,
+            eos_id=args.eos_idx,
+            beam_size=args.beam_size,
+            max_len=args.max_out_len)
+
+        # This is used here to set dropout to the test mode.
+        test_prog = test_prog.clone(for_test=True)
+
+        # define the executor
+        exe = fluid.Executor(place)
+
+        util.load(test_prog,
+                  os.path.join(args.save_static_model_path, "transformer"),
+                  exe)
+
+        loader.set_batch_generator(batch_generator, places=place)
+
+        step_idx = 0
+        for feed_dict in loader:
+            seq_ids, seq_scores = exe.run(
+                test_prog,
+                feed=feed_dict,
+                fetch_list=[out_ids.name, out_scores.name],
+                return_numpy=True)
+            step_idx += 1
+            if step_idx == 10:
+                break
+    return seq_ids
+
+
 class TestTransformer(unittest.TestCase):
     def prepare(self, mode='train'):
         args = util.ModelHyperParams()
         batch_generator = util.get_feed_data_reader(args, mode)
         return args, batch_generator
 
-    def test_train(self):
+    def _test_train(self):
         args, batch_generator = self.prepare(mode='train')
         static_avg_loss = train_static(args, batch_generator)
         dygraph_avg_loss = train_dygraph(args, batch_generator)
         self.assertTrue(np.allclose(static_avg_loss, dygraph_avg_loss))
 
+    def _test_predict(self):
+        args, batch_generator = self.prepare(mode='test')
+        static_res = predict_static(args, batch_generator)
+        dygraph_res = predict_dygraph(args, batch_generator)
+        self.assertTrue(
+            np.allclose(static_res, dygraph_res),
+            msg="static_res: {} \n dygraph_res: {}".format(static_res,
+                                                           dygraph_res))
+
+    def test_check_result(self):
+        self._test_train()
+        self._test_predict()
+
 
 if __name__ == '__main__':
     unittest.main()
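Note that predict_dygraph resets both position-encoding tables to `max_length + 1` rows before loading the weights, so prediction can run with a different maximum length than training. For reference, a rough numpy sketch of the standard sinusoidal table that `position_encoding_init` computes (the exact dtype and scaling details in the model file below may differ):

    import numpy as np

    def sinusoid_position_encoding(n_position, d_model):
        # rows are positions; columns alternate sin/cos over geometric wavelengths
        pos = np.arange(n_position, dtype="float64")[:, None]
        dim = np.arange(d_model // 2, dtype="float64")[None, :]
        angle = pos / np.power(10000.0, 2.0 * dim / d_model)
        table = np.zeros([n_position, d_model])
        table[:, 0::2] = np.sin(angle)
        table[:, 1::2] = np.cos(angle)
        return table.astype("float32")

    # e.g. resizing the table the way predict_dygraph does:
    # model_dict["encoder.pos_encoder.weight"] = sinusoid_position_encoding(257, 512)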
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py
index a7ce8740d1..3d9f94191d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py
@@ -18,8 +18,10 @@ import numpy as np
 
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
 from paddle.fluid.dygraph.jit import dygraph_to_static_func
+from paddle.fluid.layers.utils import map_structure
+from paddle.fluid.framework import Program, Block, Variable, _dygraph_tracer, dygraph_only, _dygraph_guard, _current_expected_place, in_dygraph_mode
 
 
 def position_encoding_init(n_position, d_pos_vec):
@@ -486,3 +488,169 @@ class Transformer(Layer):
         predict = self.decoder(trg_word, trg_pos, trg_slf_attn_bias,
                                trg_src_attn_bias, enc_output)
         return predict
+
+    @dygraph_to_static_func
+    def beam_search(self,
+                    src_word,
+                    src_pos,
+                    src_slf_attn_bias,
+                    trg_word,
+                    trg_src_attn_bias,
+                    bos_id=0,
+                    eos_id=1,
+                    beam_size=4,
+                    max_len=256):
+        def expand_to_beam_size(tensor, beam_size):
+            tensor = layers.reshape(
+                tensor, [tensor.shape[0], 1] + list(tensor.shape[1:]))
+            tile_dims = [1] * len(tensor.shape)
+            tile_dims[1] = beam_size
+            return layers.expand(tensor, tile_dims)
+
+        def merge_batch_beams(tensor):
+            var_dim_in_state = 2  # count in beam dim
+            tensor = layers.transpose(
+                tensor,
+                list(range(var_dim_in_state, len(tensor.shape))) +
+                list(range(0, var_dim_in_state)))
+            tensor = layers.reshape(
+                tensor, [0] * (len(tensor.shape) - var_dim_in_state) +
+                [batch_size * beam_size])
+            res = layers.transpose(
+                tensor,
+                list(
+                    range((len(tensor.shape) + 1 - var_dim_in_state),
+                          len(tensor.shape))) +
+                list(range(0, (len(tensor.shape) + 1 - var_dim_in_state))))
+            return res
+
+        def split_batch_beams(tensor):
+            var_dim_in_state = 1
+            tensor = layers.transpose(
+                tensor,
+                list(range(var_dim_in_state, len(tensor.shape))) +
+                list(range(0, var_dim_in_state)))
+            tensor = layers.reshape(
+                tensor, [0] * (len(tensor.shape) - var_dim_in_state) +
+                [batch_size, beam_size])
+            res = layers.transpose(
+                tensor,
+                list(
+                    range((len(tensor.shape) - 1 - var_dim_in_state),
+                          len(tensor.shape))) +
+                list(range(0, (len(tensor.shape) - 1 - var_dim_in_state))))
+            return res
+
+        def mask_probs(probs, finished, noend_mask_tensor):
+            finished = layers.cast(finished, dtype=probs.dtype)
+            probs = layers.elementwise_mul(
+                layers.expand(
+                    layers.unsqueeze(finished, [2]),
+                    [1, 1, self.trg_vocab_size]),
+                noend_mask_tensor,
+                axis=-1) - layers.elementwise_mul(probs, (finished - 1), axis=0)
+            return probs
+
+        def gather(input, indices, batch_pos):
+            topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
+            return layers.gather_nd(input, topk_coordinates)
+
+        # run encoder
+        enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)
+        batch_size = enc_output.shape[0]
+
+        # constant number
+        inf = float(1. * 1e7)
+        max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
+        vocab_size_tensor = layers.fill_constant(
+            shape=[1], dtype="int64", value=self.trg_vocab_size)
+        end_token_tensor = to_variable(
+            np.full(
+                [batch_size, beam_size], eos_id, dtype="int64"))
+        noend_array = [-inf] * self.trg_vocab_size
+        noend_array[eos_id] = 0
+        noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
+        batch_pos = layers.expand(
+            layers.unsqueeze(
+                to_variable(np.arange(
+                    0, batch_size, 1, dtype="int64")), [1]), [1, beam_size])
+        predict_ids = []
+        parent_ids = []
+        ### initialize states of beam search ###
+        log_probs = to_variable(
+            np.array(
+                [[0.] + [-inf] * (beam_size - 1)] * batch_size,
+                dtype="float32"))
+        finished = fluid.layers.fill_constant(
+            shape=[batch_size, beam_size], value=0, dtype="bool")
+        trg_word = layers.fill_constant(
+            shape=[batch_size * beam_size, 1], dtype="int64", value=bos_id)
+        trg_src_attn_bias = merge_batch_beams(
+            expand_to_beam_size(trg_src_attn_bias, beam_size))
+        enc_output = merge_batch_beams(
+            expand_to_beam_size(enc_output, beam_size))
+
+        # init states (caches) for transformer, need to be updated according
+        # to selected beam
+        caches = [{
+            "k": layers.fill_constant(
+                shape=[batch_size, beam_size, self.n_head, 0, self.d_key],
+                dtype=enc_output.dtype,
+                value=0),
+            "v": layers.fill_constant(
+                shape=[batch_size, beam_size, self.n_head, 0, self.d_value],
+                dtype=enc_output.dtype,
+                value=0),
+        } for i in range(self.n_layer)]
+
+        for i in range(max_len):
+            trg_pos = layers.zeros_like(
+                trg_word) + i  # TODO: modified for dygraph2static
+            caches = map_structure(
+                merge_batch_beams, caches)  # TODO: modified for dygraph2static
+            logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
+                                  enc_output, caches)
+            caches = map_structure(split_batch_beams, caches)
+            step_log_probs = split_batch_beams(
+                fluid.layers.log(fluid.layers.softmax(logits)))
+            step_log_probs = mask_probs(step_log_probs, finished,
+                                        noend_mask_tensor)
+            log_probs = layers.elementwise_add(
+                x=step_log_probs, y=log_probs, axis=0)
+            log_probs = layers.reshape(log_probs,
+                                       [-1, beam_size * self.trg_vocab_size])
+            scores = log_probs
+            topk_scores, topk_indices = fluid.layers.topk(
+                input=scores, k=beam_size)
+            beam_indices = fluid.layers.elementwise_floordiv(
+                topk_indices, vocab_size_tensor)
+            token_indices = fluid.layers.elementwise_mod(
+                topk_indices, vocab_size_tensor)
+
+            # update states
+            caches = map_structure(
+                lambda x: gather(x, beam_indices, batch_pos), caches)
+            log_probs = gather(log_probs, topk_indices, batch_pos)
+            finished = gather(finished, beam_indices, batch_pos)
+            finished = layers.logical_or(
+                finished, layers.equal(token_indices, end_token_tensor))
+            trg_word = layers.reshape(token_indices, [-1, 1])
+
+            predict_ids.append(token_indices)
+            parent_ids.append(beam_indices)
+
+            if layers.reduce_all(finished).numpy():
+                break
+
+        predict_ids = layers.stack(predict_ids, axis=0)
+        parent_ids = layers.stack(parent_ids, axis=0)
+        finished_seq = layers.transpose(
+            layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
+        finished_scores = topk_scores
+
+        return finished_seq, finished_scores
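A note on the bookkeeping in beam_search above: `gather` stacks `[batch_pos, indices]` into (batch, beam) coordinate pairs so that `gather_nd` reorders the states of each batch row independently, and `gather_tree` then backtracks through `parent_ids` to recover whole hypotheses. The coordinate gather is equivalent to numpy advanced indexing; a small self-contained sketch (variable names are illustrative):

    import numpy as np

    batch_size, beam_size = 2, 3
    scores = np.random.rand(batch_size, beam_size).astype("float32")
    # the beam chosen for each output slot, e.g. from a top-k over merged scores
    indices = np.argsort(-scores, axis=1)
    batch_pos = np.tile(np.arange(batch_size)[:, None], [1, beam_size])
    # stack([batch_pos, indices], axis=2) + gather_nd == advanced indexing:
    coords = np.stack([batch_pos, indices], axis=2)    # (batch, beam, 2)
    gathered = scores[coords[..., 0], coords[..., 1]]  # == scores[batch_pos, indices]
    assert np.allclose(gathered, np.sort(scores, axis=1)[:, ::-1])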
"int64", 2], + "src_pos": [(batch_size, seq_len), "int64"], + "src_slf_attn_bias": + [(batch_size, n_head, seq_len, seq_len), "float32"], + "trg_word": [(batch_size, seq_len), "int64", 2], + "trg_pos": [(batch_size, seq_len), "int64"], + "trg_slf_attn_bias": + [(batch_size, n_head, seq_len, seq_len), "float32"], + "trg_src_attn_bias": [(batch_size, n_head, 1, seq_len), "float32"], + "enc_output": [(batch_size, seq_len, d_model), "float32"], + "lbl_word": [(None, 1), "int64"], + "lbl_weight": [(None, 1), "float32"], + "init_score": [(batch_size, 1), "float32", 2], + "init_idx": [(batch_size, ), "int32"], + } + + return input_descs_train if mode == "train" else input_descs_predict encoder_data_input_fields = ( @@ -69,8 +86,8 @@ fast_decoder_data_input_fields = ( class ModelHyperParams(object): print_step = 2 - init_from_params = "trained_models/step_10/" - save_model = "trained_models" + save_dygraph_model_path = "dygraph_trained_models" + save_static_model_path = "static_trained_models" inference_model_dir = "infer_model" output_file = "predict.txt" batch_size = 5 @@ -82,10 +99,10 @@ class ModelHyperParams(object): warmup_steps = 8000 label_smooth_eps = 0.1 beam_size = 5 - max_out_len = 256 + max_out_len = 5 # small number to avoid the unittest timeout n_best = 1 - src_vocab_size = 10000 - trg_vocab_size = 10000 + src_vocab_size = 36556 + trg_vocab_size = 36556 bos_idx = 0 # index for token eos_idx = 1 # index for token unk_idx = 2 # index for token @@ -214,7 +231,7 @@ def get_feed_data_reader(args, mode='train'): def __for_test__(): test_reader = paddle.batch( - wmt16.train(args.src_vocab_size, args.trg_vocab_size), + wmt16.test(args.src_vocab_size, args.trg_vocab_size), batch_size=args.batch_size) for batch in test_reader(): tensors = prepare_infer_input(batch, args.eos_idx, args.eos_idx, -- GitLab