From 844c7becdfdbc4a0da704e874501a6452450cecd Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Mon, 9 Apr 2018 07:02:51 +0000
Subject: [PATCH] fix bug

---
 .../benmark/machine_translation.py       | 349 ++++++++++++++++++
 .../transformer/nmt_fluid.py             |  45 ++-
 .../transformer/optim.py                 |   4 +
 3 files changed, 388 insertions(+), 10 deletions(-)
 create mode 100644 fluid/neural_machine_translation/benmark/machine_translation.py

diff --git a/fluid/neural_machine_translation/benmark/machine_translation.py b/fluid/neural_machine_translation/benmark/machine_translation.py
new file mode 100644
index 00000000..5b9e6237
--- /dev/null
+++ b/fluid/neural_machine_translation/benmark/machine_translation.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import distutils.util
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--embedding_dim",
+    type=int,
+    default=512,
+    help="The dimension of embedding table. (default: %(default)d)")
+parser.add_argument(
+    "--encoder_size",
+    type=int,
+    default=512,
+    help="The size of encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--decoder_size",
+    type=int,
+    default=512,
+    help="The size of decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--batch_size",
+    type=int,
+    default=16,
+    help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+    "--dict_size",
+    type=int,
+    default=30000,
+    help="The dictionary capacity. Dictionaries of source sequence and "
+    "target dictionary have same capacity. (default: %(default)d)")
+parser.add_argument(
+    "--pass_num",
+    type=int,
+    default=2,
+    help="The pass number to train. (default: %(default)d)")
+parser.add_argument(
+    "--learning_rate",
+    type=float,
+    default=0.0002,
+    help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(
+    "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(
+    "--beam_size",
+    type=int,
+    default=3,
+    help="The width for beam searching. (default: %(default)d)")
+parser.add_argument(
+    "--use_gpu",
+    type=distutils.util.strtobool,
+    default=True,
+    help="Whether to use gpu. (default: %(default)d)")
+parser.add_argument(
+    "--max_length",
+    type=int,
+    default=250,
+    help="The maximum length of sequence when doing generation. "
+    "(default: %(default)d)")
+
+
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+    def linear(inputs):
+        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+    cell_t = fluid.layers.sums(input=[
+        fluid.layers.elementwise_mul(
+            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+                x=input_gate, y=cell_tilde)
+    ])
+
+    hidden_t = fluid.layers.elementwise_mul(
+        x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+    return hidden_t, cell_t
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+                   target_dict_dim, is_generating, beam_size, max_length):
+    """Construct a seq2seq network."""
+
+    def bi_lstm_encoder(input_seq, gate_size):
+        # Linear transformation part for input gate, output gate, forget gate
+        # and cell activation vectors need be done outside of dynamic_lstm.
+        # So the output size is 4 times of gate_size.
+        input_forward_proj = fluid.layers.fc(input=input_seq,
+                                             size=gate_size * 4,
+                                             act=None,
+                                             bias_attr=False)
+        forward, _ = fluid.layers.dynamic_lstm(
+            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+        input_reversed_proj = fluid.layers.fc(input=input_seq,
+                                              size=gate_size * 4,
+                                              act=None,
+                                              bias_attr=False)
+        reversed, _ = fluid.layers.dynamic_lstm(
+            input=input_reversed_proj,
+            size=gate_size * 4,
+            is_reverse=True,
+            use_peepholes=False)
+        return forward, reversed
+
+    src_word_idx = fluid.layers.data(
+        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    src_embedding = fluid.layers.embedding(
+        input=src_word_idx,
+        size=[source_dict_dim, embedding_dim],
+        dtype='float32')
+
+    src_forward, src_reversed = bi_lstm_encoder(
+        input_seq=src_embedding, gate_size=encoder_size)
+
+    encoded_vector = fluid.layers.concat(
+        input=[src_forward, src_reversed], axis=1)
+
+    encoded_proj = fluid.layers.fc(input=encoded_vector,
+                                   size=decoder_size,
+                                   bias_attr=False)
+
+    backward_first = fluid.layers.sequence_pool(
+        input=src_reversed, pool_type='first')
+
+    decoder_boot = fluid.layers.fc(input=backward_first,
+                                   size=decoder_size,
+                                   bias_attr=False,
+                                   act='tanh')
+
+    def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
+                                    decoder_boot, decoder_size):
+        def simple_attention(encoder_vec, encoder_proj, decoder_state):
+            decoder_state_proj = fluid.layers.fc(input=decoder_state,
+                                                 size=decoder_size,
+                                                 bias_attr=False)
+            decoder_state_expand = fluid.layers.sequence_expand(
+                x=decoder_state_proj, y=encoder_proj)
+            concated = fluid.layers.concat(
+                input=[encoder_proj, decoder_state_expand], axis=1)
+            attention_weights = fluid.layers.fc(input=concated,
+                                                size=1,
+                                                act='tanh',
+                                                bias_attr=False)
+            attention_weights = fluid.layers.sequence_softmax(
+                input=attention_weights)
+            weigths_reshape = fluid.layers.reshape(
+                x=attention_weights, shape=[-1])
+            scaled = fluid.layers.elementwise_mul(
+                x=encoder_vec, y=weigths_reshape, axis=0)
+            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+            return context
+
+        rnn = fluid.layers.DynamicRNN()
+
+        cell_init = fluid.layers.fill_constant_batch_size_like(
+            input=decoder_boot,
+            value=0.0,
+            shape=[-1, decoder_size],
+            dtype='float32')
+        cell_init.stop_gradient = False
+
+        with rnn.block():
+            current_word = rnn.step_input(target_embedding)
+            encoder_vec = rnn.static_input(encoder_vec)
+            encoder_proj = rnn.static_input(encoder_proj)
+            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+            cell_mem = rnn.memory(init=cell_init)
+            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
+            decoder_inputs = fluid.layers.concat(
+                input=[context, current_word], axis=1)
+            h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+            rnn.update_memory(hidden_mem, h)
+            rnn.update_memory(cell_mem, c)
+            out = fluid.layers.fc(input=h,
+                                  size=target_dict_dim,
+                                  bias_attr=True,
+                                  act='softmax')
+            rnn.output(out)
+        return rnn()
+
+    if not is_generating:
+        trg_word_idx = fluid.layers.data(
+            name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+        trg_embedding = fluid.layers.embedding(
+            input=trg_word_idx,
+            size=[target_dict_dim, embedding_dim],
+            dtype='float32')
+
+        prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
+                                                 encoded_proj, decoder_boot,
+                                                 decoder_size)
+        label = fluid.layers.data(
+            name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+
+        return avg_cost, feeding_list
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    lod_t = core.LoDTensor()
+    lod_t.set(flattened_data, place)
+    lod_t.set_lod([lod])
+    return lod_t, lod[-1]
+
+
+def lodtensor_to_ndarray(lod_tensor):
+    dims = lod_tensor.get_dims()
+    ndarray = np.zeros(shape=dims).astype('float32')
+    for i in xrange(np.product(dims)):
+        ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+    return ndarray
+
+
+def train():
+    avg_cost, feeding_list = seq_to_seq_net(
+        args.embedding_dim,
+        args.encoder_size,
+        args.decoder_size,
+        args.dict_size,
+        args.dict_size,
+        False,
+        beam_size=args.beam_size,
+        max_length=args.max_length)
+
+    # clone from default main program
+    inference_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    train_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    test_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    def do_validation():
+        total_loss = 0.0
+        count = 0
+        for batch_id, data in enumerate(test_batch_generator()):
+            src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
+            trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
+            lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
+
+            fetch_outs = exe.run(inference_program,
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost],
+                                 return_numpy=False)
+
+            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
+            count += 1
+
+        return total_loss / count
+
+    for pass_id in xrange(args.pass_num):
+        pass_start_time = time.time()
+        words_seen = 0
+        for batch_id, data in enumerate(train_batch_generator()):
+            src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
+            words_seen += word_num
+            trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
+            words_seen += word_num
+            lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
+
+            fetch_outs = exe.run(framework.default_main_program(),
+                                 feed={
+                                     feeding_list[0]: src_seq,
+                                     feeding_list[1]: trg_seq,
+                                     feeding_list[2]: lbl_seq
+                                 },
+                                 fetch_list=[avg_cost])
+
+            avg_cost_val = np.array(fetch_outs[0])
+            print('pass_id=%d, batch_id=%d, train_loss: %f' %
+                  (pass_id, batch_id, avg_cost_val))
+
+        pass_end_time = time.time()
+        test_loss = do_validation()
+        time_consumed = pass_end_time - pass_start_time
+        words_per_sec = words_seen / time_consumed
+        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
+              (pass_id, test_loss, words_per_sec, time_consumed))
+
+
+def infer():
+    pass
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    if args.infer_only:
+        infer()
+    else:
+        train()
diff --git a/fluid/neural_machine_translation/transformer/nmt_fluid.py b/fluid/neural_machine_translation/transformer/nmt_fluid.py
index b1e9473a..b2a796a8 100644
--- a/fluid/neural_machine_translation/transformer/nmt_fluid.py
+++ b/fluid/neural_machine_translation/transformer/nmt_fluid.py
@@ -11,6 +11,7 @@ from model import transformer, position_encoding_init
 from optim import LearningRateScheduler
 from config import TrainTaskConfig, ModelHyperParams, pos_enc_param_names, \
     encoder_input_data_names, decoder_input_data_names, label_data_names
+import paddle.fluid.debuger as debuger

 def str2bool(v):
     if v.lower() in ('yes', 'true', 't', 'y', '1'):
@@ -117,6 +118,7 @@ def pad_batch_data(insts,

 def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
                         max_length, n_head):
+    print("input_data_name:", input_data_names)
     """
     Put all padded data needed by training into a dict.
     """
@@ -150,6 +152,7 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
             trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape,
             lbl_word, lbl_weight
         ]))
+    #print("input_dict", input_dict)
     return input_dict


@@ -171,7 +174,8 @@ def main():
        TrainTaskConfig.warmup_steps, place,
        TrainTaskConfig.learning_rate)
     optimizer = fluid.optimizer.Adam(
-        learning_rate=lr_scheduler.learning_rate,
+        #learning_rate=lr_scheduler.learning_rate,
+        learning_rate=TrainTaskConfig.learning_rate,
         beta1=TrainTaskConfig.beta1,
         beta2=TrainTaskConfig.beta2,
         epsilon=TrainTaskConfig.eps)
@@ -185,7 +189,6 @@ def main():

     def test(exe):
         test_costs = []
-        #for batch_id, data in enumerate(val_data()):
         for batch_id, data in enumerate(test_reader()):
             if len(data) != args.batch_size:
                 continue
@@ -207,6 +210,7 @@ def main():
         ts = time.time()
         for pass_id in xrange(args.pass_num):
             for batch_id, data in enumerate(train_reader()):
+                print("batch_id:", batch_id)
                 # The current program desc is coupled with batch_size, thus all
                 # mini-batches must have the same number of instances currently.
                 if len(data) != args.batch_size:
@@ -219,16 +223,17 @@ def main():
                     ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length,
                     ModelHyperParams.n_head)

-                lr_scheduler.update_learning_rate(data_input)
+                #print("feed0:", data_input)
+                #print("fetch_list0:", [cost])

-                outs = exe.run(fluid.framework.default_main_program(),
-                               feed=data_input,
-                               fetch_list=[cost],
-                               use_program_cache=True)
+                lr_scheduler.update_learning_rate(data_input)
+                print("before exe run in train_loop")
+                outs = exe.run(trainer_prog,
+                               feed=data_input,
+                               fetch_list=[cost],
+                               use_program_cache=True)

                 cost_val = np.array(outs[0])
-                #print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) +
-                " cost = " + str(cost_val) + "Speed = %.2f img/s")
                 print("pass_id = %d batch = %d cost = %f speed = %.2f sample/s" %
                       (pass_id, batch_id, cost_val,
                        len(data) / (time.time() - start_time)))
@@ -242,7 +247,10 @@ def main():
     if args.local:
         # Initialize the parameters.
         exe.run(fluid.framework.default_startup_program())
+        #print("local start_up:")
+        #print(debuger.pprint_program_codes(fluid.framework.default_startup_program()))
         for pos_enc_param_name in pos_enc_param_names:
+            #print("pos_enc_param_name:", pos_enc_param_name)
             pos_enc_param = fluid.global_scope().find_var(
                 pos_enc_param_name).get_tensor()
             pos_enc_param.set(
@@ -290,9 +298,23 @@ def main():
             exe.run(pserver_startup)
             exe.run(pserver_prog)
         elif training_role == "TRAINER":
+            #print("cost 0:", cost)
+            #print("before run start up")
+
             # Parameter initialization
             exe.run(fluid.default_startup_program())
+            #print("cluster start_up:")
+            #print(debuger.pprint_program_codes(fluid.framework.default_startup_program()))
+
+            for pos_enc_param_name in pos_enc_param_names:
+                #print("pos_enc_param_name:", pos_enc_param_name)
+                pos_enc_param = fluid.global_scope().find_var(
+                    pos_enc_param_name).get_tensor()
+                pos_enc_param.set(
+                    position_encoding_init(ModelHyperParams.max_length + 1,
+                                           ModelHyperParams.d_model), place)
+

             train_reader = paddle.batch(
                 paddle.reader.shuffle(
                     paddle.dataset.wmt16.train(ModelHyperParams.src_vocab_size,
@@ -305,10 +327,13 @@ def main():
                                               ModelHyperParams.trg_vocab_size),
                 batch_size=args.batch_size)
+            #print("before get trainer program")
             trainer_prog = t.get_trainer_program()
+            #print("before start")

             # feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
             # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
-            exe.run(fluid.default_startup_program())
+            # exe.run(fluid.default_startup_program())
+
             train_loop(exe, trainer_prog)
         else:
             print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
diff --git a/fluid/neural_machine_translation/transformer/optim.py b/fluid/neural_machine_translation/transformer/optim.py
index 9905e659..7f494e38 100644
--- a/fluid/neural_machine_translation/transformer/optim.py
+++ b/fluid/neural_machine_translation/transformer/optim.py
@@ -28,6 +28,7 @@ class LearningRateScheduler(object):
             dtype="float32",
             persistable=True)
         self.place = place
+        #print("LearningRateScheduler init learning_rate_name:", self.learning_rate.name)

     def update_learning_rate(self, data_input):
         self.current_steps += 1
@@ -37,4 +38,7 @@ class LearningRateScheduler(object):
         ])
         lr_tensor = fluid.LoDTensor()
         lr_tensor.set(np.array([lr_value], dtype="float32"), self.place)
+        #print("in learning_rate")
+        #print("learning_rate_name:", self.learning_rate.name)
+        #print("data_input:", data_input)
         data_input[self.learning_rate.name] = lr_tensor
--
GitLab