diff --git a/dygraph/mobilenet/train.py b/dygraph/mobilenet/train.py index 0650f04e656367aa01c83cffba62d40e6b757735..34ece762d43126063bf481086d0d46494f0e7f33 100644 --- a/dygraph/mobilenet/train.py +++ b/dygraph/mobilenet/train.py @@ -47,8 +47,11 @@ def eval(net, test_data_loader, eop): t_last = 0 place_num = paddle.fluid.core.get_cuda_device_count( ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1)) + + batch_start = time.time() for img, label in test_data_loader(): - t1 = time.time() + batch_reader_end = time.time() + label = to_variable(label.numpy().astype('int64').reshape( int(args.batch_size // place_num), 1)) out = net(img) @@ -57,15 +60,19 @@ def eval(net, test_data_loader, eop): avg_loss = fluid.layers.mean(x=loss) acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) - t2 = time.time() - print( "test | epoch id: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec read_t:%2.4f" % \ - (eop, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), t2 - t1 , t1 - t_last)) - sys.stdout.flush() total_loss += avg_loss.numpy() total_acc1 += acc_top1.numpy() total_acc5 += acc_top5.numpy() + + test_batch_cost = time.time() - batch_start total_sample += 1 - t_last = time.time() + print( + "test | epoch %d, avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f s, reader_cost: %.5f s" + % (eop, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), + test_batch_cost, batch_reader_end - batch_start)) + sys.stdout.flush() + batch_start = time.time() + print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \ (total_loss / total_sample, \ total_acc1 / total_sample, total_acc5 / total_sample)) @@ -148,9 +155,12 @@ def train_mobilenet(): # 4. train loop total_batch_num = 0 #this is for benchmark for eop in range(args.num_epochs): + epoch_start = time.time() + if num_trainers > 1: imagenet_reader.set_shuffle_seed(eop + ( args.random_seed if args.random_seed else 0)) + net.train() total_loss = 0.0 total_acc1 = 0.0 @@ -158,24 +168,23 @@ def train_mobilenet(): total_sample = 0 batch_id = 0 t_last = 0 + # 4.1 for each batch, call net() , backward(), and minimize() + batch_start = time.time() for img, label in train_data_loader(): - t1 = time.time() if args.max_iter and total_batch_num == args.max_iter: return - t_start = time.time() + batch_reader_end = time.time() # 4.1.1 call net() out = net(img) - - t_end = time.time() softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy( input=softmax_out, label=label) avg_loss = fluid.layers.mean(x=loss) acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) - t_start_back = time.time() + batch_net_end = time.time() # 4.1.2 call backward() if args.use_data_parallel: @@ -184,37 +193,43 @@ def train_mobilenet(): net.apply_collective_grads() else: avg_loss.backward() - - t_end_back = time.time() + batch_backward_end = time.time() # 4.1.3 call minimize() optimizer.minimize(avg_loss) net.clear_gradients() t2 = time.time() - train_batch_elapse = t2 - t1 - if batch_id % args.print_step == 0: - print( "epoch id: %d, batch step: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec net_t:%2.4f back_t:%2.4f read_t:%2.4f" % \ - (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), train_batch_elapse, - t_end - t_start, t_end_back - t_start_back, t1 - t_last)) - sys.stdout.flush() total_loss += avg_loss.numpy() total_acc1 += acc_top1.numpy() total_acc5 += 
acc_top5.numpy() total_sample += 1 batch_id += 1 - t_last = time.time() # NOTE: used for benchmark + train_batch_cost = time.time() - batch_start total_batch_num = total_batch_num + 1 + if batch_id % args.print_step == 0: + print( + "[Epoch %d, batch %d], avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f s, net_t: %.5f s, backward_t: %.5f s, reader_t: %.5f s" + % (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(), + acc_top5.numpy(), train_batch_cost, + batch_net_end - batch_reader_end, + batch_backward_end - batch_net_end, + batch_reader_end - batch_start)) + sys.stdout.flush() + batch_start = time.time() if args.ce: print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample)) print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample)) print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample)) - print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \ - (eop, batch_id, total_loss / total_sample, \ - total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse)) + + train_epoch_cost = time.time() - epoch_start + print( + "[Epoch %d], loss %.5f, acc1 %.5f, acc5 %.5f, epoch_cost: %.5f s" + % (eop, total_loss / total_sample, total_acc1 / total_sample, + total_acc5 / total_sample, train_epoch_cost)) # 4.2 save checkpoint save_parameters = (not args.use_data_parallel) or ( diff --git a/dygraph/ptb_lm/ptb_dy.py b/dygraph/ptb_lm/ptb_dy.py index 0a8ed9494e16937ac1fac4068e37b2a6415212bb..f38a8c93209e9aa4e3bab8c403744bedeb465425 100644 --- a/dygraph/ptb_lm/ptb_dy.py +++ b/dygraph/ptb_lm/ptb_dy.py @@ -32,9 +32,6 @@ import time from args import * -#import fluid.clip as clip -#from fluid.clip import * - import sys if sys.version[0] == '2': reload(sys) @@ -386,9 +383,11 @@ def train_ptb_lm(): ce_time = [] ce_ppl = [] - + total_batch_num = 0 #this is for benchmark for epoch_id in range(max_epoch): + epoch_start = time.time() + ptb_model.train() total_loss = 0.0 iters = 0.0 @@ -405,11 +404,11 @@ def train_ptb_lm(): init_hidden = to_variable(init_hidden_data) init_cell = to_variable(init_cell_data) - start_time = time.time() + + batch_start = time.time() for batch_id, batch in enumerate(train_data_loader): if args.max_iter and total_batch_num == args.max_iter: return - batch_start = time.time() x, y = batch dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, @@ -423,23 +422,26 @@ def train_ptb_lm(): ptb_model.clear_gradients() total_loss += out_loss - batch_end = time.time() - train_batch_cost = batch_end - batch_start iters += num_steps - total_batch_num = total_batch_num + 1 #this is for benchmark + total_batch_num = total_batch_num + 1 #this is for benchmark + train_batch_cost = time.time() - batch_start if batch_id > 0 and batch_id % log_interval == 0: ppl = np.exp(total_loss / iters) - print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch cost: %.5f" % - (epoch_id, batch_id, ppl[0], - sgd._global_learning_rate().numpy(), out_loss, train_batch_cost)) + print( + "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f s" + % (epoch_id, batch_id, ppl[0], + sgd._global_learning_rate().numpy(), out_loss, + train_batch_cost)) + batch_start = time.time() - print("one epoch finished", epoch_id) - print("time cost ", time.time() - start_time) ppl = np.exp(total_loss / iters) - ce_time.append(time.time() - start_time) + train_epoch_cost = time.time() - epoch_start + print("-- Epoch:[%d]; ppl: %.5f, epoch_cost: %.5f s" % + (epoch_id, ppl[0], train_epoch_cost)) + + ce_time.append(train_epoch_cost) 
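# A minimal, self-contained sketch of the timing pattern these diffs apply to
# every training loop (the toy data loader and train_step below are
# hypothetical stand-ins, not the Paddle objects used above): reader_cost is
# the wait for the next batch, batch_cost is the full iteration, and
# epoch_cost wraps the whole epoch. The same structure reappears in the
# resnet, seq2seq and transformer changes further down.
import time

def run_epoch(data_loader, train_step, log_interval=10):
    epoch_start = time.time()
    batch_start = time.time()
    for batch_id, batch in enumerate(data_loader):
        # time spent waiting for this batch since the previous iteration ended
        reader_cost = time.time() - batch_start
        train_step(batch)  # forward, backward, minimize
        batch_cost = time.time() - batch_start
        if batch_id % log_interval == 0:
            print("batch %d, batch_cost: %.5f s, reader_cost: %.5f s"
                  % (batch_id, batch_cost, reader_cost))
        # reset after logging so the next reader_cost starts counting here
        batch_start = time.time()
    print("epoch_cost: %.5f s" % (time.time() - epoch_start))

# toy usage: a generator stands in for train_data_loader()
run_epoch((i for i in range(50)), lambda batch: time.sleep(0.001))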
ce_ppl.append(ppl[0]) - print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0])) if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000: # for bad init, after first epoch, the loss is over 1000 diff --git a/dygraph/resnet/reader.py b/dygraph/resnet/reader.py index 92db91b8a880359289de99ce24b2348aae88dac8..ccb9fae2b5fe6ea6a7d29601a2585a4b80d749e0 100644 --- a/dygraph/resnet/reader.py +++ b/dygraph/resnet/reader.py @@ -190,7 +190,10 @@ class ImageNetReader: full_lines = [line.strip() for line in flist] if mode != "test" and len(full_lines) < settings.batch_size: print( - "Warning: The number of the whole data ({}) is smaller than the batch_size ({}), and drop_last is turnning on, so nothing will feed in program, Terminated now. Please reset batch_size to a smaller number or feed more data!" + "Warning: The number of the whole data ({}) is smaller " + "than the batch_size ({}), and drop_last is turnning on, " + "so nothing will feed in program, Terminated now. Please " + "reset batch_size to a smaller number or feed more data!" .format(len(full_lines), settings.batch_size)) os._exit(1) if shuffle: diff --git a/dygraph/resnet/train.py b/dygraph/resnet/train.py index a6780feebc3499e2450a67f6316cb09a331c4753..c6a9c888ca946fbfc65216592ec19a7aaae32ae7 100644 --- a/dygraph/resnet/train.py +++ b/dygraph/resnet/train.py @@ -331,8 +331,6 @@ def eval(model, data): label.stop_gradient = True out = model(img) - #loss = fluid.layers.cross_entropy(input=out, label=label) - #avg_loss = fluid.layers.mean(x=loss) acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) @@ -344,7 +342,6 @@ def eval(model, data): total_acc5 += acc_top5.numpy() total_sample += 1 - # print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out)) if batch_id % 10 == 0: print("test | batch step %d, acc1 %0.3f acc5 %0.3f" % \ ( batch_id, total_acc1 / total_sample, total_acc5 / total_sample)) @@ -413,13 +410,11 @@ def train_resnet(): use_multiprocess=True) test_loader.set_sample_list_generator(test_reader, places=place) - #file_name = './model/epoch_0.npz' - #model_data = np.load( file_name ) - #NOTE: used in benchmark total_batch_num = 0 for eop in range(epoch): + epoch_start = time.time() resnet.train() total_loss = 0.0 @@ -427,17 +422,13 @@ def train_resnet(): total_acc5 = 0.0 total_sample = 0 - #dict_state = resnet.state_dict() - - #resnet.load_dict( model_data ) - - print("load finished") - + batch_start = time.time() for batch_id, data in enumerate(train_loader()): #NOTE: used in benchmark if args.max_iter and total_batch_num == args.max_iter: return - batch_start = time.time() + + train_reader_cost = time.time() - batch_start img, label = data label.stop_gradient = True @@ -461,26 +452,32 @@ def train_resnet(): optimizer.minimize(avg_loss) resnet.clear_gradients() - batch_end = time.time() - train_batch_cost = batch_end - batch_start total_loss += dy_out total_acc1 += acc_top1.numpy() total_acc5 += acc_top5.numpy() total_sample += 1 + + train_batch_cost = time.time() - batch_start total_batch_num = total_batch_num + 1 #this is for benchmark - #print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out)) if batch_id % 10 == 0: - print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f, batch cost: %.5f" % \ - ( eop, batch_id, total_loss / total_sample, \ - total_acc1 / total_sample, total_acc5 / total_sample, train_batch_cost)) + print( + "[Epoch %d, batch %d] loss %.5f, acc1 %.5f, acc5 %.5f, batch_cost: %.5f s, reader_cost: %.5f 
s" + % (eop, batch_id, total_loss / total_sample, + total_acc1 / total_sample, total_acc5 / total_sample, + train_batch_cost, train_reader_cost)) + batch_start = time.time() if args.ce: print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample)) print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample)) print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample)) - print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ - (eop, batch_id, total_loss / total_sample, \ - total_acc1 / total_sample, total_acc5 / total_sample)) + + train_epoch_cost = time.time() - epoch_start + print( + "[Epoch %d], loss %.5f, acc1 %.5f, acc5 %.5f, epoch_cost: %.5f s" + % (eop, total_loss / total_sample, total_acc1 / total_sample, + total_acc5 / total_sample, train_epoch_cost)) + resnet.eval() eval(resnet, test_loader) diff --git a/dygraph/seq2seq/README.md b/dygraph/seq2seq/README.md old mode 100755 new mode 100644 diff --git a/dygraph/seq2seq/__init__.py b/dygraph/seq2seq/__init__.py old mode 100755 new mode 100644 diff --git a/dygraph/seq2seq/args.py b/dygraph/seq2seq/args.py old mode 100755 new mode 100644 diff --git a/dygraph/seq2seq/attention_model.py b/dygraph/seq2seq/attention_model.py old mode 100755 new mode 100644 index bf8c16ad6552218bb759505ff087dcc41bdf6c01..33b61d3e66ffad4a193e5eb3ad36d0738960acae --- a/dygraph/seq2seq/attention_model.py +++ b/dygraph/seq2seq/attention_model.py @@ -30,6 +30,7 @@ alpha = 0.6 uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x) zero_constant = fluid.initializer.Constant(0.0) + class AttentionModel(fluid.dygraph.Layer): def __init__(self, hidden_size, @@ -79,55 +80,61 @@ class AttentionModel(fluid.dygraph.Layer): self.enc_units = [] for i in range(num_layers): self.enc_units.append( - self.add_sublayer("enc_units_%d" % i, + self.add_sublayer( + "enc_units_%d" % i, BasicLSTMUnit( - hidden_size=self.hidden_size, - input_size=self.hidden_size, - param_attr=param_attr, - bias_attr=bias_attr, - forget_bias=forget_bias))) + hidden_size=self.hidden_size, + input_size=self.hidden_size, + param_attr=param_attr, + bias_attr=bias_attr, + forget_bias=forget_bias))) self.dec_units = [] for i in range(num_layers): if i == 0: self.dec_units.append( - self.add_sublayer("dec_units_%d" % i, + self.add_sublayer( + "dec_units_%d" % i, BasicLSTMUnit( - hidden_size=self.hidden_size, - input_size=self.hidden_size * 2, - param_attr=param_attr, - bias_attr=bias_attr, - forget_bias=forget_bias))) + hidden_size=self.hidden_size, + input_size=self.hidden_size * 2, + param_attr=param_attr, + bias_attr=bias_attr, + forget_bias=forget_bias))) else: self.dec_units.append( - self.add_sublayer("dec_units_%d" % i, - BasicLSTMUnit( - hidden_size=self.hidden_size, - input_size=self.hidden_size, - param_attr=param_attr, - bias_attr=bias_attr, - forget_bias=forget_bias))) - - self.fc = fluid.dygraph.nn.Linear(self.hidden_size, - self.tar_vocab_size, - param_attr=param_attr, - bias_attr=False) - - self.attn_fc = fluid.dygraph.nn.Linear(self.hidden_size, - self.hidden_size, - param_attr=param_attr, - bias_attr=False) - - self.concat_fc = fluid.dygraph.nn.Linear(2 * self.hidden_size, - self.hidden_size, - param_attr=param_attr, - bias_attr=False) + self.add_sublayer( + "dec_units_%d" % i, + BasicLSTMUnit( + hidden_size=self.hidden_size, + input_size=self.hidden_size, + param_attr=param_attr, + bias_attr=bias_attr, + forget_bias=forget_bias))) + + self.fc = fluid.dygraph.nn.Linear( + self.hidden_size, + self.tar_vocab_size, + param_attr=param_attr, + 
bias_attr=False) + + self.attn_fc = fluid.dygraph.nn.Linear( + self.hidden_size, + self.hidden_size, + param_attr=param_attr, + bias_attr=False) + + self.concat_fc = fluid.dygraph.nn.Linear( + 2 * self.hidden_size, + self.hidden_size, + param_attr=param_attr, + bias_attr=False) def _transpose_batch_time(self, x): return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape)))) def _merge_batch_beams(self, x): - return fluid.layers.reshape(x, shape=(-1,x.shape[2])) + return fluid.layers.reshape(x, shape=(-1, x.shape[2])) def tile_beam_merge_with_batch(self, x): x = fluid.layers.unsqueeze(x, [1]) # [batch_size, 1, ...] @@ -135,7 +142,7 @@ class AttentionModel(fluid.dygraph.Layer): expand_times[1] = self.beam_size x = fluid.layers.expand(x, expand_times) # [batch_size, beam_size, ...] x = fluid.layers.transpose(x, list(range(2, len(x.shape))) + - [0, 1]) # [..., batch_size, beam_size] + [0, 1]) # [..., batch_size, beam_size] # use 0 to copy to avoid wrong shape x = fluid.layers.reshape( x, shape=[0] * @@ -159,7 +166,7 @@ class AttentionModel(fluid.dygraph.Layer): new_state = fluid.layers.elementwise_mul(new_state, step_mask, axis=0) - \ fluid.layers.elementwise_mul(state, (step_mask - 1), axis=0) return new_state - + def _gather(self, x, indices, batch_pos): topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2) return fluid.layers.gather_nd(x, topk_coordinates) @@ -184,12 +191,19 @@ class AttentionModel(fluid.dygraph.Layer): if src.shape[0] < self.batch_size: self.batch_size = src.shape[0] src_emb = self.src_embeder(self._transpose_batch_time(src)) - - enc_hidden = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32')) - enc_cell = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32')) + + enc_hidden = to_variable( + np.zeros( + (self.num_layers, self.batch_size, self.hidden_size), + dtype='float32')) + enc_cell = to_variable( + np.zeros( + (self.num_layers, self.batch_size, self.hidden_size), + dtype='float32')) max_seq_len = src_emb.shape[0] - enc_len_mask = fluid.layers.sequence_mask(src_sequence_length, maxlen=max_seq_len, dtype="float32") + enc_len_mask = fluid.layers.sequence_mask( + src_sequence_length, maxlen=max_seq_len, dtype="float32") enc_padding_mask = (enc_len_mask - 1.0) enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0]) enc_states = [[enc_hidden, enc_cell]] @@ -200,7 +214,8 @@ class AttentionModel(fluid.dygraph.Layer): enc_hidden, enc_cell = enc_states[l] new_enc_hidden, new_enc_cell = [], [] for i in range(self.num_layers): - new_hidden, new_cell = self.enc_units[i](step_input, enc_hidden[i], enc_cell[i]) + new_hidden, new_cell = self.enc_units[i]( + step_input, enc_hidden[i], enc_cell[i]) new_enc_hidden.append(new_hidden) new_enc_cell.append(new_cell) if self.dropout != None and self.dropout > 0.0: @@ -210,17 +225,25 @@ class AttentionModel(fluid.dygraph.Layer): dropout_implementation='upscale_in_train') else: step_input = new_hidden - new_enc_hidden = [self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) for i in range(self.num_layers)] - new_enc_cell = [self._real_state(enc_cell[i], new_enc_cell[i], step_mask) for i in range(self.num_layers)] + new_enc_hidden = [ + self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) + for i in range(self.num_layers) + ] + new_enc_cell = [ + self._real_state(enc_cell[i], new_enc_cell[i], step_mask) + for i in range(self.num_layers) + ] enc_states.append([new_enc_hidden, new_enc_cell]) enc_outputs.append(step_input) 
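# What `_real_state` does in the encoder loop here, written out with NumPy so
# the masking is visible in isolation (shapes and values are invented for the
# example): positions whose source sequence has already ended (step_mask == 0)
# keep their previous LSTM state, the rest take the freshly computed one.
import numpy as np

def real_state(old_state, new_state, step_mask):
    # step_mask: [batch], 1.0 while the sequence is running, 0.0 once padded;
    # broadcasting over the hidden dim mirrors elementwise_mul(..., axis=0)
    m = step_mask[:, None]
    return new_state * m - old_state * (m - 1.0)  # == m * new + (1 - m) * old

old = np.zeros((3, 4), dtype="float32")
new = np.ones((3, 4), dtype="float32")
mask = np.array([1.0, 1.0, 0.0], dtype="float32")  # third sequence is padding
print(real_state(old, new, mask))  # rows 0-1 updated, row 2 keeps the old state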
enc_outputs = fluid.layers.stack(enc_outputs) enc_outputs = self._transpose_batch_time(enc_outputs) - if self.mode in ['train', 'eval']: + if self.mode in ['train', 'eval']: # calculation with input_feed derives from paper: https://arxiv.org/pdf/1508.04025.pdf - input_feed = to_variable(np.zeros((self.batch_size, self.hidden_size), dtype='float32')) - + input_feed = to_variable( + np.zeros( + (self.batch_size, self.hidden_size), dtype='float32')) + dec_hidden, dec_cell = enc_states[-1] tar_emb = self.tar_embeder(self._transpose_batch_time(tar)) max_seq_len = tar_emb.shape[0] @@ -231,7 +254,8 @@ class AttentionModel(fluid.dygraph.Layer): step_input = fluid.layers.concat([step_input, input_feed], 1) new_dec_hidden, new_dec_cell = [], [] for i in range(self.num_layers): - new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i]) + new_hidden, new_cell = self.dec_units[i]( + step_input, dec_hidden[i], dec_cell[i]) new_dec_hidden.append(new_hidden) new_dec_cell.append(new_cell) @@ -243,7 +267,8 @@ class AttentionModel(fluid.dygraph.Layer): else: step_input = new_hidden - dec_att = self.attention(step_input, enc_outputs, enc_padding_mask) + dec_att = self.attention(step_input, enc_outputs, + enc_padding_mask) dec_att = fluid.layers.squeeze(dec_att, [1]) concat_att_out = fluid.layers.concat([dec_att, step_input], 1) out = self.concat_fc(concat_att_out) @@ -253,9 +278,9 @@ class AttentionModel(fluid.dygraph.Layer): dec_output = fluid.layers.stack(dec_output) dec_output = self.fc(self._transpose_batch_time(dec_output)) - + loss = fluid.layers.softmax_with_cross_entropy( - logits=dec_output, label=label, soft_label=False) + logits=dec_output, label=label, soft_label=False) loss = fluid.layers.squeeze(loss, axes=[2]) max_tar_seq_len = fluid.layers.shape(tar)[1] tar_mask = fluid.layers.sequence_mask( @@ -264,28 +289,45 @@ class AttentionModel(fluid.dygraph.Layer): loss = fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_sum(loss) return loss - + elif self.mode in ['beam_search']: enc_outputs = self.tile_beam_merge_with_batch(enc_outputs) enc_padding_mask = self.tile_beam_merge_with_batch(enc_padding_mask) batch_beam_shape = (self.batch_size, self.beam_size) vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size)) - start_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_start_token, dtype='int64')) - end_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_end_token, dtype='int64')) + start_token_tensor = to_variable( + np.full( + batch_beam_shape, self.beam_start_token, dtype='int64')) + end_token_tensor = to_variable( + np.full( + batch_beam_shape, self.beam_end_token, dtype='int64')) step_input = self.tar_embeder(start_token_tensor) - input_feed = to_variable(np.zeros((self.batch_size, self.hidden_size), dtype='float32')) + input_feed = to_variable( + np.zeros( + (self.batch_size, self.hidden_size), dtype='float32')) input_feed = self._expand_to_beam_size(input_feed) input_feed = self._merge_batch_beams(input_feed) - beam_finished = to_variable(np.full(batch_beam_shape, 0, dtype='float32')) - beam_state_log_probs = to_variable(np.array([[0.] + [-self.kinf] * (self.beam_size - 1)], dtype="float32")) - beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, [self.batch_size, 1]) - + beam_finished = to_variable( + np.full( + batch_beam_shape, 0, dtype='float32')) + beam_state_log_probs = to_variable( + np.array( + [[0.] 
+ [-self.kinf] * (self.beam_size - 1)], + dtype="float32")) + beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, + [self.batch_size, 1]) + dec_hidden, dec_cell = enc_states[-1] - dec_hidden = [self._expand_to_beam_size(state) for state in dec_hidden] + dec_hidden = [ + self._expand_to_beam_size(state) for state in dec_hidden + ] dec_cell = [self._expand_to_beam_size(state) for state in dec_cell] - + batch_pos = fluid.layers.expand( - fluid.layers.unsqueeze(to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), [1]), + fluid.layers.unsqueeze( + to_variable( + np.arange( + 0, self.batch_size, 1, dtype="int64")), [1]), [1, self.beam_size]) predicted_ids = [] parent_ids = [] @@ -296,11 +338,16 @@ class AttentionModel(fluid.dygraph.Layer): step_input = self._merge_batch_beams(step_input) step_input = fluid.layers.concat([step_input, input_feed], 1) new_dec_hidden, new_dec_cell = [], [] - dec_hidden = [self._merge_batch_beams(state) for state in dec_hidden] - dec_cell = [self._merge_batch_beams(state) for state in dec_cell] + dec_hidden = [ + self._merge_batch_beams(state) for state in dec_hidden + ] + dec_cell = [ + self._merge_batch_beams(state) for state in dec_cell + ] for i in range(self.num_layers): - new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i]) + new_hidden, new_cell = self.dec_units[i]( + step_input, dec_hidden[i], dec_cell[i]) new_dec_hidden.append(new_hidden) new_dec_cell.append(new_cell) if self.dropout != None and self.dropout > 0.0: @@ -310,17 +357,23 @@ class AttentionModel(fluid.dygraph.Layer): dropout_implementation='upscale_in_train') else: step_input = new_hidden - dec_att = self.attention(step_input, enc_outputs, enc_padding_mask) + dec_att = self.attention(step_input, enc_outputs, + enc_padding_mask) dec_att = fluid.layers.squeeze(dec_att, [1]) concat_att_out = fluid.layers.concat([dec_att, step_input], 1) out = self.concat_fc(concat_att_out) input_feed = out cell_outputs = self._split_batch_beams(out) cell_outputs = self.fc(cell_outputs) - step_log_probs = fluid.layers.log(fluid.layers.softmax(cell_outputs)) + step_log_probs = fluid.layers.log( + fluid.layers.softmax(cell_outputs)) noend_array = [-self.kinf] * self.tar_vocab_size - noend_array[self.beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...] - noend_mask_tensor = to_variable(np.array(noend_array,dtype='float32')) + noend_array[ + self. + beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...] 
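# The finished-beam masking built from `noend_array` here, sketched with NumPy
# (vocab size, end-token id and the two-beam layout are toy values): once a
# beam has emitted the end token, its step log-probs are overwritten so that
# only the end token remains a valid continuation (log-prob 0, everything else
# -kinf), which freezes the finished hypothesis's score in later top-k steps.
import numpy as np

kinf = 1e9
vocab_size, end_token = 5, 3
step_log_probs = np.log(np.full((2, vocab_size), 1.0 / vocab_size))  # 2 beams
finished = np.array([0.0, 1.0])  # second beam already finished

noend_mask = np.full((vocab_size,), -kinf)
noend_mask[end_token] = 0.0  # [-kinf, ..., 0 at the end token, ..., -kinf]

masked = (finished[:, None] * noend_mask
          - step_log_probs * (finished[:, None] - 1.0))
print(masked[0])  # unfinished beam: original log-probs
print(masked[1])  # finished beam: -kinf everywhere except 0 at the end token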
+ noend_mask_tensor = to_variable( + np.array( + noend_array, dtype='float32')) # set finished position to one-hot probability of step_log_probs = fluid.layers.elementwise_mul( fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]), @@ -328,20 +381,38 @@ class AttentionModel(fluid.dygraph.Layer): fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0) log_probs = fluid.layers.elementwise_add( x=step_log_probs, y=beam_state_log_probs, axis=0) - scores = fluid.layers.reshape(log_probs, [-1, self.beam_size * self.tar_vocab_size]) - topk_scores, topk_indices = fluid.layers.topk(input=scores, k=self.beam_size) - beam_indices = fluid.layers.elementwise_floordiv(topk_indices, vocab_size_tensor) # in which beam - token_indices = fluid.layers.elementwise_mod(topk_indices, vocab_size_tensor) # position in beam - next_log_probs = self._gather(scores, topk_indices, batch_pos) # - - new_dec_hidden = [self._split_batch_beams(state) for state in new_dec_hidden] - new_dec_cell = [self._split_batch_beams(state) for state in new_dec_cell] - new_dec_hidden = [self._gather(x, beam_indices, batch_pos) for x in new_dec_hidden] - new_dec_cell = [self._gather(x, beam_indices, batch_pos) for x in new_dec_cell] - - next_finished = self._gather(beam_finished, beam_indices, batch_pos) + scores = fluid.layers.reshape( + log_probs, [-1, self.beam_size * self.tar_vocab_size]) + topk_scores, topk_indices = fluid.layers.topk( + input=scores, k=self.beam_size) + beam_indices = fluid.layers.elementwise_floordiv( + topk_indices, vocab_size_tensor) # in which beam + token_indices = fluid.layers.elementwise_mod( + topk_indices, vocab_size_tensor) # position in beam + next_log_probs = self._gather(scores, topk_indices, + batch_pos) # + + new_dec_hidden = [ + self._split_batch_beams(state) for state in new_dec_hidden + ] + new_dec_cell = [ + self._split_batch_beams(state) for state in new_dec_cell + ] + new_dec_hidden = [ + self._gather(x, beam_indices, batch_pos) + for x in new_dec_hidden + ] + new_dec_cell = [ + self._gather(x, beam_indices, batch_pos) + for x in new_dec_cell + ] + + next_finished = self._gather(beam_finished, beam_indices, + batch_pos) next_finished = fluid.layers.cast(next_finished, "bool") - next_finished = fluid.layers.logical_or(next_finished, fluid.layers.equal(token_indices, end_token_tensor)) + next_finished = fluid.layers.logical_or( + next_finished, + fluid.layers.equal(token_indices, end_token_tensor)) next_finished = fluid.layers.cast(next_finished, "float32") # prepare for next step dec_hidden, dec_cell = new_dec_hidden, new_dec_cell diff --git a/dygraph/seq2seq/base_model.py b/dygraph/seq2seq/base_model.py old mode 100755 new mode 100644 index 6319223502cebd32369b65b8ae60d6569b9ad48a..028df514960e7aefa4d9f5c6efd41e323c15cc4d --- a/dygraph/seq2seq/base_model.py +++ b/dygraph/seq2seq/base_model.py @@ -30,6 +30,7 @@ alpha = 0.6 uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x) zero_constant = fluid.initializer.Constant(0.0) + class BaseModel(fluid.dygraph.Layer): def __init__(self, hidden_size, @@ -77,35 +78,38 @@ class BaseModel(fluid.dygraph.Layer): self.enc_units = [] for i in range(num_layers): self.enc_units.append( - self.add_sublayer("enc_units_%d" % i, + self.add_sublayer( + "enc_units_%d" % i, BasicLSTMUnit( - hidden_size=self.hidden_size, - input_size=self.hidden_size, - param_attr=param_attr, - bias_attr=bias_attr, - forget_bias=forget_bias))) + hidden_size=self.hidden_size, + 
input_size=self.hidden_size, + param_attr=param_attr, + bias_attr=bias_attr, + forget_bias=forget_bias))) self.dec_units = [] for i in range(num_layers): self.dec_units.append( - self.add_sublayer("dec_units_%d" % i, + self.add_sublayer( + "dec_units_%d" % i, BasicLSTMUnit( - hidden_size=self.hidden_size, - input_size=self.hidden_size, - param_attr=param_attr, - bias_attr=bias_attr, - forget_bias=forget_bias))) - - self.fc = fluid.dygraph.nn.Linear(self.hidden_size, - self.tar_vocab_size, - param_attr=param_attr, - bias_attr=False) + hidden_size=self.hidden_size, + input_size=self.hidden_size, + param_attr=param_attr, + bias_attr=bias_attr, + forget_bias=forget_bias))) + + self.fc = fluid.dygraph.nn.Linear( + self.hidden_size, + self.tar_vocab_size, + param_attr=param_attr, + bias_attr=False) def _transpose_batch_time(self, x): return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape)))) def _merge_batch_beams(self, x): - return fluid.layers.reshape(x, shape=(-1,x.shape[2])) + return fluid.layers.reshape(x, shape=(-1, x.shape[2])) def _split_batch_beams(self, x): return fluid.layers.reshape(x, shape=(-1, self.beam_size, x.shape[1])) @@ -135,11 +139,18 @@ class BaseModel(fluid.dygraph.Layer): self.batch_size = src.shape[0] src_emb = self.src_embeder(self._transpose_batch_time(src)) - enc_hidden = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32')) - enc_cell = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32')) + enc_hidden = to_variable( + np.zeros( + (self.num_layers, self.batch_size, self.hidden_size), + dtype='float32')) + enc_cell = to_variable( + np.zeros( + (self.num_layers, self.batch_size, self.hidden_size), + dtype='float32')) max_seq_len = src_emb.shape[0] - enc_len_mask = fluid.layers.sequence_mask(src_sequence_length, maxlen=max_seq_len, dtype="float32") + enc_len_mask = fluid.layers.sequence_mask( + src_sequence_length, maxlen=max_seq_len, dtype="float32") enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0]) enc_states = [[enc_hidden, enc_cell]] for l in range(max_seq_len): @@ -148,7 +159,8 @@ class BaseModel(fluid.dygraph.Layer): enc_hidden, enc_cell = enc_states[l] new_enc_hidden, new_enc_cell = [], [] for i in range(self.num_layers): - new_hidden, new_cell = self.enc_units[i](step_input, enc_hidden[i], enc_cell[i]) + new_hidden, new_cell = self.enc_units[i]( + step_input, enc_hidden[i], enc_cell[i]) new_enc_hidden.append(new_hidden) new_enc_cell.append(new_cell) if self.dropout != None and self.dropout > 0.0: @@ -158,10 +170,16 @@ class BaseModel(fluid.dygraph.Layer): dropout_implementation='upscale_in_train') else: step_input = new_hidden - new_enc_hidden = [self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) for i in range(self.num_layers)] - new_enc_cell = [self._real_state(enc_cell[i], new_enc_cell[i], step_mask) for i in range(self.num_layers)] + new_enc_hidden = [ + self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) + for i in range(self.num_layers) + ] + new_enc_cell = [ + self._real_state(enc_cell[i], new_enc_cell[i], step_mask) + for i in range(self.num_layers) + ] enc_states.append([new_enc_hidden, new_enc_cell]) - + if self.mode in ['train', 'eval']: dec_hidden, dec_cell = enc_states[-1] tar_emb = self.tar_embeder(self._transpose_batch_time(tar)) @@ -172,7 +190,8 @@ class BaseModel(fluid.dygraph.Layer): step_input = tar_emb[step_idx] new_dec_hidden, new_dec_cell = [], [] for i in range(self.num_layers): - new_hidden, new_cell = 
self.dec_units[i](step_input, dec_hidden[i], dec_cell[i]) + new_hidden, new_cell = self.dec_units[i]( + step_input, dec_hidden[i], dec_cell[i]) new_dec_hidden.append(new_hidden) new_dec_cell.append(new_cell) if self.dropout != None and self.dropout > 0.0: @@ -187,9 +206,9 @@ class BaseModel(fluid.dygraph.Layer): dec_output = fluid.layers.stack(dec_output) dec_output = self.fc(self._transpose_batch_time(dec_output)) - + loss = fluid.layers.softmax_with_cross_entropy( - logits=dec_output, label=label, soft_label=False) + logits=dec_output, label=label, soft_label=False) loss = fluid.layers.squeeze(loss, axes=[2]) max_tar_seq_len = fluid.layers.shape(tar)[1] tar_mask = fluid.layers.sequence_mask( @@ -202,19 +221,34 @@ class BaseModel(fluid.dygraph.Layer): batch_beam_shape = (self.batch_size, self.beam_size) #batch_beam_shape_1 = (self.batch_size, self.beam_size, 1) vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size)) - start_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_start_token, dtype='int64')) - end_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_end_token, dtype='int64')) + start_token_tensor = to_variable( + np.full( + batch_beam_shape, self.beam_start_token, dtype='int64')) + end_token_tensor = to_variable( + np.full( + batch_beam_shape, self.beam_end_token, dtype='int64')) step_input = self.tar_embeder(start_token_tensor) - beam_finished = to_variable(np.full(batch_beam_shape, 0, dtype='float32')) - beam_state_log_probs = to_variable(np.array([[0.] + [-self.kinf] * (self.beam_size - 1)], dtype="float32")) - beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, [self.batch_size, 1]) - + beam_finished = to_variable( + np.full( + batch_beam_shape, 0, dtype='float32')) + beam_state_log_probs = to_variable( + np.array( + [[0.] 
+ [-self.kinf] * (self.beam_size - 1)], + dtype="float32")) + beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, + [self.batch_size, 1]) + dec_hidden, dec_cell = enc_states[-1] - dec_hidden = [self._expand_to_beam_size(state) for state in dec_hidden] + dec_hidden = [ + self._expand_to_beam_size(state) for state in dec_hidden + ] dec_cell = [self._expand_to_beam_size(state) for state in dec_cell] - + batch_pos = fluid.layers.expand( - fluid.layers.unsqueeze(to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), [1]), + fluid.layers.unsqueeze( + to_variable( + np.arange( + 0, self.batch_size, 1, dtype="int64")), [1]), [1, self.beam_size]) predicted_ids = [] parent_ids = [] @@ -224,11 +258,16 @@ class BaseModel(fluid.dygraph.Layer): break step_input = self._merge_batch_beams(step_input) new_dec_hidden, new_dec_cell = [], [] - dec_hidden = [self._merge_batch_beams(state) for state in dec_hidden] - dec_cell = [self._merge_batch_beams(state) for state in dec_cell] + dec_hidden = [ + self._merge_batch_beams(state) for state in dec_hidden + ] + dec_cell = [ + self._merge_batch_beams(state) for state in dec_cell + ] for i in range(self.num_layers): - new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i]) + new_hidden, new_cell = self.dec_units[i]( + step_input, dec_hidden[i], dec_cell[i]) new_dec_hidden.append(new_hidden) new_dec_cell.append(new_cell) if self.dropout != None and self.dropout > 0.0: @@ -239,12 +278,17 @@ class BaseModel(fluid.dygraph.Layer): else: step_input = new_hidden cell_outputs = self._split_batch_beams(step_input) - cell_outputs = self.fc(cell_outputs) + cell_outputs = self.fc(cell_outputs) # Beam_search_step: - step_log_probs = fluid.layers.log(fluid.layers.softmax(cell_outputs)) + step_log_probs = fluid.layers.log( + fluid.layers.softmax(cell_outputs)) noend_array = [-self.kinf] * self.tar_vocab_size - noend_array[self.beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...] - noend_mask_tensor = to_variable(np.array(noend_array,dtype='float32')) + noend_array[ + self. + beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...] 
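# How the flattened top-k used in this beam-search step recovers which parent
# beam and which token each candidate came from (batch, beam and vocab sizes
# are toy values): after reshaping to [batch, beam_size * vocab_size], a flat
# index decomposes as beam_index * vocab_size + token_index, so floor division
# and modulo by the vocabulary size split it back apart, mirroring the
# elementwise_floordiv / elementwise_mod calls below.
import numpy as np

batch, beam_size, vocab_size = 1, 3, 4
log_probs = np.random.randn(batch, beam_size, vocab_size)
scores = log_probs.reshape(batch, beam_size * vocab_size)

topk_indices = np.argsort(-scores, axis=1)[:, :beam_size]  # best scores first
beam_indices = topk_indices // vocab_size   # which parent beam to extend
token_indices = topk_indices % vocab_size   # which vocabulary entry was chosen
print(beam_indices, token_indices)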
+ noend_mask_tensor = to_variable( + np.array( + noend_array, dtype='float32')) # set finished position to one-hot probability of step_log_probs = fluid.layers.elementwise_mul( fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]), @@ -252,26 +296,45 @@ class BaseModel(fluid.dygraph.Layer): fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0) log_probs = fluid.layers.elementwise_add( x=step_log_probs, y=beam_state_log_probs, axis=0) - scores = fluid.layers.reshape(log_probs, [-1, self.beam_size * self.tar_vocab_size]) - topk_scores, topk_indices = fluid.layers.topk(input=scores, k=self.beam_size) - beam_indices = fluid.layers.elementwise_floordiv(topk_indices, vocab_size_tensor) # in which beam - token_indices = fluid.layers.elementwise_mod(topk_indices, vocab_size_tensor) # position in beam - next_log_probs = self._gather(scores, topk_indices, batch_pos) # + scores = fluid.layers.reshape( + log_probs, [-1, self.beam_size * self.tar_vocab_size]) + topk_scores, topk_indices = fluid.layers.topk( + input=scores, k=self.beam_size) + beam_indices = fluid.layers.elementwise_floordiv( + topk_indices, vocab_size_tensor) # in which beam + token_indices = fluid.layers.elementwise_mod( + topk_indices, vocab_size_tensor) # position in beam + next_log_probs = self._gather(scores, topk_indices, + batch_pos) # + + new_dec_hidden = [ + self._split_batch_beams(state) for state in new_dec_hidden + ] + new_dec_cell = [ + self._split_batch_beams(state) for state in new_dec_cell + ] + new_dec_hidden = [ + self._gather(x, beam_indices, batch_pos) + for x in new_dec_hidden + ] + new_dec_cell = [ + self._gather(x, beam_indices, batch_pos) + for x in new_dec_cell + ] - new_dec_hidden = [self._split_batch_beams(state) for state in new_dec_hidden] - new_dec_cell = [self._split_batch_beams(state) for state in new_dec_cell] - new_dec_hidden = [self._gather(x, beam_indices, batch_pos) for x in new_dec_hidden] - new_dec_cell = [self._gather(x, beam_indices, batch_pos) for x in new_dec_cell] - - next_finished = self._gather(beam_finished, beam_indices, batch_pos) + next_finished = self._gather(beam_finished, beam_indices, + batch_pos) next_finished = fluid.layers.cast(next_finished, "bool") - next_finished = fluid.layers.logical_or(next_finished, fluid.layers.equal(token_indices, end_token_tensor)) + next_finished = fluid.layers.logical_or( + next_finished, + fluid.layers.equal(token_indices, end_token_tensor)) next_finished = fluid.layers.cast(next_finished, "float32") # prepare for next step dec_hidden, dec_cell = new_dec_hidden, new_dec_cell beam_finished = next_finished beam_state_log_probs = next_log_probs - step_input = self.tar_embeder(token_indices) # remove unsqueeze in v1.7 + step_input = self.tar_embeder( + token_indices) # remove unsqueeze in v1.7 predicted_ids.append(token_indices) parent_ids.append(beam_indices) diff --git a/dygraph/seq2seq/download.py b/dygraph/seq2seq/download.py old mode 100755 new mode 100644 diff --git a/dygraph/seq2seq/infer.py b/dygraph/seq2seq/infer.py old mode 100755 new mode 100644 index 43fbf965ce949e54fa0f0c3e69470533b3a0ab83..37ab9445c07703a14a8b17604858caee975ae1fa --- a/dygraph/seq2seq/infer.py +++ b/dygraph/seq2seq/infer.py @@ -74,7 +74,7 @@ def infer(): src_vocab_size, tar_vocab_size, batch_size, - beam_size = args.beam_size, + beam_size=args.beam_size, num_layers=num_layers, init_scale=init_scale, dropout=0.0, @@ -85,7 +85,7 @@ def infer(): src_vocab_size, tar_vocab_size, batch_size, - beam_size = args.beam_size, 
+ beam_size=args.beam_size, num_layers=num_layers, init_scale=init_scale, dropout=0.0, @@ -97,17 +97,17 @@ def infer(): infer_data = reader.raw_mono_data(source_vocab_file, infer_file) def prepare_input(batch, epoch_id=0): - src_ids, src_mask, tar_ids, tar_mask = batch - res = {} - src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1])) - in_tar = tar_ids[:, :-1] - label_tar = tar_ids[:, 1:] - - in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1])) - label_tar = label_tar.reshape( - (label_tar.shape[0], label_tar.shape[1], 1)) - inputs = [src_ids, in_tar, label_tar, src_mask, tar_mask] - return inputs, np.sum(tar_mask) + src_ids, src_mask, tar_ids, tar_mask = batch + res = {} + src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1])) + in_tar = tar_ids[:, :-1] + label_tar = tar_ids[:, 1:] + + in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1])) + label_tar = label_tar.reshape( + (label_tar.shape[0], label_tar.shape[1], 1)) + inputs = [src_ids, in_tar, label_tar, src_mask, tar_mask] + return inputs, np.sum(tar_mask) dir_name = args.reload_model print("dir name", dir_name) @@ -115,7 +115,8 @@ def infer(): model.set_dict(state_dict) model.eval() - train_data_iter = reader.get_data_iter(infer_data, batch_size, mode='infer') + train_data_iter = reader.get_data_iter( + infer_data, batch_size, mode='infer') tar_id2vocab = [] tar_vocab_file = args.vocab_prefix + "." + args.tar_lang diff --git a/dygraph/seq2seq/infer.sh b/dygraph/seq2seq/infer.sh old mode 100755 new mode 100644 diff --git a/dygraph/seq2seq/reader.py b/dygraph/seq2seq/reader.py old mode 100755 new mode 100644 index 4f27560722a839c6bc51b3302fe1cc6b36064b65..c083b646a0de1c0c6be184a202b9aba11b2df986 --- a/dygraph/seq2seq/reader.py +++ b/dygraph/seq2seq/reader.py @@ -191,7 +191,6 @@ def get_data_iter(raw_data, new_cache = b_src else: new_cache = sorted(b_src, key=lambda k: len(k[0])) - for i in range(cache_num): batch_data = new_cache[i * batch_size:(i + 1) * batch_size] @@ -212,7 +211,7 @@ def get_data_iter(raw_data, for i in range(cache_num): batch_end = min(len(new_cache), (i + 1) * batch_size) - batch_data = new_cache[i * batch_size: batch_end] + batch_data = new_cache[i * batch_size:batch_end] src_cache = [w[0] for w in batch_data] tar_cache = [w[1] for w in batch_data] src_ids, src_mask = to_pad_np(src_cache, source=True) diff --git a/dygraph/seq2seq/run.sh b/dygraph/seq2seq/run.sh old mode 100755 new mode 100644 diff --git a/dygraph/seq2seq/train.py b/dygraph/seq2seq/train.py old mode 100755 new mode 100644 index 7ac862a6061ca171f75983be5a36192a32404a11..60ab8247c5e1dcdf3e4a7eb83380b9ce7e13469d --- a/dygraph/seq2seq/train.py +++ b/dygraph/seq2seq/train.py @@ -88,9 +88,14 @@ def main(): lr = args.learning_rate opt_type = args.optimizer if opt_type == "sgd": - optimizer = fluid.optimizer.SGD(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip) + optimizer = fluid.optimizer.SGD(lr, + parameter_list=model.parameters(), + grad_clip=gloabl_norm_clip) elif opt_type == "adam": - optimizer = fluid.optimizer.Adam(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip) + optimizer = fluid.optimizer.Adam( + lr, + parameter_list=model.parameters(), + grad_clip=gloabl_norm_clip) else: print("only support [sgd|adam]") raise Exception("opt type not support") @@ -103,8 +108,8 @@ def main(): tar_lang = args.tar_lang print("begin to load data") raw_data = reader.raw_data(src_lang, tar_lang, vocab_prefix, - train_data_prefix, eval_data_prefix, - test_data_prefix, args.max_len) + 
train_data_prefix, eval_data_prefix, + test_data_prefix, args.max_len) print("finished load data") train_data, valid_data, test_data, _ = raw_data @@ -128,8 +133,7 @@ def main(): total_loss = 0.0 word_count = 0.0 for batch_id, batch in enumerate(eval_data_iter): - input_data_feed, word_num = prepare_input( - batch, epoch_id) + input_data_feed, word_num = prepare_input(batch, epoch_id) loss = model(input_data_feed) total_loss += loss * batch_size @@ -142,8 +146,9 @@ def main(): ce_ppl = [] max_epoch = args.max_epoch for epoch_id in range(max_epoch): + epoch_start = time.time() + model.train() - start_time = time.time() if args.enable_ce: train_data_iter = reader.get_data_iter( train_data, batch_size, enable_ce=True) @@ -153,8 +158,11 @@ def main(): total_loss = 0 word_count = 0.0 batch_times = [] + + batch_start = time.time() for batch_id, batch in enumerate(train_data_iter): - batch_start_time = time.time() + batch_reader_end = time.time() + input_data_feed, word_num = prepare_input( batch, epoch_id=epoch_id) word_count += word_num @@ -164,28 +172,28 @@ def main(): optimizer.minimize(loss) model.clear_gradients() total_loss += loss * batch_size - batch_end_time = time.time() - batch_time = batch_end_time - batch_start_time - batch_times.append(batch_time) + train_batch_cost = time.time() - batch_start + batch_times.append(train_batch_cost) if batch_id > 0 and batch_id % 100 == 0: - print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f" % - (epoch_id, batch_id, batch_time, - np.exp(total_loss.numpy() / word_count))) + print( + "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, batch_cost: %.5f s, reader_cost: %.5f s" + % (epoch_id, batch_id, np.exp(total_loss.numpy() / + word_count), + train_batch_cost, batch_reader_end - batch_start)) ce_ppl.append(np.exp(total_loss.numpy() / word_count)) total_loss = 0.0 word_count = 0.0 + batch_start = time.time() - end_time = time.time() - epoch_time = end_time - start_time + train_epoch_cost = time.time() - epoch_start print( - "\nTrain epoch:[%d]; Epoch Time: %.5f; avg_time: %.5f s/step\n" - % (epoch_id, epoch_time, sum(batch_times) / len(batch_times))) - ce_time.append(epoch_time) + "\nTrain epoch:[%d]; epoch_cost: %.5f s; avg_batch_cost: %.5f s/step\n" + % (epoch_id, train_epoch_cost, + sum(batch_times) / len(batch_times))) + ce_time.append(train_epoch_cost) - - dir_name = os.path.join(args.model_path, - "epoch_" + str(epoch_id)) + dir_name = os.path.join(args.model_path, "epoch_" + str(epoch_id)) print("begin to save", dir_name) paddle.fluid.save_dygraph(model.state_dict(), dir_name) print("save finished") diff --git a/dygraph/transformer/reader.py b/dygraph/transformer/reader.py index 0ac62a1b7d0e651f11ab18a40e165e5b496e10d6..3faf7f014dd9318a6eedae05a66c74c852894935 100644 --- a/dygraph/transformer/reader.py +++ b/dygraph/transformer/reader.py @@ -396,7 +396,9 @@ class DataProcessor(object): def _load_lines(self, fpattern, tar_fname): fpaths = glob.glob(fpattern) - assert len(fpaths) > 0, "no matching file to the provided data path" + assert len( + fpaths + ) > 0, "No matching file to the provided data path, for pattern %s." 
% fpattern if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]): if tar_fname is None: diff --git a/dygraph/transformer/train.py b/dygraph/transformer/train.py index 75cbb2772103a0467d833b2544eaf1f4cd4f2610..109de2625a5771219e5e5dbd44621ff87ef43a33 100644 --- a/dygraph/transformer/train.py +++ b/dygraph/transformer/train.py @@ -33,29 +33,30 @@ from model import Transformer, CrossEntropyCriterion, NoamDecay def do_train(args): if args.use_cuda: trainer_count = fluid.dygraph.parallel.Env().nranks - place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id - ) if trainer_count > 1 else fluid.CUDAPlace(0) + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env( + ).dev_id) if trainer_count > 1 else fluid.CUDAPlace(0) else: trainer_count = 1 place = fluid.CPUPlace() # define the data generator - processor = reader.DataProcessor(fpattern=args.training_file, - src_vocab_fpath=args.src_vocab_fpath, - trg_vocab_fpath=args.trg_vocab_fpath, - token_delimiter=args.token_delimiter, - use_token_batch=args.use_token_batch, - batch_size=args.batch_size, - device_count=trainer_count, - pool_size=args.pool_size, - sort_type=args.sort_type, - shuffle=args.shuffle, - shuffle_batch=args.shuffle_batch, - start_mark=args.special_token[0], - end_mark=args.special_token[1], - unk_mark=args.special_token[2], - max_length=args.max_length, - n_head=args.n_head) + processor = reader.DataProcessor( + fpattern=args.training_file, + src_vocab_fpath=args.src_vocab_fpath, + trg_vocab_fpath=args.trg_vocab_fpath, + token_delimiter=args.token_delimiter, + use_token_batch=args.use_token_batch, + batch_size=args.batch_size, + device_count=trainer_count, + pool_size=args.pool_size, + sort_type=args.sort_type, + shuffle=args.shuffle, + shuffle_batch=args.shuffle_batch, + start_mark=args.special_token[0], + end_mark=args.special_token[1], + unk_mark=args.special_token[2], + max_length=args.max_length, + n_head=args.n_head) batch_generator = processor.data_generator(phase="train") if args.validation_file: val_processor = reader.DataProcessor( @@ -131,31 +132,30 @@ def do_train(args): if trainer_count > 1: strategy = fluid.dygraph.parallel.prepare_context() - transformer = fluid.dygraph.parallel.DataParallel( - transformer, strategy) + transformer = fluid.dygraph.parallel.DataParallel(transformer, + strategy) # the best cross-entropy value with label smoothing loss_normalizer = -( (1. - args.label_smooth_eps) * np.log( - (1. - args.label_smooth_eps)) + - args.label_smooth_eps * np.log(args.label_smooth_eps / - (args.trg_vocab_size - 1) + 1e-20)) + (1. 
- args.label_smooth_eps)) + args.label_smooth_eps * + np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) ce_time = [] ce_ppl = [] step_idx = 0 - #NOTE: used for benchmark - total_batch_num = 0 - # train loop for pass_id in range(args.epoch): - pass_start_time = time.time() + epoch_start = time.time() + batch_id = 0 + batch_start = time.time() for input_data in train_loader(): - if args.max_iter and total_batch_num == args.max_iter: #NOTE: used for benchmark + if args.max_iter and step_idx == args.max_iter: #NOTE: used for benchmark return - batch_start = time.time() + batch_reader_end = time.time() + (src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight) = input_data @@ -163,8 +163,8 @@ def do_train(args): trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias) - sum_cost, avg_cost, token_num = criterion( - logits, lbl_word, lbl_weight) + sum_cost, avg_cost, token_num = criterion(logits, lbl_word, + lbl_weight) if trainer_count > 1: avg_cost = transformer.scale_loss(avg_cost) @@ -184,19 +184,19 @@ def do_train(args): "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "normalized loss: %f, ppl: %f" % (step_idx, pass_id, batch_id, total_avg_cost, - total_avg_cost - loss_normalizer, - np.exp([min(total_avg_cost, 100)]))) - avg_batch_time = time.time() + total_avg_cost - loss_normalizer, + np.exp([min(total_avg_cost, 100)]))) else: + train_avg_batch_cost = args.print_step / ( + time.time() - batch_start) logging.info( "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " - "normalized loss: %f, ppl: %f, speed: %.2f step/s" % - (step_idx, pass_id, batch_id, total_avg_cost, - total_avg_cost - loss_normalizer, - np.exp([min(total_avg_cost, 100)]), - args.print_step / (time.time() - avg_batch_time))) - avg_batch_time = time.time() - + "normalized loss: %f, ppl: %f, avg_speed: %.2f step/s" + % (step_idx, pass_id, batch_id, total_avg_cost, + total_avg_cost - loss_normalizer, + np.exp([min(total_avg_cost, 100)]), + train_avg_batch_cost)) + batch_start = time.time() if step_idx % args.save_step == 0 and step_idx != 0: # validation @@ -208,10 +208,9 @@ def do_train(args): (src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight) = input_data - logits = transformer(src_word, src_pos, - src_slf_attn_bias, trg_word, - trg_pos, trg_slf_attn_bias, - trg_src_attn_bias) + logits = transformer( + src_word, src_pos, src_slf_attn_bias, trg_word, + trg_pos, trg_slf_attn_bias, trg_src_attn_bias) sum_cost, avg_cost, token_num = criterion( logits, lbl_word, lbl_weight) total_sum_cost += sum_cost.numpy() @@ -225,8 +224,8 @@ def do_train(args): transformer.train() if args.save_model and ( - trainer_count == 1 - or fluid.dygraph.parallel.Env().dev_id == 0): + trainer_count == 1 or + fluid.dygraph.parallel.Env().dev_id == 0): model_dir = os.path.join(args.save_model, "step_" + str(step_idx)) if not os.path.exists(model_dir): @@ -239,11 +238,12 @@ def do_train(args): os.path.join(model_dir, "transformer")) batch_id += 1 - total_batch_num = total_batch_num + 1 step_idx += 1 - time_consumed = time.time() - pass_start_time - ce_time.append(time_consumed) + train_epoch_cost = time.time() - epoch_start + ce_time.append(train_epoch_cost) + logging.info("train epoch: %d, epoch_cost: %.5f s" % + (pass_id, train_epoch_cost)) if args.save_model: model_dir = os.path.join(args.save_model, "step_final")
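# The `loss_normalizer` above is the lowest cross-entropy reachable once the
# targets are label-smoothed, so the logged "normalized loss" approaches zero
# for a perfect fit. A small standalone check of the same formula (eps and the
# vocabulary size are example values, not the script's arguments):
import numpy as np

def best_ce_with_label_smoothing(eps, vocab_size):
    # the smoothed target puts 1 - eps on the true token and spreads eps
    # uniformly over the remaining vocab_size - 1 tokens; its entropy is the
    # minimum achievable cross-entropy
    return -((1.0 - eps) * np.log(1.0 - eps)
             + eps * np.log(eps / (vocab_size - 1) + 1e-20))

print(best_ce_with_label_smoothing(0.1, 32000))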