Unverified commit 5c6ddeda authored by Yiqun Liu, committed by GitHub

[dygraph] Polish the timing method and log of some dygraph models. (#4699)

Parent 20d1e9bf
......@@ -47,8 +47,11 @@ def eval(net, test_data_loader, eop):
t_last = 0
place_num = paddle.fluid.core.get_cuda_device_count(
) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
batch_start = time.time()
for img, label in test_data_loader():
t1 = time.time()
batch_reader_end = time.time()
label = to_variable(label.numpy().astype('int64').reshape(
int(args.batch_size // place_num), 1))
out = net(img)
......@@ -57,15 +60,19 @@ def eval(net, test_data_loader, eop):
avg_loss = fluid.layers.mean(x=loss)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
t2 = time.time()
print( "test | epoch id: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec read_t:%2.4f" % \
(eop, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), t2 - t1 , t1 - t_last))
sys.stdout.flush()
total_loss += avg_loss.numpy()
total_acc1 += acc_top1.numpy()
total_acc5 += acc_top5.numpy()
test_batch_cost = time.time() - batch_start
total_sample += 1
t_last = time.time()
print(
"test | epoch %d, avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f s, reader_cost: %.5f s"
% (eop, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(),
test_batch_cost, batch_reader_end - batch_start))
sys.stdout.flush()
batch_start = time.time()
print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \
(total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample))
......@@ -148,9 +155,12 @@ def train_mobilenet():
# 4. train loop
total_batch_num = 0 #this is for benchmark
for eop in range(args.num_epochs):
epoch_start = time.time()
if num_trainers > 1:
imagenet_reader.set_shuffle_seed(eop + (
args.random_seed if args.random_seed else 0))
net.train()
total_loss = 0.0
total_acc1 = 0.0
......@@ -158,24 +168,23 @@ def train_mobilenet():
total_sample = 0
batch_id = 0
t_last = 0
# 4.1 for each batch, call net() , backward(), and minimize()
batch_start = time.time()
for img, label in train_data_loader():
t1 = time.time()
if args.max_iter and total_batch_num == args.max_iter:
return
t_start = time.time()
batch_reader_end = time.time()
# 4.1.1 call net()
out = net(img)
t_end = time.time()
softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(
input=softmax_out, label=label)
avg_loss = fluid.layers.mean(x=loss)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
t_start_back = time.time()
batch_net_end = time.time()
# 4.1.2 call backward()
if args.use_data_parallel:
......@@ -184,37 +193,43 @@ def train_mobilenet():
net.apply_collective_grads()
else:
avg_loss.backward()
t_end_back = time.time()
batch_backward_end = time.time()
# 4.1.3 call minimize()
optimizer.minimize(avg_loss)
net.clear_gradients()
t2 = time.time()
train_batch_elapse = t2 - t1
if batch_id % args.print_step == 0:
print( "epoch id: %d, batch step: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec net_t:%2.4f back_t:%2.4f read_t:%2.4f" % \
(eop, batch_id, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), train_batch_elapse,
t_end - t_start, t_end_back - t_start_back, t1 - t_last))
sys.stdout.flush()
total_loss += avg_loss.numpy()
total_acc1 += acc_top1.numpy()
total_acc5 += acc_top5.numpy()
total_sample += 1
batch_id += 1
t_last = time.time()
# NOTE: used for benchmark
train_batch_cost = time.time() - batch_start
total_batch_num = total_batch_num + 1
if batch_id % args.print_step == 0:
print(
"[Epoch %d, batch %d], avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f s, net_t: %.5f s, backward_t: %.5f s, reader_t: %.5f s"
% (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(),
acc_top5.numpy(), train_batch_cost,
batch_net_end - batch_reader_end,
batch_backward_end - batch_net_end,
batch_reader_end - batch_start))
sys.stdout.flush()
batch_start = time.time()
if args.ce:
print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
(eop, batch_id, total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
train_epoch_cost = time.time() - epoch_start
print(
"[Epoch %d], loss %.5f, acc1 %.5f, acc5 %.5f, epoch_cost: %.5f s"
% (eop, total_loss / total_sample, total_acc1 / total_sample,
total_acc5 / total_sample, train_epoch_cost))
# 4.2 save checkpoint
save_parameters = (not args.use_data_parallel) or (
......
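Note: the MobileNet hunks above replace the old t1/t2 variables with four timestamps per iteration (batch_start, batch_reader_end, batch_net_end, batch_backward_end) and derive the reader, forward and backward costs from their differences. A minimal sketch of that pattern follows; `data_loader`, `forward`, `backward` and `minimize` are hypothetical stand-ins for the model-specific pieces, not names from the patch.

```python
import time

def timed_train_loop(data_loader, forward, backward, minimize, print_step=10):
    """Sketch of the four-timestamp scheme used in the MobileNet script.

    `forward`, `backward`, and `minimize` are hypothetical callables standing
    in for net(img)+loss, avg_loss.backward(), and optimizer.minimize().
    """
    batch_start = time.time()
    for batch_id, batch in enumerate(data_loader):
        batch_reader_end = time.time()       # data loading finished
        avg_loss = forward(batch)
        batch_net_end = time.time()          # forward pass finished
        backward(avg_loss)
        batch_backward_end = time.time()     # backward pass finished
        minimize(avg_loss)

        train_batch_cost = time.time() - batch_start
        if batch_id % print_step == 0:
            print("batch %d, batch_cost: %.5f s, net_t: %.5f s, "
                  "backward_t: %.5f s, reader_t: %.5f s" %
                  (batch_id, train_batch_cost,
                   batch_net_end - batch_reader_end,
                   batch_backward_end - batch_net_end,
                   batch_reader_end - batch_start))
        batch_start = time.time()            # next batch's reader cost starts here
```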
......@@ -32,9 +32,6 @@ import time
from args import *
#import fluid.clip as clip
#from fluid.clip import *
import sys
if sys.version[0] == '2':
reload(sys)
......@@ -386,9 +383,11 @@ def train_ptb_lm():
ce_time = []
ce_ppl = []
total_batch_num = 0 #this is for benchmark
for epoch_id in range(max_epoch):
epoch_start = time.time()
ptb_model.train()
total_loss = 0.0
iters = 0.0
......@@ -405,11 +404,11 @@ def train_ptb_lm():
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
start_time = time.time()
batch_start = time.time()
for batch_id, batch in enumerate(train_data_loader):
if args.max_iter and total_batch_num == args.max_iter:
return
batch_start = time.time()
x, y = batch
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
......@@ -423,23 +422,26 @@ def train_ptb_lm():
ptb_model.clear_gradients()
total_loss += out_loss
batch_end = time.time()
train_batch_cost = batch_end - batch_start
iters += num_steps
total_batch_num = total_batch_num + 1 #this is for benchmark
total_batch_num = total_batch_num + 1 #this is for benchmark
train_batch_cost = time.time() - batch_start
if batch_id > 0 and batch_id % log_interval == 0:
ppl = np.exp(total_loss / iters)
print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch cost: %.5f" %
(epoch_id, batch_id, ppl[0],
sgd._global_learning_rate().numpy(), out_loss, train_batch_cost))
print(
"-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f s"
% (epoch_id, batch_id, ppl[0],
sgd._global_learning_rate().numpy(), out_loss,
train_batch_cost))
batch_start = time.time()
print("one epoch finished", epoch_id)
print("time cost ", time.time() - start_time)
ppl = np.exp(total_loss / iters)
ce_time.append(time.time() - start_time)
train_epoch_cost = time.time() - epoch_start
print("-- Epoch:[%d]; ppl: %.5f, epoch_cost: %.5f s" %
(epoch_id, ppl[0], train_epoch_cost))
ce_time.append(train_epoch_cost)
ce_ppl.append(ppl[0])
print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))
if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
# for bad init, after first epoch, the loss is over 1000
......
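Note: in the PTB LM hunks, the cumulative start_time log is replaced by an explicit epoch_start, while perplexity remains the exponential of the mean per-step loss. A small sketch of that summary step, with made-up numbers, is shown below; `epoch_summary` is a hypothetical helper, not part of the patch.

```python
import time
import numpy as np

def epoch_summary(total_loss, iters, epoch_start):
    """Perplexity is exp(mean loss per step); epoch_cost is wall time
    since epoch_start, as in the new per-epoch log."""
    ppl = np.exp(total_loss / iters)
    train_epoch_cost = time.time() - epoch_start
    return ppl, train_epoch_cost

# Hypothetical usage with placeholder numbers:
start = time.time()
ppl, cost = epoch_summary(total_loss=np.array([4200.0]), iters=1000.0,
                          epoch_start=start)
print("-- Epoch:[0]; ppl: %.5f, epoch_cost: %.5f s" % (ppl[0], cost))
```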
......@@ -190,7 +190,10 @@ class ImageNetReader:
full_lines = [line.strip() for line in flist]
if mode != "test" and len(full_lines) < settings.batch_size:
print(
"Warning: The number of the whole data ({}) is smaller than the batch_size ({}), and drop_last is turnning on, so nothing will feed in program, Terminated now. Please reset batch_size to a smaller number or feed more data!"
"Warning: The number of the whole data ({}) is smaller "
"than the batch_size ({}), and drop_last is turnning on, "
"so nothing will feed in program, Terminated now. Please "
"reset batch_size to a smaller number or feed more data!"
.format(len(full_lines), settings.batch_size))
os._exit(1)
if shuffle:
......
......@@ -331,8 +331,6 @@ def eval(model, data):
label.stop_gradient = True
out = model(img)
#loss = fluid.layers.cross_entropy(input=out, label=label)
#avg_loss = fluid.layers.mean(x=loss)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
......@@ -344,7 +342,6 @@ def eval(model, data):
total_acc5 += acc_top5.numpy()
total_sample += 1
# print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out))
if batch_id % 10 == 0:
print("test | batch step %d, acc1 %0.3f acc5 %0.3f" % \
( batch_id, total_acc1 / total_sample, total_acc5 / total_sample))
......@@ -413,13 +410,11 @@ def train_resnet():
use_multiprocess=True)
test_loader.set_sample_list_generator(test_reader, places=place)
#file_name = './model/epoch_0.npz'
#model_data = np.load( file_name )
#NOTE: used in benchmark
total_batch_num = 0
for eop in range(epoch):
epoch_start = time.time()
resnet.train()
total_loss = 0.0
......@@ -427,17 +422,13 @@ def train_resnet():
total_acc5 = 0.0
total_sample = 0
#dict_state = resnet.state_dict()
#resnet.load_dict( model_data )
print("load finished")
batch_start = time.time()
for batch_id, data in enumerate(train_loader()):
#NOTE: used in benchmark
if args.max_iter and total_batch_num == args.max_iter:
return
batch_start = time.time()
train_reader_cost = time.time() - batch_start
img, label = data
label.stop_gradient = True
......@@ -461,26 +452,32 @@ def train_resnet():
optimizer.minimize(avg_loss)
resnet.clear_gradients()
batch_end = time.time()
train_batch_cost = batch_end - batch_start
total_loss += dy_out
total_acc1 += acc_top1.numpy()
total_acc5 += acc_top5.numpy()
total_sample += 1
train_batch_cost = time.time() - batch_start
total_batch_num = total_batch_num + 1 #this is for benchmark
#print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out))
if batch_id % 10 == 0:
print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f, batch cost: %.5f" % \
( eop, batch_id, total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample, train_batch_cost))
print(
"[Epoch %d, batch %d] loss %.5f, acc1 %.5f, acc5 %.5f, batch_cost: %.5f s, reader_cost: %.5f s"
% (eop, batch_id, total_loss / total_sample,
total_acc1 / total_sample, total_acc5 / total_sample,
train_batch_cost, train_reader_cost))
batch_start = time.time()
if args.ce:
print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
(eop, batch_id, total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample))
train_epoch_cost = time.time() - epoch_start
print(
"[Epoch %d], loss %.5f, acc1 %.5f, acc5 %.5f, epoch_cost: %.5f s"
% (eop, total_loss / total_sample, total_acc1 / total_sample,
total_acc5 / total_sample, train_epoch_cost))
resnet.eval()
eval(resnet, test_loader)
......
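Note: the ResNet hunks compute the reader cost directly as the time elapsed since the previous iteration ended, so the logged batch_cost includes it. A rough sketch of that decomposition (the data-loading and compute steps are left as comments, everything else is hypothetical scaffolding):

```python
import time

batch_start = time.time()
# (in the real loop, the DataLoader yields the next batch here)
train_reader_cost = time.time() - batch_start   # time spent waiting on data
# (forward, backward and optimizer.minimize run here)
train_batch_cost = time.time() - batch_start    # reader wait + compute
compute_cost = train_batch_cost - train_reader_cost
print("batch_cost: %.5f s = reader_cost %.5f s + compute %.5f s"
      % (train_batch_cost, train_reader_cost, compute_cost))
batch_start = time.time()                        # reset for the next batch
```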
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
......@@ -30,6 +30,7 @@ alpha = 0.6
uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x)
zero_constant = fluid.initializer.Constant(0.0)
class AttentionModel(fluid.dygraph.Layer):
def __init__(self,
hidden_size,
......@@ -79,55 +80,61 @@ class AttentionModel(fluid.dygraph.Layer):
self.enc_units = []
for i in range(num_layers):
self.enc_units.append(
self.add_sublayer("enc_units_%d" % i,
self.add_sublayer(
"enc_units_%d" % i,
BasicLSTMUnit(
hidden_size=self.hidden_size,
input_size=self.hidden_size,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
hidden_size=self.hidden_size,
input_size=self.hidden_size,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
self.dec_units = []
for i in range(num_layers):
if i == 0:
self.dec_units.append(
self.add_sublayer("dec_units_%d" % i,
self.add_sublayer(
"dec_units_%d" % i,
BasicLSTMUnit(
hidden_size=self.hidden_size,
input_size=self.hidden_size * 2,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
hidden_size=self.hidden_size,
input_size=self.hidden_size * 2,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
else:
self.dec_units.append(
self.add_sublayer("dec_units_%d" % i,
BasicLSTMUnit(
hidden_size=self.hidden_size,
input_size=self.hidden_size,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
self.fc = fluid.dygraph.nn.Linear(self.hidden_size,
self.tar_vocab_size,
param_attr=param_attr,
bias_attr=False)
self.attn_fc = fluid.dygraph.nn.Linear(self.hidden_size,
self.hidden_size,
param_attr=param_attr,
bias_attr=False)
self.concat_fc = fluid.dygraph.nn.Linear(2 * self.hidden_size,
self.hidden_size,
param_attr=param_attr,
bias_attr=False)
self.add_sublayer(
"dec_units_%d" % i,
BasicLSTMUnit(
hidden_size=self.hidden_size,
input_size=self.hidden_size,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
self.fc = fluid.dygraph.nn.Linear(
self.hidden_size,
self.tar_vocab_size,
param_attr=param_attr,
bias_attr=False)
self.attn_fc = fluid.dygraph.nn.Linear(
self.hidden_size,
self.hidden_size,
param_attr=param_attr,
bias_attr=False)
self.concat_fc = fluid.dygraph.nn.Linear(
2 * self.hidden_size,
self.hidden_size,
param_attr=param_attr,
bias_attr=False)
def _transpose_batch_time(self, x):
return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape))))
def _merge_batch_beams(self, x):
return fluid.layers.reshape(x, shape=(-1,x.shape[2]))
return fluid.layers.reshape(x, shape=(-1, x.shape[2]))
def tile_beam_merge_with_batch(self, x):
x = fluid.layers.unsqueeze(x, [1]) # [batch_size, 1, ...]
......@@ -135,7 +142,7 @@ class AttentionModel(fluid.dygraph.Layer):
expand_times[1] = self.beam_size
x = fluid.layers.expand(x, expand_times) # [batch_size, beam_size, ...]
x = fluid.layers.transpose(x, list(range(2, len(x.shape))) +
[0, 1]) # [..., batch_size, beam_size]
[0, 1]) # [..., batch_size, beam_size]
# use 0 to copy to avoid wrong shape
x = fluid.layers.reshape(
x, shape=[0] *
......@@ -159,7 +166,7 @@ class AttentionModel(fluid.dygraph.Layer):
new_state = fluid.layers.elementwise_mul(new_state, step_mask, axis=0) - \
fluid.layers.elementwise_mul(state, (step_mask - 1), axis=0)
return new_state
def _gather(self, x, indices, batch_pos):
topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
return fluid.layers.gather_nd(x, topk_coordinates)
......@@ -184,12 +191,19 @@ class AttentionModel(fluid.dygraph.Layer):
if src.shape[0] < self.batch_size:
self.batch_size = src.shape[0]
src_emb = self.src_embeder(self._transpose_batch_time(src))
enc_hidden = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32'))
enc_cell = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32'))
enc_hidden = to_variable(
np.zeros(
(self.num_layers, self.batch_size, self.hidden_size),
dtype='float32'))
enc_cell = to_variable(
np.zeros(
(self.num_layers, self.batch_size, self.hidden_size),
dtype='float32'))
max_seq_len = src_emb.shape[0]
enc_len_mask = fluid.layers.sequence_mask(src_sequence_length, maxlen=max_seq_len, dtype="float32")
enc_len_mask = fluid.layers.sequence_mask(
src_sequence_length, maxlen=max_seq_len, dtype="float32")
enc_padding_mask = (enc_len_mask - 1.0)
enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
enc_states = [[enc_hidden, enc_cell]]
......@@ -200,7 +214,8 @@ class AttentionModel(fluid.dygraph.Layer):
enc_hidden, enc_cell = enc_states[l]
new_enc_hidden, new_enc_cell = [], []
for i in range(self.num_layers):
new_hidden, new_cell = self.enc_units[i](step_input, enc_hidden[i], enc_cell[i])
new_hidden, new_cell = self.enc_units[i](
step_input, enc_hidden[i], enc_cell[i])
new_enc_hidden.append(new_hidden)
new_enc_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0:
......@@ -210,17 +225,25 @@ class AttentionModel(fluid.dygraph.Layer):
dropout_implementation='upscale_in_train')
else:
step_input = new_hidden
new_enc_hidden = [self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) for i in range(self.num_layers)]
new_enc_cell = [self._real_state(enc_cell[i], new_enc_cell[i], step_mask) for i in range(self.num_layers)]
new_enc_hidden = [
self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask)
for i in range(self.num_layers)
]
new_enc_cell = [
self._real_state(enc_cell[i], new_enc_cell[i], step_mask)
for i in range(self.num_layers)
]
enc_states.append([new_enc_hidden, new_enc_cell])
enc_outputs.append(step_input)
enc_outputs = fluid.layers.stack(enc_outputs)
enc_outputs = self._transpose_batch_time(enc_outputs)
if self.mode in ['train', 'eval']:
if self.mode in ['train', 'eval']:
# calculation with input_feed derives from paper: https://arxiv.org/pdf/1508.04025.pdf
input_feed = to_variable(np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
input_feed = to_variable(
np.zeros(
(self.batch_size, self.hidden_size), dtype='float32'))
dec_hidden, dec_cell = enc_states[-1]
tar_emb = self.tar_embeder(self._transpose_batch_time(tar))
max_seq_len = tar_emb.shape[0]
......@@ -231,7 +254,8 @@ class AttentionModel(fluid.dygraph.Layer):
step_input = fluid.layers.concat([step_input, input_feed], 1)
new_dec_hidden, new_dec_cell = [], []
for i in range(self.num_layers):
new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i])
new_hidden, new_cell = self.dec_units[i](
step_input, dec_hidden[i], dec_cell[i])
new_dec_hidden.append(new_hidden)
new_dec_cell.append(new_cell)
......@@ -243,7 +267,8 @@ class AttentionModel(fluid.dygraph.Layer):
else:
step_input = new_hidden
dec_att = self.attention(step_input, enc_outputs, enc_padding_mask)
dec_att = self.attention(step_input, enc_outputs,
enc_padding_mask)
dec_att = fluid.layers.squeeze(dec_att, [1])
concat_att_out = fluid.layers.concat([dec_att, step_input], 1)
out = self.concat_fc(concat_att_out)
......@@ -253,9 +278,9 @@ class AttentionModel(fluid.dygraph.Layer):
dec_output = fluid.layers.stack(dec_output)
dec_output = self.fc(self._transpose_batch_time(dec_output))
loss = fluid.layers.softmax_with_cross_entropy(
logits=dec_output, label=label, soft_label=False)
logits=dec_output, label=label, soft_label=False)
loss = fluid.layers.squeeze(loss, axes=[2])
max_tar_seq_len = fluid.layers.shape(tar)[1]
tar_mask = fluid.layers.sequence_mask(
......@@ -264,28 +289,45 @@ class AttentionModel(fluid.dygraph.Layer):
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = fluid.layers.reduce_sum(loss)
return loss
elif self.mode in ['beam_search']:
enc_outputs = self.tile_beam_merge_with_batch(enc_outputs)
enc_padding_mask = self.tile_beam_merge_with_batch(enc_padding_mask)
batch_beam_shape = (self.batch_size, self.beam_size)
vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size))
start_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_start_token, dtype='int64'))
end_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_end_token, dtype='int64'))
start_token_tensor = to_variable(
np.full(
batch_beam_shape, self.beam_start_token, dtype='int64'))
end_token_tensor = to_variable(
np.full(
batch_beam_shape, self.beam_end_token, dtype='int64'))
step_input = self.tar_embeder(start_token_tensor)
input_feed = to_variable(np.zeros((self.batch_size, self.hidden_size), dtype='float32'))
input_feed = to_variable(
np.zeros(
(self.batch_size, self.hidden_size), dtype='float32'))
input_feed = self._expand_to_beam_size(input_feed)
input_feed = self._merge_batch_beams(input_feed)
beam_finished = to_variable(np.full(batch_beam_shape, 0, dtype='float32'))
beam_state_log_probs = to_variable(np.array([[0.] + [-self.kinf] * (self.beam_size - 1)], dtype="float32"))
beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, [self.batch_size, 1])
beam_finished = to_variable(
np.full(
batch_beam_shape, 0, dtype='float32'))
beam_state_log_probs = to_variable(
np.array(
[[0.] + [-self.kinf] * (self.beam_size - 1)],
dtype="float32"))
beam_state_log_probs = fluid.layers.expand(beam_state_log_probs,
[self.batch_size, 1])
dec_hidden, dec_cell = enc_states[-1]
dec_hidden = [self._expand_to_beam_size(state) for state in dec_hidden]
dec_hidden = [
self._expand_to_beam_size(state) for state in dec_hidden
]
dec_cell = [self._expand_to_beam_size(state) for state in dec_cell]
batch_pos = fluid.layers.expand(
fluid.layers.unsqueeze(to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), [1]),
fluid.layers.unsqueeze(
to_variable(
np.arange(
0, self.batch_size, 1, dtype="int64")), [1]),
[1, self.beam_size])
predicted_ids = []
parent_ids = []
......@@ -296,11 +338,16 @@ class AttentionModel(fluid.dygraph.Layer):
step_input = self._merge_batch_beams(step_input)
step_input = fluid.layers.concat([step_input, input_feed], 1)
new_dec_hidden, new_dec_cell = [], []
dec_hidden = [self._merge_batch_beams(state) for state in dec_hidden]
dec_cell = [self._merge_batch_beams(state) for state in dec_cell]
dec_hidden = [
self._merge_batch_beams(state) for state in dec_hidden
]
dec_cell = [
self._merge_batch_beams(state) for state in dec_cell
]
for i in range(self.num_layers):
new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i])
new_hidden, new_cell = self.dec_units[i](
step_input, dec_hidden[i], dec_cell[i])
new_dec_hidden.append(new_hidden)
new_dec_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0:
......@@ -310,17 +357,23 @@ class AttentionModel(fluid.dygraph.Layer):
dropout_implementation='upscale_in_train')
else:
step_input = new_hidden
dec_att = self.attention(step_input, enc_outputs, enc_padding_mask)
dec_att = self.attention(step_input, enc_outputs,
enc_padding_mask)
dec_att = fluid.layers.squeeze(dec_att, [1])
concat_att_out = fluid.layers.concat([dec_att, step_input], 1)
out = self.concat_fc(concat_att_out)
input_feed = out
cell_outputs = self._split_batch_beams(out)
cell_outputs = self.fc(cell_outputs)
step_log_probs = fluid.layers.log(fluid.layers.softmax(cell_outputs))
step_log_probs = fluid.layers.log(
fluid.layers.softmax(cell_outputs))
noend_array = [-self.kinf] * self.tar_vocab_size
noend_array[self.beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...]
noend_mask_tensor = to_variable(np.array(noend_array,dtype='float32'))
noend_array[
self.
beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...]
noend_mask_tensor = to_variable(
np.array(
noend_array, dtype='float32'))
# set finished position to one-hot probability of <eos>
step_log_probs = fluid.layers.elementwise_mul(
fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]),
......@@ -328,20 +381,38 @@ class AttentionModel(fluid.dygraph.Layer):
fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0)
log_probs = fluid.layers.elementwise_add(
x=step_log_probs, y=beam_state_log_probs, axis=0)
scores = fluid.layers.reshape(log_probs, [-1, self.beam_size * self.tar_vocab_size])
topk_scores, topk_indices = fluid.layers.topk(input=scores, k=self.beam_size)
beam_indices = fluid.layers.elementwise_floordiv(topk_indices, vocab_size_tensor) # in which beam
token_indices = fluid.layers.elementwise_mod(topk_indices, vocab_size_tensor) # position in beam
next_log_probs = self._gather(scores, topk_indices, batch_pos) #
new_dec_hidden = [self._split_batch_beams(state) for state in new_dec_hidden]
new_dec_cell = [self._split_batch_beams(state) for state in new_dec_cell]
new_dec_hidden = [self._gather(x, beam_indices, batch_pos) for x in new_dec_hidden]
new_dec_cell = [self._gather(x, beam_indices, batch_pos) for x in new_dec_cell]
next_finished = self._gather(beam_finished, beam_indices, batch_pos)
scores = fluid.layers.reshape(
log_probs, [-1, self.beam_size * self.tar_vocab_size])
topk_scores, topk_indices = fluid.layers.topk(
input=scores, k=self.beam_size)
beam_indices = fluid.layers.elementwise_floordiv(
topk_indices, vocab_size_tensor) # in which beam
token_indices = fluid.layers.elementwise_mod(
topk_indices, vocab_size_tensor) # position in beam
next_log_probs = self._gather(scores, topk_indices,
batch_pos) #
new_dec_hidden = [
self._split_batch_beams(state) for state in new_dec_hidden
]
new_dec_cell = [
self._split_batch_beams(state) for state in new_dec_cell
]
new_dec_hidden = [
self._gather(x, beam_indices, batch_pos)
for x in new_dec_hidden
]
new_dec_cell = [
self._gather(x, beam_indices, batch_pos)
for x in new_dec_cell
]
next_finished = self._gather(beam_finished, beam_indices,
batch_pos)
next_finished = fluid.layers.cast(next_finished, "bool")
next_finished = fluid.layers.logical_or(next_finished, fluid.layers.equal(token_indices, end_token_tensor))
next_finished = fluid.layers.logical_or(
next_finished,
fluid.layers.equal(token_indices, end_token_tensor))
next_finished = fluid.layers.cast(next_finished, "float32")
# prepare for next step
dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
......
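Note: the reformatted beam-search step above keeps the usual index arithmetic: after topk over the flattened (beam, vocab) scores, integer division recovers the source beam and the remainder recovers the token id. A toy NumPy check of that arithmetic, with hypothetical sizes and indices:

```python
import numpy as np

vocab_size, beam_size = 5, 3
# One row per batch element; columns index the flattened (beam, token) scores.
topk_indices = np.array([[7, 12, 3]])        # hypothetical topk result
beam_indices = topk_indices // vocab_size    # which source beam: [[1, 2, 0]]
token_indices = topk_indices % vocab_size    # which token id:    [[2, 2, 3]]
print(beam_indices, token_indices)
```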
......@@ -30,6 +30,7 @@ alpha = 0.6
uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x)
zero_constant = fluid.initializer.Constant(0.0)
class BaseModel(fluid.dygraph.Layer):
def __init__(self,
hidden_size,
......@@ -77,35 +78,38 @@ class BaseModel(fluid.dygraph.Layer):
self.enc_units = []
for i in range(num_layers):
self.enc_units.append(
self.add_sublayer("enc_units_%d" % i,
self.add_sublayer(
"enc_units_%d" % i,
BasicLSTMUnit(
hidden_size=self.hidden_size,
input_size=self.hidden_size,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
hidden_size=self.hidden_size,
input_size=self.hidden_size,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
self.dec_units = []
for i in range(num_layers):
self.dec_units.append(
self.add_sublayer("dec_units_%d" % i,
self.add_sublayer(
"dec_units_%d" % i,
BasicLSTMUnit(
hidden_size=self.hidden_size,
input_size=self.hidden_size,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
self.fc = fluid.dygraph.nn.Linear(self.hidden_size,
self.tar_vocab_size,
param_attr=param_attr,
bias_attr=False)
hidden_size=self.hidden_size,
input_size=self.hidden_size,
param_attr=param_attr,
bias_attr=bias_attr,
forget_bias=forget_bias)))
self.fc = fluid.dygraph.nn.Linear(
self.hidden_size,
self.tar_vocab_size,
param_attr=param_attr,
bias_attr=False)
def _transpose_batch_time(self, x):
return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape))))
def _merge_batch_beams(self, x):
return fluid.layers.reshape(x, shape=(-1,x.shape[2]))
return fluid.layers.reshape(x, shape=(-1, x.shape[2]))
def _split_batch_beams(self, x):
return fluid.layers.reshape(x, shape=(-1, self.beam_size, x.shape[1]))
......@@ -135,11 +139,18 @@ class BaseModel(fluid.dygraph.Layer):
self.batch_size = src.shape[0]
src_emb = self.src_embeder(self._transpose_batch_time(src))
enc_hidden = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32'))
enc_cell = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32'))
enc_hidden = to_variable(
np.zeros(
(self.num_layers, self.batch_size, self.hidden_size),
dtype='float32'))
enc_cell = to_variable(
np.zeros(
(self.num_layers, self.batch_size, self.hidden_size),
dtype='float32'))
max_seq_len = src_emb.shape[0]
enc_len_mask = fluid.layers.sequence_mask(src_sequence_length, maxlen=max_seq_len, dtype="float32")
enc_len_mask = fluid.layers.sequence_mask(
src_sequence_length, maxlen=max_seq_len, dtype="float32")
enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
enc_states = [[enc_hidden, enc_cell]]
for l in range(max_seq_len):
......@@ -148,7 +159,8 @@ class BaseModel(fluid.dygraph.Layer):
enc_hidden, enc_cell = enc_states[l]
new_enc_hidden, new_enc_cell = [], []
for i in range(self.num_layers):
new_hidden, new_cell = self.enc_units[i](step_input, enc_hidden[i], enc_cell[i])
new_hidden, new_cell = self.enc_units[i](
step_input, enc_hidden[i], enc_cell[i])
new_enc_hidden.append(new_hidden)
new_enc_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0:
......@@ -158,10 +170,16 @@ class BaseModel(fluid.dygraph.Layer):
dropout_implementation='upscale_in_train')
else:
step_input = new_hidden
new_enc_hidden = [self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) for i in range(self.num_layers)]
new_enc_cell = [self._real_state(enc_cell[i], new_enc_cell[i], step_mask) for i in range(self.num_layers)]
new_enc_hidden = [
self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask)
for i in range(self.num_layers)
]
new_enc_cell = [
self._real_state(enc_cell[i], new_enc_cell[i], step_mask)
for i in range(self.num_layers)
]
enc_states.append([new_enc_hidden, new_enc_cell])
if self.mode in ['train', 'eval']:
dec_hidden, dec_cell = enc_states[-1]
tar_emb = self.tar_embeder(self._transpose_batch_time(tar))
......@@ -172,7 +190,8 @@ class BaseModel(fluid.dygraph.Layer):
step_input = tar_emb[step_idx]
new_dec_hidden, new_dec_cell = [], []
for i in range(self.num_layers):
new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i])
new_hidden, new_cell = self.dec_units[i](
step_input, dec_hidden[i], dec_cell[i])
new_dec_hidden.append(new_hidden)
new_dec_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0:
......@@ -187,9 +206,9 @@ class BaseModel(fluid.dygraph.Layer):
dec_output = fluid.layers.stack(dec_output)
dec_output = self.fc(self._transpose_batch_time(dec_output))
loss = fluid.layers.softmax_with_cross_entropy(
logits=dec_output, label=label, soft_label=False)
logits=dec_output, label=label, soft_label=False)
loss = fluid.layers.squeeze(loss, axes=[2])
max_tar_seq_len = fluid.layers.shape(tar)[1]
tar_mask = fluid.layers.sequence_mask(
......@@ -202,19 +221,34 @@ class BaseModel(fluid.dygraph.Layer):
batch_beam_shape = (self.batch_size, self.beam_size)
#batch_beam_shape_1 = (self.batch_size, self.beam_size, 1)
vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size))
start_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_start_token, dtype='int64'))
end_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_end_token, dtype='int64'))
start_token_tensor = to_variable(
np.full(
batch_beam_shape, self.beam_start_token, dtype='int64'))
end_token_tensor = to_variable(
np.full(
batch_beam_shape, self.beam_end_token, dtype='int64'))
step_input = self.tar_embeder(start_token_tensor)
beam_finished = to_variable(np.full(batch_beam_shape, 0, dtype='float32'))
beam_state_log_probs = to_variable(np.array([[0.] + [-self.kinf] * (self.beam_size - 1)], dtype="float32"))
beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, [self.batch_size, 1])
beam_finished = to_variable(
np.full(
batch_beam_shape, 0, dtype='float32'))
beam_state_log_probs = to_variable(
np.array(
[[0.] + [-self.kinf] * (self.beam_size - 1)],
dtype="float32"))
beam_state_log_probs = fluid.layers.expand(beam_state_log_probs,
[self.batch_size, 1])
dec_hidden, dec_cell = enc_states[-1]
dec_hidden = [self._expand_to_beam_size(state) for state in dec_hidden]
dec_hidden = [
self._expand_to_beam_size(state) for state in dec_hidden
]
dec_cell = [self._expand_to_beam_size(state) for state in dec_cell]
batch_pos = fluid.layers.expand(
fluid.layers.unsqueeze(to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), [1]),
fluid.layers.unsqueeze(
to_variable(
np.arange(
0, self.batch_size, 1, dtype="int64")), [1]),
[1, self.beam_size])
predicted_ids = []
parent_ids = []
......@@ -224,11 +258,16 @@ class BaseModel(fluid.dygraph.Layer):
break
step_input = self._merge_batch_beams(step_input)
new_dec_hidden, new_dec_cell = [], []
dec_hidden = [self._merge_batch_beams(state) for state in dec_hidden]
dec_cell = [self._merge_batch_beams(state) for state in dec_cell]
dec_hidden = [
self._merge_batch_beams(state) for state in dec_hidden
]
dec_cell = [
self._merge_batch_beams(state) for state in dec_cell
]
for i in range(self.num_layers):
new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i])
new_hidden, new_cell = self.dec_units[i](
step_input, dec_hidden[i], dec_cell[i])
new_dec_hidden.append(new_hidden)
new_dec_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0:
......@@ -239,12 +278,17 @@ class BaseModel(fluid.dygraph.Layer):
else:
step_input = new_hidden
cell_outputs = self._split_batch_beams(step_input)
cell_outputs = self.fc(cell_outputs)
cell_outputs = self.fc(cell_outputs)
# Beam_search_step:
step_log_probs = fluid.layers.log(fluid.layers.softmax(cell_outputs))
step_log_probs = fluid.layers.log(
fluid.layers.softmax(cell_outputs))
noend_array = [-self.kinf] * self.tar_vocab_size
noend_array[self.beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...]
noend_mask_tensor = to_variable(np.array(noend_array,dtype='float32'))
noend_array[
self.
beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...]
noend_mask_tensor = to_variable(
np.array(
noend_array, dtype='float32'))
# set finished position to one-hot probability of <eos>
step_log_probs = fluid.layers.elementwise_mul(
fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]),
......@@ -252,26 +296,45 @@ class BaseModel(fluid.dygraph.Layer):
fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0)
log_probs = fluid.layers.elementwise_add(
x=step_log_probs, y=beam_state_log_probs, axis=0)
scores = fluid.layers.reshape(log_probs, [-1, self.beam_size * self.tar_vocab_size])
topk_scores, topk_indices = fluid.layers.topk(input=scores, k=self.beam_size)
beam_indices = fluid.layers.elementwise_floordiv(topk_indices, vocab_size_tensor) # in which beam
token_indices = fluid.layers.elementwise_mod(topk_indices, vocab_size_tensor) # position in beam
next_log_probs = self._gather(scores, topk_indices, batch_pos) #
scores = fluid.layers.reshape(
log_probs, [-1, self.beam_size * self.tar_vocab_size])
topk_scores, topk_indices = fluid.layers.topk(
input=scores, k=self.beam_size)
beam_indices = fluid.layers.elementwise_floordiv(
topk_indices, vocab_size_tensor) # in which beam
token_indices = fluid.layers.elementwise_mod(
topk_indices, vocab_size_tensor) # position in beam
next_log_probs = self._gather(scores, topk_indices,
batch_pos) #
new_dec_hidden = [
self._split_batch_beams(state) for state in new_dec_hidden
]
new_dec_cell = [
self._split_batch_beams(state) for state in new_dec_cell
]
new_dec_hidden = [
self._gather(x, beam_indices, batch_pos)
for x in new_dec_hidden
]
new_dec_cell = [
self._gather(x, beam_indices, batch_pos)
for x in new_dec_cell
]
new_dec_hidden = [self._split_batch_beams(state) for state in new_dec_hidden]
new_dec_cell = [self._split_batch_beams(state) for state in new_dec_cell]
new_dec_hidden = [self._gather(x, beam_indices, batch_pos) for x in new_dec_hidden]
new_dec_cell = [self._gather(x, beam_indices, batch_pos) for x in new_dec_cell]
next_finished = self._gather(beam_finished, beam_indices, batch_pos)
next_finished = self._gather(beam_finished, beam_indices,
batch_pos)
next_finished = fluid.layers.cast(next_finished, "bool")
next_finished = fluid.layers.logical_or(next_finished, fluid.layers.equal(token_indices, end_token_tensor))
next_finished = fluid.layers.logical_or(
next_finished,
fluid.layers.equal(token_indices, end_token_tensor))
next_finished = fluid.layers.cast(next_finished, "float32")
# prepare for next step
dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
beam_finished = next_finished
beam_state_log_probs = next_log_probs
step_input = self.tar_embeder(token_indices) # remove unsqueeze in v1.7
step_input = self.tar_embeder(
token_indices) # remove unsqueeze in v1.7
predicted_ids.append(token_indices)
parent_ids.append(beam_indices)
......
File mode changed from 100755 to 100644
......@@ -74,7 +74,7 @@ def infer():
src_vocab_size,
tar_vocab_size,
batch_size,
beam_size = args.beam_size,
beam_size=args.beam_size,
num_layers=num_layers,
init_scale=init_scale,
dropout=0.0,
......@@ -85,7 +85,7 @@ def infer():
src_vocab_size,
tar_vocab_size,
batch_size,
beam_size = args.beam_size,
beam_size=args.beam_size,
num_layers=num_layers,
init_scale=init_scale,
dropout=0.0,
......@@ -97,17 +97,17 @@ def infer():
infer_data = reader.raw_mono_data(source_vocab_file, infer_file)
def prepare_input(batch, epoch_id=0):
src_ids, src_mask, tar_ids, tar_mask = batch
res = {}
src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1]))
in_tar = tar_ids[:, :-1]
label_tar = tar_ids[:, 1:]
in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1]))
label_tar = label_tar.reshape(
(label_tar.shape[0], label_tar.shape[1], 1))
inputs = [src_ids, in_tar, label_tar, src_mask, tar_mask]
return inputs, np.sum(tar_mask)
src_ids, src_mask, tar_ids, tar_mask = batch
res = {}
src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1]))
in_tar = tar_ids[:, :-1]
label_tar = tar_ids[:, 1:]
in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1]))
label_tar = label_tar.reshape(
(label_tar.shape[0], label_tar.shape[1], 1))
inputs = [src_ids, in_tar, label_tar, src_mask, tar_mask]
return inputs, np.sum(tar_mask)
dir_name = args.reload_model
print("dir name", dir_name)
......@@ -115,7 +115,8 @@ def infer():
model.set_dict(state_dict)
model.eval()
train_data_iter = reader.get_data_iter(infer_data, batch_size, mode='infer')
train_data_iter = reader.get_data_iter(
infer_data, batch_size, mode='infer')
tar_id2vocab = []
tar_vocab_file = args.vocab_prefix + "." + args.tar_lang
......
File mode changed from 100755 to 100644
......@@ -191,7 +191,6 @@ def get_data_iter(raw_data,
new_cache = b_src
else:
new_cache = sorted(b_src, key=lambda k: len(k[0]))
for i in range(cache_num):
batch_data = new_cache[i * batch_size:(i + 1) * batch_size]
......@@ -212,7 +211,7 @@ def get_data_iter(raw_data,
for i in range(cache_num):
batch_end = min(len(new_cache), (i + 1) * batch_size)
batch_data = new_cache[i * batch_size: batch_end]
batch_data = new_cache[i * batch_size:batch_end]
src_cache = [w[0] for w in batch_data]
tar_cache = [w[1] for w in batch_data]
src_ids, src_mask = to_pad_np(src_cache, source=True)
......
File mode changed from 100755 to 100644
......@@ -88,9 +88,14 @@ def main():
lr = args.learning_rate
opt_type = args.optimizer
if opt_type == "sgd":
optimizer = fluid.optimizer.SGD(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip)
optimizer = fluid.optimizer.SGD(lr,
parameter_list=model.parameters(),
grad_clip=gloabl_norm_clip)
elif opt_type == "adam":
optimizer = fluid.optimizer.Adam(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip)
optimizer = fluid.optimizer.Adam(
lr,
parameter_list=model.parameters(),
grad_clip=gloabl_norm_clip)
else:
print("only support [sgd|adam]")
raise Exception("opt type not support")
......@@ -103,8 +108,8 @@ def main():
tar_lang = args.tar_lang
print("begin to load data")
raw_data = reader.raw_data(src_lang, tar_lang, vocab_prefix,
train_data_prefix, eval_data_prefix,
test_data_prefix, args.max_len)
train_data_prefix, eval_data_prefix,
test_data_prefix, args.max_len)
print("finished load data")
train_data, valid_data, test_data, _ = raw_data
......@@ -128,8 +133,7 @@ def main():
total_loss = 0.0
word_count = 0.0
for batch_id, batch in enumerate(eval_data_iter):
input_data_feed, word_num = prepare_input(
batch, epoch_id)
input_data_feed, word_num = prepare_input(batch, epoch_id)
loss = model(input_data_feed)
total_loss += loss * batch_size
......@@ -142,8 +146,9 @@ def main():
ce_ppl = []
max_epoch = args.max_epoch
for epoch_id in range(max_epoch):
epoch_start = time.time()
model.train()
start_time = time.time()
if args.enable_ce:
train_data_iter = reader.get_data_iter(
train_data, batch_size, enable_ce=True)
......@@ -153,8 +158,11 @@ def main():
total_loss = 0
word_count = 0.0
batch_times = []
batch_start = time.time()
for batch_id, batch in enumerate(train_data_iter):
batch_start_time = time.time()
batch_reader_end = time.time()
input_data_feed, word_num = prepare_input(
batch, epoch_id=epoch_id)
word_count += word_num
......@@ -164,28 +172,28 @@ def main():
optimizer.minimize(loss)
model.clear_gradients()
total_loss += loss * batch_size
batch_end_time = time.time()
batch_time = batch_end_time - batch_start_time
batch_times.append(batch_time)
train_batch_cost = time.time() - batch_start
batch_times.append(train_batch_cost)
if batch_id > 0 and batch_id % 100 == 0:
print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f" %
(epoch_id, batch_id, batch_time,
np.exp(total_loss.numpy() / word_count)))
print(
"-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, batch_cost: %.5f s, reader_cost: %.5f s"
% (epoch_id, batch_id, np.exp(total_loss.numpy() /
word_count),
train_batch_cost, batch_reader_end - batch_start))
ce_ppl.append(np.exp(total_loss.numpy() / word_count))
total_loss = 0.0
word_count = 0.0
batch_start = time.time()
end_time = time.time()
epoch_time = end_time - start_time
train_epoch_cost = time.time() - epoch_start
print(
"\nTrain epoch:[%d]; Epoch Time: %.5f; avg_time: %.5f s/step\n"
% (epoch_id, epoch_time, sum(batch_times) / len(batch_times)))
ce_time.append(epoch_time)
"\nTrain epoch:[%d]; epoch_cost: %.5f s; avg_batch_cost: %.5f s/step\n"
% (epoch_id, train_epoch_cost,
sum(batch_times) / len(batch_times)))
ce_time.append(train_epoch_cost)
dir_name = os.path.join(args.model_path,
"epoch_" + str(epoch_id))
dir_name = os.path.join(args.model_path, "epoch_" + str(epoch_id))
print("begin to save", dir_name)
paddle.fluid.save_dygraph(model.state_dict(), dir_name)
print("save finished")
......
......@@ -396,7 +396,9 @@ class DataProcessor(object):
def _load_lines(self, fpattern, tar_fname):
fpaths = glob.glob(fpattern)
assert len(fpaths) > 0, "no matching file to the provided data path"
assert len(
fpaths
) > 0, "No matching file to the provided data path, for pattern %s." % fpattern
if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]):
if tar_fname is None:
......
......@@ -33,29 +33,30 @@ from model import Transformer, CrossEntropyCriterion, NoamDecay
def do_train(args):
if args.use_cuda:
trainer_count = fluid.dygraph.parallel.Env().nranks
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id
) if trainer_count > 1 else fluid.CUDAPlace(0)
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
).dev_id) if trainer_count > 1 else fluid.CUDAPlace(0)
else:
trainer_count = 1
place = fluid.CPUPlace()
# define the data generator
processor = reader.DataProcessor(fpattern=args.training_file,
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
token_delimiter=args.token_delimiter,
use_token_batch=args.use_token_batch,
batch_size=args.batch_size,
device_count=trainer_count,
pool_size=args.pool_size,
sort_type=args.sort_type,
shuffle=args.shuffle,
shuffle_batch=args.shuffle_batch,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
max_length=args.max_length,
n_head=args.n_head)
processor = reader.DataProcessor(
fpattern=args.training_file,
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
token_delimiter=args.token_delimiter,
use_token_batch=args.use_token_batch,
batch_size=args.batch_size,
device_count=trainer_count,
pool_size=args.pool_size,
sort_type=args.sort_type,
shuffle=args.shuffle,
shuffle_batch=args.shuffle_batch,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
max_length=args.max_length,
n_head=args.n_head)
batch_generator = processor.data_generator(phase="train")
if args.validation_file:
val_processor = reader.DataProcessor(
......@@ -131,31 +132,30 @@ def do_train(args):
if trainer_count > 1:
strategy = fluid.dygraph.parallel.prepare_context()
transformer = fluid.dygraph.parallel.DataParallel(
transformer, strategy)
transformer = fluid.dygraph.parallel.DataParallel(transformer,
strategy)
# the best cross-entropy value with label smoothing
loss_normalizer = -(
(1. - args.label_smooth_eps) * np.log(
(1. - args.label_smooth_eps)) +
args.label_smooth_eps * np.log(args.label_smooth_eps /
(args.trg_vocab_size - 1) + 1e-20))
(1. - args.label_smooth_eps)) + args.label_smooth_eps *
np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
ce_time = []
ce_ppl = []
step_idx = 0
#NOTE: used for benchmark
total_batch_num = 0
# train loop
for pass_id in range(args.epoch):
pass_start_time = time.time()
epoch_start = time.time()
batch_id = 0
batch_start = time.time()
for input_data in train_loader():
if args.max_iter and total_batch_num == args.max_iter: #NOTE: used for benchmark
if args.max_iter and step_idx == args.max_iter: #NOTE: used for benchmark
return
batch_start = time.time()
batch_reader_end = time.time()
(src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word,
lbl_weight) = input_data
......@@ -163,8 +163,8 @@ def do_train(args):
trg_word, trg_pos, trg_slf_attn_bias,
trg_src_attn_bias)
sum_cost, avg_cost, token_num = criterion(
logits, lbl_word, lbl_weight)
sum_cost, avg_cost, token_num = criterion(logits, lbl_word,
lbl_weight)
if trainer_count > 1:
avg_cost = transformer.scale_loss(avg_cost)
......@@ -184,19 +184,19 @@ def do_train(args):
"step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
"normalized loss: %f, ppl: %f" %
(step_idx, pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)])))
avg_batch_time = time.time()
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)])))
else:
train_avg_batch_cost = args.print_step / (
time.time() - batch_start)
logging.info(
"step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
"normalized loss: %f, ppl: %f, speed: %.2f step/s" %
(step_idx, pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)]),
args.print_step / (time.time() - avg_batch_time)))
avg_batch_time = time.time()
"normalized loss: %f, ppl: %f, avg_speed: %.2f step/s"
% (step_idx, pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)]),
train_avg_batch_cost))
batch_start = time.time()
if step_idx % args.save_step == 0 and step_idx != 0:
# validation
......@@ -208,10 +208,9 @@ def do_train(args):
(src_word, src_pos, src_slf_attn_bias, trg_word,
trg_pos, trg_slf_attn_bias, trg_src_attn_bias,
lbl_word, lbl_weight) = input_data
logits = transformer(src_word, src_pos,
src_slf_attn_bias, trg_word,
trg_pos, trg_slf_attn_bias,
trg_src_attn_bias)
logits = transformer(
src_word, src_pos, src_slf_attn_bias, trg_word,
trg_pos, trg_slf_attn_bias, trg_src_attn_bias)
sum_cost, avg_cost, token_num = criterion(
logits, lbl_word, lbl_weight)
total_sum_cost += sum_cost.numpy()
......@@ -225,8 +224,8 @@ def do_train(args):
transformer.train()
if args.save_model and (
trainer_count == 1
or fluid.dygraph.parallel.Env().dev_id == 0):
trainer_count == 1 or
fluid.dygraph.parallel.Env().dev_id == 0):
model_dir = os.path.join(args.save_model,
"step_" + str(step_idx))
if not os.path.exists(model_dir):
......@@ -239,11 +238,12 @@ def do_train(args):
os.path.join(model_dir, "transformer"))
batch_id += 1
total_batch_num = total_batch_num + 1
step_idx += 1
time_consumed = time.time() - pass_start_time
ce_time.append(time_consumed)
train_epoch_cost = time.time() - epoch_start
ce_time.append(train_epoch_cost)
logging.info("train epoch: %d, epoch_cost: %.5f s" %
(pass_id, train_epoch_cost))
if args.save_model:
model_dir = os.path.join(args.save_model, "step_final")
......
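Note: the transformer hunks replace the ad-hoc avg_batch_time bookkeeping with a single batch_start reference, so the logged avg_speed is simply print_step divided by the elapsed wall time since the last logging point. A small sketch of that calculation; `log_speed` and the sleep stand-in are hypothetical:

```python
import time

def log_speed(print_step, batch_start):
    """Average steps per second over the last `print_step` batches,
    measured from the previous logging point."""
    train_avg_batch_cost = print_step / (time.time() - batch_start)
    print("avg_speed: %.2f step/s" % train_avg_batch_cost)
    return time.time()   # new reference point for the next window

# Hypothetical usage: call every `print_step` iterations and keep the
# returned timestamp as the start of the next window.
batch_start = time.time()
time.sleep(0.01)         # stand-in for print_step training iterations
batch_start = log_speed(print_step=100, batch_start=batch_start)
```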