Unverified commit 5c6ddeda, authored by Yiqun Liu, committed by GitHub

[dygraph] Polish the timing method and log of some dygraph models. (#4699)

Parent 20d1e9bf
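All of the training scripts touched below move from ad-hoc `t1`/`t2`/`t_last` timers to one shared pattern: reset `batch_start` at the end of each iteration, record `batch_reader_end` once the next batch arrives, and log `batch_cost` and `reader_cost` from those two timestamps. A minimal standalone sketch of that pattern (illustrative only, not code from this repository; `data_loader` and `train_step` are placeholders):

```python
import time

def train_loop(data_loader, train_step, log_interval=10):
    # batch_start marks the end of the previous iteration; everything until
    # the next batch arrives counts as reader (data-loading) time.
    batch_start = time.time()
    for batch_id, batch in enumerate(data_loader):
        batch_reader_end = time.time()
        train_step(batch)  # forward + backward + optimizer update
        train_batch_cost = time.time() - batch_start  # whole iteration
        if batch_id % log_interval == 0:
            print("batch %d, batch_cost: %.5f s, reader_cost: %.5f s" %
                  (batch_id, train_batch_cost, batch_reader_end - batch_start))
        batch_start = time.time()  # restart the clock for the next iteration
```

Measuring `reader_cost` from the end of the previous iteration, rather than from a fresh timer inside the loop body, is what lets the logs expose time spent waiting on the data loader.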
@@ -47,8 +47,11 @@ def eval(net, test_data_loader, eop):
     t_last = 0
     place_num = paddle.fluid.core.get_cuda_device_count(
     ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
+    batch_start = time.time()
     for img, label in test_data_loader():
-        t1 = time.time()
+        batch_reader_end = time.time()
         label = to_variable(label.numpy().astype('int64').reshape(
             int(args.batch_size // place_num), 1))
         out = net(img)
@@ -57,15 +60,19 @@ def eval(net, test_data_loader, eop):
         avg_loss = fluid.layers.mean(x=loss)
         acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
         acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-        t2 = time.time()
-        print( "test | epoch id: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec read_t:%2.4f" % \
-              (eop, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), t2 - t1 , t1 - t_last))
-        sys.stdout.flush()
         total_loss += avg_loss.numpy()
         total_acc1 += acc_top1.numpy()
         total_acc5 += acc_top5.numpy()
+        test_batch_cost = time.time() - batch_start
         total_sample += 1
-        t_last = time.time()
+        print(
+            "test | epoch %d, avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f s, reader_cost: %.5f s"
+            % (eop, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(),
+               test_batch_cost, batch_reader_end - batch_start))
+        sys.stdout.flush()
+        batch_start = time.time()
     print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \
           (total_loss / total_sample, \
            total_acc1 / total_sample, total_acc5 / total_sample))
@@ -148,9 +155,12 @@ def train_mobilenet():
         # 4. train loop
         total_batch_num = 0 #this is for benchmark
         for eop in range(args.num_epochs):
+            epoch_start = time.time()
             if num_trainers > 1:
                 imagenet_reader.set_shuffle_seed(eop + (
                     args.random_seed if args.random_seed else 0))
             net.train()
             total_loss = 0.0
             total_acc1 = 0.0
@@ -158,24 +168,23 @@ def train_mobilenet():
             total_sample = 0
             batch_id = 0
-            t_last = 0
             # 4.1 for each batch, call net() , backward(), and minimize()
+            batch_start = time.time()
             for img, label in train_data_loader():
-                t1 = time.time()
                 if args.max_iter and total_batch_num == args.max_iter:
                     return
-                t_start = time.time()
+                batch_reader_end = time.time()
                 # 4.1.1 call net()
                 out = net(img)
-                t_end = time.time()
                 softmax_out = fluid.layers.softmax(out, use_cudnn=False)
                 loss = fluid.layers.cross_entropy(
                     input=softmax_out, label=label)
                 avg_loss = fluid.layers.mean(x=loss)
                 acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                 acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-                t_start_back = time.time()
+                batch_net_end = time.time()
                 # 4.1.2 call backward()
                 if args.use_data_parallel:
@@ -184,37 +193,43 @@ def train_mobilenet():
                     net.apply_collective_grads()
                 else:
                     avg_loss.backward()
-                t_end_back = time.time()
+                batch_backward_end = time.time()
                 # 4.1.3 call minimize()
                 optimizer.minimize(avg_loss)
                 net.clear_gradients()
                 t2 = time.time()
-                train_batch_elapse = t2 - t1
-                if batch_id % args.print_step == 0:
-                    print( "epoch id: %d, batch step: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec net_t:%2.4f back_t:%2.4f read_t:%2.4f" % \
-                        (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy(), train_batch_elapse,
-                         t_end - t_start, t_end_back - t_start_back, t1 - t_last))
-                    sys.stdout.flush()
                 total_loss += avg_loss.numpy()
                 total_acc1 += acc_top1.numpy()
                 total_acc5 += acc_top5.numpy()
                 total_sample += 1
                 batch_id += 1
-                t_last = time.time()
                 # NOTE: used for benchmark
+                train_batch_cost = time.time() - batch_start
                 total_batch_num = total_batch_num + 1
+                if batch_id % args.print_step == 0:
+                    print(
+                        "[Epoch %d, batch %d], avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f s, net_t: %.5f s, backward_t: %.5f s, reader_t: %.5f s"
+                        % (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(),
+                           acc_top5.numpy(), train_batch_cost,
+                           batch_net_end - batch_reader_end,
+                           batch_backward_end - batch_net_end,
+                           batch_reader_end - batch_start))
+                    sys.stdout.flush()
+                batch_start = time.time()
             if args.ce:
                 print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
                 print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
                 print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
-            print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
-                  (eop, batch_id, total_loss / total_sample, \
-                   total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
+            train_epoch_cost = time.time() - epoch_start
+            print(
+                "[Epoch %d], loss %.5f, acc1 %.5f, acc5 %.5f, epoch_cost: %.5f s"
+                % (eop, total_loss / total_sample, total_acc1 / total_sample,
+                   total_acc5 / total_sample, train_epoch_cost))
            # 4.2 save checkpoint
             save_parameters = (not args.use_data_parallel) or (
...
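The classification train loop above additionally splits `batch_cost` into reader, forward, and backward spans using four timestamps. A hedged sketch of how those spans partition one iteration (`get_batch`, `forward`, and `backward_and_update` are placeholders, not this model's API):

```python
import time

def timed_step(get_batch, forward, backward_and_update):
    batch_start = time.time()
    batch = get_batch()                   # reader_t ends here
    batch_reader_end = time.time()
    out = forward(batch)                  # net_t ends here
    batch_net_end = time.time()
    backward_and_update(out)              # backward_t ends here
    batch_backward_end = time.time()
    return {
        "reader_t": batch_reader_end - batch_start,
        "net_t": batch_net_end - batch_reader_end,
        "backward_t": batch_backward_end - batch_net_end,
        "batch_cost": time.time() - batch_start,  # includes any trailing work
    }
```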
@@ -32,9 +32,6 @@ import time
 from args import *
-#import fluid.clip as clip
-#from fluid.clip import *
 import sys
 if sys.version[0] == '2':
     reload(sys)
@@ -386,9 +383,11 @@ def train_ptb_lm():
         ce_time = []
         ce_ppl = []
         total_batch_num = 0 #this is for benchmark
         for epoch_id in range(max_epoch):
+            epoch_start = time.time()
             ptb_model.train()
             total_loss = 0.0
             iters = 0.0
@@ -405,11 +404,11 @@
             init_hidden = to_variable(init_hidden_data)
             init_cell = to_variable(init_cell_data)
-            start_time = time.time()
+            batch_start = time.time()
             for batch_id, batch in enumerate(train_data_loader):
                 if args.max_iter and total_batch_num == args.max_iter:
                     return
-                batch_start = time.time()
                 x, y = batch
                 dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
@@ -423,23 +422,26 @@
                 ptb_model.clear_gradients()
                 total_loss += out_loss
-                batch_end = time.time()
-                train_batch_cost = batch_end - batch_start
                 iters += num_steps
                 total_batch_num = total_batch_num + 1 #this is for benchmark
+                train_batch_cost = time.time() - batch_start
                 if batch_id > 0 and batch_id % log_interval == 0:
                     ppl = np.exp(total_loss / iters)
-                    print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch cost: %.5f" %
-                          (epoch_id, batch_id, ppl[0],
-                           sgd._global_learning_rate().numpy(), out_loss, train_batch_cost))
+                    print(
+                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch_cost: %.5f s"
+                        % (epoch_id, batch_id, ppl[0],
+                           sgd._global_learning_rate().numpy(), out_loss,
+                           train_batch_cost))
+                batch_start = time.time()
-            print("one epoch finished", epoch_id)
-            print("time cost ", time.time() - start_time)
             ppl = np.exp(total_loss / iters)
-            ce_time.append(time.time() - start_time)
+            train_epoch_cost = time.time() - epoch_start
+            print("-- Epoch:[%d]; ppl: %.5f, epoch_cost: %.5f s" %
+                  (epoch_id, ppl[0], train_epoch_cost))
+            ce_time.append(train_epoch_cost)
             ce_ppl.append(ppl[0])
-            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))
             if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
                 # for bad init, after first epoch, the loss is over 1000
...
@@ -190,7 +190,10 @@ class ImageNetReader:
             full_lines = [line.strip() for line in flist]
             if mode != "test" and len(full_lines) < settings.batch_size:
                 print(
-                    "Warning: The number of the whole data ({}) is smaller than the batch_size ({}), and drop_last is turnning on, so nothing will feed in program, Terminated now. Please reset batch_size to a smaller number or feed more data!"
+                    "Warning: The number of the whole data ({}) is smaller "
+                    "than the batch_size ({}), and drop_last is turnning on, "
+                    "so nothing will feed in program, Terminated now. Please "
+                    "reset batch_size to a smaller number or feed more data!"
                     .format(len(full_lines), settings.batch_size))
                 os._exit(1)
             if shuffle:
...
@@ -331,8 +331,6 @@ def eval(model, data):
         label.stop_gradient = True
         out = model(img)
-        #loss = fluid.layers.cross_entropy(input=out, label=label)
-        #avg_loss = fluid.layers.mean(x=loss)
         acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
         acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
@@ -344,7 +342,6 @@ def eval(model, data):
         total_acc5 += acc_top5.numpy()
         total_sample += 1
-        # print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out))
         if batch_id % 10 == 0:
             print("test | batch step %d, acc1 %0.3f acc5 %0.3f" % \
                   ( batch_id, total_acc1 / total_sample, total_acc5 / total_sample))
@@ -413,13 +410,11 @@ def train_resnet():
             use_multiprocess=True)
         test_loader.set_sample_list_generator(test_reader, places=place)
-        #file_name = './model/epoch_0.npz'
-        #model_data = np.load( file_name )
         #NOTE: used in benchmark
         total_batch_num = 0
         for eop in range(epoch):
+            epoch_start = time.time()
             resnet.train()
             total_loss = 0.0
@@ -427,17 +422,13 @@ def train_resnet():
             total_acc5 = 0.0
             total_sample = 0
-            #dict_state = resnet.state_dict()
-            #resnet.load_dict( model_data )
-            print("load finished")
+            batch_start = time.time()
             for batch_id, data in enumerate(train_loader()):
                 #NOTE: used in benchmark
                 if args.max_iter and total_batch_num == args.max_iter:
                     return
-                batch_start = time.time()
+                train_reader_cost = time.time() - batch_start
                 img, label = data
                 label.stop_gradient = True
@@ -461,26 +452,32 @@ def train_resnet():
                 optimizer.minimize(avg_loss)
                 resnet.clear_gradients()
-                batch_end = time.time()
-                train_batch_cost = batch_end - batch_start
                 total_loss += dy_out
                 total_acc1 += acc_top1.numpy()
                 total_acc5 += acc_top5.numpy()
                 total_sample += 1
+                train_batch_cost = time.time() - batch_start
                 total_batch_num = total_batch_num + 1 #this is for benchmark
-                #print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out))
                 if batch_id % 10 == 0:
-                    print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f, batch cost: %.5f" % \
-                           ( eop, batch_id, total_loss / total_sample, \
-                             total_acc1 / total_sample, total_acc5 / total_sample, train_batch_cost))
+                    print(
+                        "[Epoch %d, batch %d] loss %.5f, acc1 %.5f, acc5 %.5f, batch_cost: %.5f s, reader_cost: %.5f s"
+                        % (eop, batch_id, total_loss / total_sample,
+                           total_acc1 / total_sample, total_acc5 / total_sample,
+                           train_batch_cost, train_reader_cost))
+                batch_start = time.time()
             if args.ce:
                 print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
                 print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
                 print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
-            print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
-                  (eop, batch_id, total_loss / total_sample, \
-                   total_acc1 / total_sample, total_acc5 / total_sample))
+            train_epoch_cost = time.time() - epoch_start
+            print(
+                "[Epoch %d], loss %.5f, acc1 %.5f, acc5 %.5f, epoch_cost: %.5f s"
+                % (eop, total_loss / total_sample, total_acc1 / total_sample,
+                   total_acc5 / total_sample, train_epoch_cost))
             resnet.eval()
             eval(resnet, test_loader)
...
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
...@@ -30,6 +30,7 @@ alpha = 0.6 ...@@ -30,6 +30,7 @@ alpha = 0.6
uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x) uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x)
zero_constant = fluid.initializer.Constant(0.0) zero_constant = fluid.initializer.Constant(0.0)
class AttentionModel(fluid.dygraph.Layer): class AttentionModel(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
hidden_size, hidden_size,
...@@ -79,55 +80,61 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -79,55 +80,61 @@ class AttentionModel(fluid.dygraph.Layer):
self.enc_units = [] self.enc_units = []
for i in range(num_layers): for i in range(num_layers):
self.enc_units.append( self.enc_units.append(
self.add_sublayer("enc_units_%d" % i, self.add_sublayer(
"enc_units_%d" % i,
BasicLSTMUnit( BasicLSTMUnit(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
input_size=self.hidden_size, input_size=self.hidden_size,
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
forget_bias=forget_bias))) forget_bias=forget_bias)))
self.dec_units = [] self.dec_units = []
for i in range(num_layers): for i in range(num_layers):
if i == 0: if i == 0:
self.dec_units.append( self.dec_units.append(
self.add_sublayer("dec_units_%d" % i, self.add_sublayer(
"dec_units_%d" % i,
BasicLSTMUnit( BasicLSTMUnit(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
input_size=self.hidden_size * 2, input_size=self.hidden_size * 2,
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
forget_bias=forget_bias))) forget_bias=forget_bias)))
else: else:
self.dec_units.append( self.dec_units.append(
self.add_sublayer("dec_units_%d" % i, self.add_sublayer(
BasicLSTMUnit( "dec_units_%d" % i,
hidden_size=self.hidden_size, BasicLSTMUnit(
input_size=self.hidden_size, hidden_size=self.hidden_size,
param_attr=param_attr, input_size=self.hidden_size,
bias_attr=bias_attr, param_attr=param_attr,
forget_bias=forget_bias))) bias_attr=bias_attr,
forget_bias=forget_bias)))
self.fc = fluid.dygraph.nn.Linear(self.hidden_size,
self.tar_vocab_size, self.fc = fluid.dygraph.nn.Linear(
param_attr=param_attr, self.hidden_size,
bias_attr=False) self.tar_vocab_size,
param_attr=param_attr,
self.attn_fc = fluid.dygraph.nn.Linear(self.hidden_size, bias_attr=False)
self.hidden_size,
param_attr=param_attr, self.attn_fc = fluid.dygraph.nn.Linear(
bias_attr=False) self.hidden_size,
self.hidden_size,
self.concat_fc = fluid.dygraph.nn.Linear(2 * self.hidden_size, param_attr=param_attr,
self.hidden_size, bias_attr=False)
param_attr=param_attr,
bias_attr=False) self.concat_fc = fluid.dygraph.nn.Linear(
2 * self.hidden_size,
self.hidden_size,
param_attr=param_attr,
bias_attr=False)
def _transpose_batch_time(self, x): def _transpose_batch_time(self, x):
return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape)))) return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape))))
def _merge_batch_beams(self, x): def _merge_batch_beams(self, x):
return fluid.layers.reshape(x, shape=(-1,x.shape[2])) return fluid.layers.reshape(x, shape=(-1, x.shape[2]))
def tile_beam_merge_with_batch(self, x): def tile_beam_merge_with_batch(self, x):
x = fluid.layers.unsqueeze(x, [1]) # [batch_size, 1, ...] x = fluid.layers.unsqueeze(x, [1]) # [batch_size, 1, ...]
...@@ -135,7 +142,7 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -135,7 +142,7 @@ class AttentionModel(fluid.dygraph.Layer):
expand_times[1] = self.beam_size expand_times[1] = self.beam_size
x = fluid.layers.expand(x, expand_times) # [batch_size, beam_size, ...] x = fluid.layers.expand(x, expand_times) # [batch_size, beam_size, ...]
x = fluid.layers.transpose(x, list(range(2, len(x.shape))) + x = fluid.layers.transpose(x, list(range(2, len(x.shape))) +
[0, 1]) # [..., batch_size, beam_size] [0, 1]) # [..., batch_size, beam_size]
# use 0 to copy to avoid wrong shape # use 0 to copy to avoid wrong shape
x = fluid.layers.reshape( x = fluid.layers.reshape(
x, shape=[0] * x, shape=[0] *
...@@ -159,7 +166,7 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -159,7 +166,7 @@ class AttentionModel(fluid.dygraph.Layer):
new_state = fluid.layers.elementwise_mul(new_state, step_mask, axis=0) - \ new_state = fluid.layers.elementwise_mul(new_state, step_mask, axis=0) - \
fluid.layers.elementwise_mul(state, (step_mask - 1), axis=0) fluid.layers.elementwise_mul(state, (step_mask - 1), axis=0)
return new_state return new_state
def _gather(self, x, indices, batch_pos): def _gather(self, x, indices, batch_pos):
topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2) topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
return fluid.layers.gather_nd(x, topk_coordinates) return fluid.layers.gather_nd(x, topk_coordinates)
...@@ -184,12 +191,19 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -184,12 +191,19 @@ class AttentionModel(fluid.dygraph.Layer):
if src.shape[0] < self.batch_size: if src.shape[0] < self.batch_size:
self.batch_size = src.shape[0] self.batch_size = src.shape[0]
src_emb = self.src_embeder(self._transpose_batch_time(src)) src_emb = self.src_embeder(self._transpose_batch_time(src))
enc_hidden = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32')) enc_hidden = to_variable(
enc_cell = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32')) np.zeros(
(self.num_layers, self.batch_size, self.hidden_size),
dtype='float32'))
enc_cell = to_variable(
np.zeros(
(self.num_layers, self.batch_size, self.hidden_size),
dtype='float32'))
max_seq_len = src_emb.shape[0] max_seq_len = src_emb.shape[0]
enc_len_mask = fluid.layers.sequence_mask(src_sequence_length, maxlen=max_seq_len, dtype="float32") enc_len_mask = fluid.layers.sequence_mask(
src_sequence_length, maxlen=max_seq_len, dtype="float32")
enc_padding_mask = (enc_len_mask - 1.0) enc_padding_mask = (enc_len_mask - 1.0)
enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0]) enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
enc_states = [[enc_hidden, enc_cell]] enc_states = [[enc_hidden, enc_cell]]
...@@ -200,7 +214,8 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -200,7 +214,8 @@ class AttentionModel(fluid.dygraph.Layer):
enc_hidden, enc_cell = enc_states[l] enc_hidden, enc_cell = enc_states[l]
new_enc_hidden, new_enc_cell = [], [] new_enc_hidden, new_enc_cell = [], []
for i in range(self.num_layers): for i in range(self.num_layers):
new_hidden, new_cell = self.enc_units[i](step_input, enc_hidden[i], enc_cell[i]) new_hidden, new_cell = self.enc_units[i](
step_input, enc_hidden[i], enc_cell[i])
new_enc_hidden.append(new_hidden) new_enc_hidden.append(new_hidden)
new_enc_cell.append(new_cell) new_enc_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0: if self.dropout != None and self.dropout > 0.0:
...@@ -210,17 +225,25 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -210,17 +225,25 @@ class AttentionModel(fluid.dygraph.Layer):
dropout_implementation='upscale_in_train') dropout_implementation='upscale_in_train')
else: else:
step_input = new_hidden step_input = new_hidden
new_enc_hidden = [self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) for i in range(self.num_layers)] new_enc_hidden = [
new_enc_cell = [self._real_state(enc_cell[i], new_enc_cell[i], step_mask) for i in range(self.num_layers)] self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask)
for i in range(self.num_layers)
]
new_enc_cell = [
self._real_state(enc_cell[i], new_enc_cell[i], step_mask)
for i in range(self.num_layers)
]
enc_states.append([new_enc_hidden, new_enc_cell]) enc_states.append([new_enc_hidden, new_enc_cell])
enc_outputs.append(step_input) enc_outputs.append(step_input)
enc_outputs = fluid.layers.stack(enc_outputs) enc_outputs = fluid.layers.stack(enc_outputs)
enc_outputs = self._transpose_batch_time(enc_outputs) enc_outputs = self._transpose_batch_time(enc_outputs)
if self.mode in ['train', 'eval']: if self.mode in ['train', 'eval']:
# calculation with input_feed derives from paper: https://arxiv.org/pdf/1508.04025.pdf # calculation with input_feed derives from paper: https://arxiv.org/pdf/1508.04025.pdf
input_feed = to_variable(np.zeros((self.batch_size, self.hidden_size), dtype='float32')) input_feed = to_variable(
np.zeros(
(self.batch_size, self.hidden_size), dtype='float32'))
dec_hidden, dec_cell = enc_states[-1] dec_hidden, dec_cell = enc_states[-1]
tar_emb = self.tar_embeder(self._transpose_batch_time(tar)) tar_emb = self.tar_embeder(self._transpose_batch_time(tar))
max_seq_len = tar_emb.shape[0] max_seq_len = tar_emb.shape[0]
...@@ -231,7 +254,8 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -231,7 +254,8 @@ class AttentionModel(fluid.dygraph.Layer):
step_input = fluid.layers.concat([step_input, input_feed], 1) step_input = fluid.layers.concat([step_input, input_feed], 1)
new_dec_hidden, new_dec_cell = [], [] new_dec_hidden, new_dec_cell = [], []
for i in range(self.num_layers): for i in range(self.num_layers):
new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i]) new_hidden, new_cell = self.dec_units[i](
step_input, dec_hidden[i], dec_cell[i])
new_dec_hidden.append(new_hidden) new_dec_hidden.append(new_hidden)
new_dec_cell.append(new_cell) new_dec_cell.append(new_cell)
...@@ -243,7 +267,8 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -243,7 +267,8 @@ class AttentionModel(fluid.dygraph.Layer):
else: else:
step_input = new_hidden step_input = new_hidden
dec_att = self.attention(step_input, enc_outputs, enc_padding_mask) dec_att = self.attention(step_input, enc_outputs,
enc_padding_mask)
dec_att = fluid.layers.squeeze(dec_att, [1]) dec_att = fluid.layers.squeeze(dec_att, [1])
concat_att_out = fluid.layers.concat([dec_att, step_input], 1) concat_att_out = fluid.layers.concat([dec_att, step_input], 1)
out = self.concat_fc(concat_att_out) out = self.concat_fc(concat_att_out)
...@@ -253,9 +278,9 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -253,9 +278,9 @@ class AttentionModel(fluid.dygraph.Layer):
dec_output = fluid.layers.stack(dec_output) dec_output = fluid.layers.stack(dec_output)
dec_output = self.fc(self._transpose_batch_time(dec_output)) dec_output = self.fc(self._transpose_batch_time(dec_output))
loss = fluid.layers.softmax_with_cross_entropy( loss = fluid.layers.softmax_with_cross_entropy(
logits=dec_output, label=label, soft_label=False) logits=dec_output, label=label, soft_label=False)
loss = fluid.layers.squeeze(loss, axes=[2]) loss = fluid.layers.squeeze(loss, axes=[2])
max_tar_seq_len = fluid.layers.shape(tar)[1] max_tar_seq_len = fluid.layers.shape(tar)[1]
tar_mask = fluid.layers.sequence_mask( tar_mask = fluid.layers.sequence_mask(
...@@ -264,28 +289,45 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -264,28 +289,45 @@ class AttentionModel(fluid.dygraph.Layer):
loss = fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = fluid.layers.reduce_sum(loss) loss = fluid.layers.reduce_sum(loss)
return loss return loss
elif self.mode in ['beam_search']: elif self.mode in ['beam_search']:
enc_outputs = self.tile_beam_merge_with_batch(enc_outputs) enc_outputs = self.tile_beam_merge_with_batch(enc_outputs)
enc_padding_mask = self.tile_beam_merge_with_batch(enc_padding_mask) enc_padding_mask = self.tile_beam_merge_with_batch(enc_padding_mask)
batch_beam_shape = (self.batch_size, self.beam_size) batch_beam_shape = (self.batch_size, self.beam_size)
vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size)) vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size))
start_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_start_token, dtype='int64')) start_token_tensor = to_variable(
end_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_end_token, dtype='int64')) np.full(
batch_beam_shape, self.beam_start_token, dtype='int64'))
end_token_tensor = to_variable(
np.full(
batch_beam_shape, self.beam_end_token, dtype='int64'))
step_input = self.tar_embeder(start_token_tensor) step_input = self.tar_embeder(start_token_tensor)
input_feed = to_variable(np.zeros((self.batch_size, self.hidden_size), dtype='float32')) input_feed = to_variable(
np.zeros(
(self.batch_size, self.hidden_size), dtype='float32'))
input_feed = self._expand_to_beam_size(input_feed) input_feed = self._expand_to_beam_size(input_feed)
input_feed = self._merge_batch_beams(input_feed) input_feed = self._merge_batch_beams(input_feed)
beam_finished = to_variable(np.full(batch_beam_shape, 0, dtype='float32')) beam_finished = to_variable(
beam_state_log_probs = to_variable(np.array([[0.] + [-self.kinf] * (self.beam_size - 1)], dtype="float32")) np.full(
beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, [self.batch_size, 1]) batch_beam_shape, 0, dtype='float32'))
beam_state_log_probs = to_variable(
np.array(
[[0.] + [-self.kinf] * (self.beam_size - 1)],
dtype="float32"))
beam_state_log_probs = fluid.layers.expand(beam_state_log_probs,
[self.batch_size, 1])
dec_hidden, dec_cell = enc_states[-1] dec_hidden, dec_cell = enc_states[-1]
dec_hidden = [self._expand_to_beam_size(state) for state in dec_hidden] dec_hidden = [
self._expand_to_beam_size(state) for state in dec_hidden
]
dec_cell = [self._expand_to_beam_size(state) for state in dec_cell] dec_cell = [self._expand_to_beam_size(state) for state in dec_cell]
batch_pos = fluid.layers.expand( batch_pos = fluid.layers.expand(
fluid.layers.unsqueeze(to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), [1]), fluid.layers.unsqueeze(
to_variable(
np.arange(
0, self.batch_size, 1, dtype="int64")), [1]),
[1, self.beam_size]) [1, self.beam_size])
predicted_ids = [] predicted_ids = []
parent_ids = [] parent_ids = []
...@@ -296,11 +338,16 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -296,11 +338,16 @@ class AttentionModel(fluid.dygraph.Layer):
step_input = self._merge_batch_beams(step_input) step_input = self._merge_batch_beams(step_input)
step_input = fluid.layers.concat([step_input, input_feed], 1) step_input = fluid.layers.concat([step_input, input_feed], 1)
new_dec_hidden, new_dec_cell = [], [] new_dec_hidden, new_dec_cell = [], []
dec_hidden = [self._merge_batch_beams(state) for state in dec_hidden] dec_hidden = [
dec_cell = [self._merge_batch_beams(state) for state in dec_cell] self._merge_batch_beams(state) for state in dec_hidden
]
dec_cell = [
self._merge_batch_beams(state) for state in dec_cell
]
for i in range(self.num_layers): for i in range(self.num_layers):
new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i]) new_hidden, new_cell = self.dec_units[i](
step_input, dec_hidden[i], dec_cell[i])
new_dec_hidden.append(new_hidden) new_dec_hidden.append(new_hidden)
new_dec_cell.append(new_cell) new_dec_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0: if self.dropout != None and self.dropout > 0.0:
...@@ -310,17 +357,23 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -310,17 +357,23 @@ class AttentionModel(fluid.dygraph.Layer):
dropout_implementation='upscale_in_train') dropout_implementation='upscale_in_train')
else: else:
step_input = new_hidden step_input = new_hidden
dec_att = self.attention(step_input, enc_outputs, enc_padding_mask) dec_att = self.attention(step_input, enc_outputs,
enc_padding_mask)
dec_att = fluid.layers.squeeze(dec_att, [1]) dec_att = fluid.layers.squeeze(dec_att, [1])
concat_att_out = fluid.layers.concat([dec_att, step_input], 1) concat_att_out = fluid.layers.concat([dec_att, step_input], 1)
out = self.concat_fc(concat_att_out) out = self.concat_fc(concat_att_out)
input_feed = out input_feed = out
cell_outputs = self._split_batch_beams(out) cell_outputs = self._split_batch_beams(out)
cell_outputs = self.fc(cell_outputs) cell_outputs = self.fc(cell_outputs)
step_log_probs = fluid.layers.log(fluid.layers.softmax(cell_outputs)) step_log_probs = fluid.layers.log(
fluid.layers.softmax(cell_outputs))
noend_array = [-self.kinf] * self.tar_vocab_size noend_array = [-self.kinf] * self.tar_vocab_size
noend_array[self.beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...] noend_array[
noend_mask_tensor = to_variable(np.array(noend_array,dtype='float32')) self.
beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...]
noend_mask_tensor = to_variable(
np.array(
noend_array, dtype='float32'))
# set finished position to one-hot probability of <eos> # set finished position to one-hot probability of <eos>
step_log_probs = fluid.layers.elementwise_mul( step_log_probs = fluid.layers.elementwise_mul(
fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]), fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]),
...@@ -328,20 +381,38 @@ class AttentionModel(fluid.dygraph.Layer): ...@@ -328,20 +381,38 @@ class AttentionModel(fluid.dygraph.Layer):
fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0) fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0)
log_probs = fluid.layers.elementwise_add( log_probs = fluid.layers.elementwise_add(
x=step_log_probs, y=beam_state_log_probs, axis=0) x=step_log_probs, y=beam_state_log_probs, axis=0)
scores = fluid.layers.reshape(log_probs, [-1, self.beam_size * self.tar_vocab_size]) scores = fluid.layers.reshape(
topk_scores, topk_indices = fluid.layers.topk(input=scores, k=self.beam_size) log_probs, [-1, self.beam_size * self.tar_vocab_size])
beam_indices = fluid.layers.elementwise_floordiv(topk_indices, vocab_size_tensor) # in which beam topk_scores, topk_indices = fluid.layers.topk(
token_indices = fluid.layers.elementwise_mod(topk_indices, vocab_size_tensor) # position in beam input=scores, k=self.beam_size)
next_log_probs = self._gather(scores, topk_indices, batch_pos) # beam_indices = fluid.layers.elementwise_floordiv(
topk_indices, vocab_size_tensor) # in which beam
new_dec_hidden = [self._split_batch_beams(state) for state in new_dec_hidden] token_indices = fluid.layers.elementwise_mod(
new_dec_cell = [self._split_batch_beams(state) for state in new_dec_cell] topk_indices, vocab_size_tensor) # position in beam
new_dec_hidden = [self._gather(x, beam_indices, batch_pos) for x in new_dec_hidden] next_log_probs = self._gather(scores, topk_indices,
new_dec_cell = [self._gather(x, beam_indices, batch_pos) for x in new_dec_cell] batch_pos) #
next_finished = self._gather(beam_finished, beam_indices, batch_pos) new_dec_hidden = [
self._split_batch_beams(state) for state in new_dec_hidden
]
new_dec_cell = [
self._split_batch_beams(state) for state in new_dec_cell
]
new_dec_hidden = [
self._gather(x, beam_indices, batch_pos)
for x in new_dec_hidden
]
new_dec_cell = [
self._gather(x, beam_indices, batch_pos)
for x in new_dec_cell
]
next_finished = self._gather(beam_finished, beam_indices,
batch_pos)
next_finished = fluid.layers.cast(next_finished, "bool") next_finished = fluid.layers.cast(next_finished, "bool")
next_finished = fluid.layers.logical_or(next_finished, fluid.layers.equal(token_indices, end_token_tensor)) next_finished = fluid.layers.logical_or(
next_finished,
fluid.layers.equal(token_indices, end_token_tensor))
next_finished = fluid.layers.cast(next_finished, "float32") next_finished = fluid.layers.cast(next_finished, "float32")
# prepare for next step # prepare for next step
dec_hidden, dec_cell = new_dec_hidden, new_dec_cell dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
......
...@@ -30,6 +30,7 @@ alpha = 0.6 ...@@ -30,6 +30,7 @@ alpha = 0.6
uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x) uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x)
zero_constant = fluid.initializer.Constant(0.0) zero_constant = fluid.initializer.Constant(0.0)
class BaseModel(fluid.dygraph.Layer): class BaseModel(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
hidden_size, hidden_size,
...@@ -77,35 +78,38 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -77,35 +78,38 @@ class BaseModel(fluid.dygraph.Layer):
self.enc_units = [] self.enc_units = []
for i in range(num_layers): for i in range(num_layers):
self.enc_units.append( self.enc_units.append(
self.add_sublayer("enc_units_%d" % i, self.add_sublayer(
"enc_units_%d" % i,
BasicLSTMUnit( BasicLSTMUnit(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
input_size=self.hidden_size, input_size=self.hidden_size,
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
forget_bias=forget_bias))) forget_bias=forget_bias)))
self.dec_units = [] self.dec_units = []
for i in range(num_layers): for i in range(num_layers):
self.dec_units.append( self.dec_units.append(
self.add_sublayer("dec_units_%d" % i, self.add_sublayer(
"dec_units_%d" % i,
BasicLSTMUnit( BasicLSTMUnit(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
input_size=self.hidden_size, input_size=self.hidden_size,
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
forget_bias=forget_bias))) forget_bias=forget_bias)))
self.fc = fluid.dygraph.nn.Linear(self.hidden_size, self.fc = fluid.dygraph.nn.Linear(
self.tar_vocab_size, self.hidden_size,
param_attr=param_attr, self.tar_vocab_size,
bias_attr=False) param_attr=param_attr,
bias_attr=False)
def _transpose_batch_time(self, x): def _transpose_batch_time(self, x):
return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape)))) return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape))))
def _merge_batch_beams(self, x): def _merge_batch_beams(self, x):
return fluid.layers.reshape(x, shape=(-1,x.shape[2])) return fluid.layers.reshape(x, shape=(-1, x.shape[2]))
def _split_batch_beams(self, x): def _split_batch_beams(self, x):
return fluid.layers.reshape(x, shape=(-1, self.beam_size, x.shape[1])) return fluid.layers.reshape(x, shape=(-1, self.beam_size, x.shape[1]))
...@@ -135,11 +139,18 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -135,11 +139,18 @@ class BaseModel(fluid.dygraph.Layer):
self.batch_size = src.shape[0] self.batch_size = src.shape[0]
src_emb = self.src_embeder(self._transpose_batch_time(src)) src_emb = self.src_embeder(self._transpose_batch_time(src))
enc_hidden = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32')) enc_hidden = to_variable(
enc_cell = to_variable(np.zeros((self.num_layers, self.batch_size, self.hidden_size), dtype='float32')) np.zeros(
(self.num_layers, self.batch_size, self.hidden_size),
dtype='float32'))
enc_cell = to_variable(
np.zeros(
(self.num_layers, self.batch_size, self.hidden_size),
dtype='float32'))
max_seq_len = src_emb.shape[0] max_seq_len = src_emb.shape[0]
enc_len_mask = fluid.layers.sequence_mask(src_sequence_length, maxlen=max_seq_len, dtype="float32") enc_len_mask = fluid.layers.sequence_mask(
src_sequence_length, maxlen=max_seq_len, dtype="float32")
enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0]) enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
enc_states = [[enc_hidden, enc_cell]] enc_states = [[enc_hidden, enc_cell]]
for l in range(max_seq_len): for l in range(max_seq_len):
...@@ -148,7 +159,8 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -148,7 +159,8 @@ class BaseModel(fluid.dygraph.Layer):
enc_hidden, enc_cell = enc_states[l] enc_hidden, enc_cell = enc_states[l]
new_enc_hidden, new_enc_cell = [], [] new_enc_hidden, new_enc_cell = [], []
for i in range(self.num_layers): for i in range(self.num_layers):
new_hidden, new_cell = self.enc_units[i](step_input, enc_hidden[i], enc_cell[i]) new_hidden, new_cell = self.enc_units[i](
step_input, enc_hidden[i], enc_cell[i])
new_enc_hidden.append(new_hidden) new_enc_hidden.append(new_hidden)
new_enc_cell.append(new_cell) new_enc_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0: if self.dropout != None and self.dropout > 0.0:
...@@ -158,10 +170,16 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -158,10 +170,16 @@ class BaseModel(fluid.dygraph.Layer):
dropout_implementation='upscale_in_train') dropout_implementation='upscale_in_train')
else: else:
step_input = new_hidden step_input = new_hidden
new_enc_hidden = [self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask) for i in range(self.num_layers)] new_enc_hidden = [
new_enc_cell = [self._real_state(enc_cell[i], new_enc_cell[i], step_mask) for i in range(self.num_layers)] self._real_state(enc_hidden[i], new_enc_hidden[i], step_mask)
for i in range(self.num_layers)
]
new_enc_cell = [
self._real_state(enc_cell[i], new_enc_cell[i], step_mask)
for i in range(self.num_layers)
]
enc_states.append([new_enc_hidden, new_enc_cell]) enc_states.append([new_enc_hidden, new_enc_cell])
if self.mode in ['train', 'eval']: if self.mode in ['train', 'eval']:
dec_hidden, dec_cell = enc_states[-1] dec_hidden, dec_cell = enc_states[-1]
tar_emb = self.tar_embeder(self._transpose_batch_time(tar)) tar_emb = self.tar_embeder(self._transpose_batch_time(tar))
...@@ -172,7 +190,8 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -172,7 +190,8 @@ class BaseModel(fluid.dygraph.Layer):
step_input = tar_emb[step_idx] step_input = tar_emb[step_idx]
new_dec_hidden, new_dec_cell = [], [] new_dec_hidden, new_dec_cell = [], []
for i in range(self.num_layers): for i in range(self.num_layers):
new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i]) new_hidden, new_cell = self.dec_units[i](
step_input, dec_hidden[i], dec_cell[i])
new_dec_hidden.append(new_hidden) new_dec_hidden.append(new_hidden)
new_dec_cell.append(new_cell) new_dec_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0: if self.dropout != None and self.dropout > 0.0:
...@@ -187,9 +206,9 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -187,9 +206,9 @@ class BaseModel(fluid.dygraph.Layer):
dec_output = fluid.layers.stack(dec_output) dec_output = fluid.layers.stack(dec_output)
dec_output = self.fc(self._transpose_batch_time(dec_output)) dec_output = self.fc(self._transpose_batch_time(dec_output))
loss = fluid.layers.softmax_with_cross_entropy( loss = fluid.layers.softmax_with_cross_entropy(
logits=dec_output, label=label, soft_label=False) logits=dec_output, label=label, soft_label=False)
loss = fluid.layers.squeeze(loss, axes=[2]) loss = fluid.layers.squeeze(loss, axes=[2])
max_tar_seq_len = fluid.layers.shape(tar)[1] max_tar_seq_len = fluid.layers.shape(tar)[1]
tar_mask = fluid.layers.sequence_mask( tar_mask = fluid.layers.sequence_mask(
...@@ -202,19 +221,34 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -202,19 +221,34 @@ class BaseModel(fluid.dygraph.Layer):
batch_beam_shape = (self.batch_size, self.beam_size) batch_beam_shape = (self.batch_size, self.beam_size)
#batch_beam_shape_1 = (self.batch_size, self.beam_size, 1) #batch_beam_shape_1 = (self.batch_size, self.beam_size, 1)
vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size)) vocab_size_tensor = to_variable(np.full((1), self.tar_vocab_size))
start_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_start_token, dtype='int64')) start_token_tensor = to_variable(
end_token_tensor = to_variable(np.full(batch_beam_shape, self.beam_end_token, dtype='int64')) np.full(
batch_beam_shape, self.beam_start_token, dtype='int64'))
end_token_tensor = to_variable(
np.full(
batch_beam_shape, self.beam_end_token, dtype='int64'))
step_input = self.tar_embeder(start_token_tensor) step_input = self.tar_embeder(start_token_tensor)
beam_finished = to_variable(np.full(batch_beam_shape, 0, dtype='float32')) beam_finished = to_variable(
beam_state_log_probs = to_variable(np.array([[0.] + [-self.kinf] * (self.beam_size - 1)], dtype="float32")) np.full(
beam_state_log_probs = fluid.layers.expand(beam_state_log_probs, [self.batch_size, 1]) batch_beam_shape, 0, dtype='float32'))
beam_state_log_probs = to_variable(
np.array(
[[0.] + [-self.kinf] * (self.beam_size - 1)],
dtype="float32"))
beam_state_log_probs = fluid.layers.expand(beam_state_log_probs,
[self.batch_size, 1])
dec_hidden, dec_cell = enc_states[-1] dec_hidden, dec_cell = enc_states[-1]
dec_hidden = [self._expand_to_beam_size(state) for state in dec_hidden] dec_hidden = [
self._expand_to_beam_size(state) for state in dec_hidden
]
dec_cell = [self._expand_to_beam_size(state) for state in dec_cell] dec_cell = [self._expand_to_beam_size(state) for state in dec_cell]
batch_pos = fluid.layers.expand( batch_pos = fluid.layers.expand(
fluid.layers.unsqueeze(to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), [1]), fluid.layers.unsqueeze(
to_variable(
np.arange(
0, self.batch_size, 1, dtype="int64")), [1]),
[1, self.beam_size]) [1, self.beam_size])
predicted_ids = [] predicted_ids = []
parent_ids = [] parent_ids = []
...@@ -224,11 +258,16 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -224,11 +258,16 @@ class BaseModel(fluid.dygraph.Layer):
break break
step_input = self._merge_batch_beams(step_input) step_input = self._merge_batch_beams(step_input)
new_dec_hidden, new_dec_cell = [], [] new_dec_hidden, new_dec_cell = [], []
dec_hidden = [self._merge_batch_beams(state) for state in dec_hidden] dec_hidden = [
dec_cell = [self._merge_batch_beams(state) for state in dec_cell] self._merge_batch_beams(state) for state in dec_hidden
]
dec_cell = [
self._merge_batch_beams(state) for state in dec_cell
]
for i in range(self.num_layers): for i in range(self.num_layers):
new_hidden, new_cell = self.dec_units[i](step_input, dec_hidden[i], dec_cell[i]) new_hidden, new_cell = self.dec_units[i](
step_input, dec_hidden[i], dec_cell[i])
new_dec_hidden.append(new_hidden) new_dec_hidden.append(new_hidden)
new_dec_cell.append(new_cell) new_dec_cell.append(new_cell)
if self.dropout != None and self.dropout > 0.0: if self.dropout != None and self.dropout > 0.0:
...@@ -239,12 +278,17 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -239,12 +278,17 @@ class BaseModel(fluid.dygraph.Layer):
else: else:
step_input = new_hidden step_input = new_hidden
cell_outputs = self._split_batch_beams(step_input) cell_outputs = self._split_batch_beams(step_input)
cell_outputs = self.fc(cell_outputs) cell_outputs = self.fc(cell_outputs)
# Beam_search_step: # Beam_search_step:
step_log_probs = fluid.layers.log(fluid.layers.softmax(cell_outputs)) step_log_probs = fluid.layers.log(
fluid.layers.softmax(cell_outputs))
noend_array = [-self.kinf] * self.tar_vocab_size noend_array = [-self.kinf] * self.tar_vocab_size
noend_array[self.beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...] noend_array[
noend_mask_tensor = to_variable(np.array(noend_array,dtype='float32')) self.
beam_end_token] = 0 # [-kinf, -kinf, ..., 0, -kinf, ...]
noend_mask_tensor = to_variable(
np.array(
noend_array, dtype='float32'))
# set finished position to one-hot probability of <eos> # set finished position to one-hot probability of <eos>
step_log_probs = fluid.layers.elementwise_mul( step_log_probs = fluid.layers.elementwise_mul(
fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]), fluid.layers.expand(fluid.layers.unsqueeze(beam_finished, [2]), [1, 1, self.tar_vocab_size]),
...@@ -252,26 +296,45 @@ class BaseModel(fluid.dygraph.Layer): ...@@ -252,26 +296,45 @@ class BaseModel(fluid.dygraph.Layer):
fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0) fluid.layers.elementwise_mul(step_log_probs, (beam_finished - 1), axis=0)
log_probs = fluid.layers.elementwise_add( log_probs = fluid.layers.elementwise_add(
x=step_log_probs, y=beam_state_log_probs, axis=0) x=step_log_probs, y=beam_state_log_probs, axis=0)
scores = fluid.layers.reshape(log_probs, [-1, self.beam_size * self.tar_vocab_size]) scores = fluid.layers.reshape(
topk_scores, topk_indices = fluid.layers.topk(input=scores, k=self.beam_size) log_probs, [-1, self.beam_size * self.tar_vocab_size])
beam_indices = fluid.layers.elementwise_floordiv(topk_indices, vocab_size_tensor) # in which beam topk_scores, topk_indices = fluid.layers.topk(
token_indices = fluid.layers.elementwise_mod(topk_indices, vocab_size_tensor) # position in beam input=scores, k=self.beam_size)
next_log_probs = self._gather(scores, topk_indices, batch_pos) # beam_indices = fluid.layers.elementwise_floordiv(
topk_indices, vocab_size_tensor) # in which beam
token_indices = fluid.layers.elementwise_mod(
topk_indices, vocab_size_tensor) # position in beam
next_log_probs = self._gather(scores, topk_indices,
batch_pos) #
new_dec_hidden = [
self._split_batch_beams(state) for state in new_dec_hidden
]
new_dec_cell = [
self._split_batch_beams(state) for state in new_dec_cell
]
new_dec_hidden = [
self._gather(x, beam_indices, batch_pos)
for x in new_dec_hidden
]
new_dec_cell = [
self._gather(x, beam_indices, batch_pos)
for x in new_dec_cell
]
new_dec_hidden = [self._split_batch_beams(state) for state in new_dec_hidden] next_finished = self._gather(beam_finished, beam_indices,
new_dec_cell = [self._split_batch_beams(state) for state in new_dec_cell] batch_pos)
new_dec_hidden = [self._gather(x, beam_indices, batch_pos) for x in new_dec_hidden]
new_dec_cell = [self._gather(x, beam_indices, batch_pos) for x in new_dec_cell]
next_finished = self._gather(beam_finished, beam_indices, batch_pos)
next_finished = fluid.layers.cast(next_finished, "bool") next_finished = fluid.layers.cast(next_finished, "bool")
next_finished = fluid.layers.logical_or(next_finished, fluid.layers.equal(token_indices, end_token_tensor)) next_finished = fluid.layers.logical_or(
next_finished,
fluid.layers.equal(token_indices, end_token_tensor))
next_finished = fluid.layers.cast(next_finished, "float32") next_finished = fluid.layers.cast(next_finished, "float32")
# prepare for next step # prepare for next step
dec_hidden, dec_cell = new_dec_hidden, new_dec_cell dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
beam_finished = next_finished beam_finished = next_finished
beam_state_log_probs = next_log_probs beam_state_log_probs = next_log_probs
step_input = self.tar_embeder(token_indices) # remove unsqueeze in v1.7 step_input = self.tar_embeder(
token_indices) # remove unsqueeze in v1.7
predicted_ids.append(token_indices) predicted_ids.append(token_indices)
parent_ids.append(beam_indices) parent_ids.append(beam_indices)
......
File mode changed from 100755 to 100644
@@ -74,7 +74,7 @@ def infer():
            src_vocab_size,
            tar_vocab_size,
            batch_size,
-           beam_size = args.beam_size,
+           beam_size=args.beam_size,
            num_layers=num_layers,
            init_scale=init_scale,
            dropout=0.0,
@@ -85,7 +85,7 @@ def infer():
            src_vocab_size,
            tar_vocab_size,
            batch_size,
-           beam_size = args.beam_size,
+           beam_size=args.beam_size,
            num_layers=num_layers,
            init_scale=init_scale,
            dropout=0.0,
@@ -97,17 +97,17 @@ def infer():
        infer_data = reader.raw_mono_data(source_vocab_file, infer_file)
        def prepare_input(batch, epoch_id=0):
            src_ids, src_mask, tar_ids, tar_mask = batch
            res = {}
            src_ids = src_ids.reshape((src_ids.shape[0], src_ids.shape[1]))
            in_tar = tar_ids[:, :-1]
            label_tar = tar_ids[:, 1:]
            in_tar = in_tar.reshape((in_tar.shape[0], in_tar.shape[1]))
            label_tar = label_tar.reshape(
                (label_tar.shape[0], label_tar.shape[1], 1))
            inputs = [src_ids, in_tar, label_tar, src_mask, tar_mask]
            return inputs, np.sum(tar_mask)
        dir_name = args.reload_model
        print("dir name", dir_name)
@@ -115,7 +115,8 @@ def infer():
        model.set_dict(state_dict)
        model.eval()
-       train_data_iter = reader.get_data_iter(infer_data, batch_size, mode='infer')
+       train_data_iter = reader.get_data_iter(
+           infer_data, batch_size, mode='infer')
        tar_id2vocab = []
        tar_vocab_file = args.vocab_prefix + "." + args.tar_lang
...
File mode changed from 100755 to 100644
@@ -191,7 +191,6 @@ def get_data_iter(raw_data,
            new_cache = b_src
        else:
            new_cache = sorted(b_src, key=lambda k: len(k[0]))
-
        for i in range(cache_num):
            batch_data = new_cache[i * batch_size:(i + 1) * batch_size]
@@ -212,7 +211,7 @@ def get_data_iter(raw_data,
        for i in range(cache_num):
            batch_end = min(len(new_cache), (i + 1) * batch_size)
-           batch_data = new_cache[i * batch_size: batch_end]
+           batch_data = new_cache[i * batch_size:batch_end]
            src_cache = [w[0] for w in batch_data]
            tar_cache = [w[1] for w in batch_data]
            src_ids, src_mask = to_pad_np(src_cache, source=True)
...
File mode changed from 100755 to 100644
...@@ -88,9 +88,14 @@ def main(): ...@@ -88,9 +88,14 @@ def main():
lr = args.learning_rate lr = args.learning_rate
opt_type = args.optimizer opt_type = args.optimizer
if opt_type == "sgd": if opt_type == "sgd":
optimizer = fluid.optimizer.SGD(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip) optimizer = fluid.optimizer.SGD(lr,
parameter_list=model.parameters(),
grad_clip=gloabl_norm_clip)
elif opt_type == "adam": elif opt_type == "adam":
optimizer = fluid.optimizer.Adam(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip) optimizer = fluid.optimizer.Adam(
lr,
parameter_list=model.parameters(),
grad_clip=gloabl_norm_clip)
else: else:
print("only support [sgd|adam]") print("only support [sgd|adam]")
raise Exception("opt type not support") raise Exception("opt type not support")
...@@ -103,8 +108,8 @@ def main(): ...@@ -103,8 +108,8 @@ def main():
tar_lang = args.tar_lang tar_lang = args.tar_lang
print("begin to load data") print("begin to load data")
raw_data = reader.raw_data(src_lang, tar_lang, vocab_prefix, raw_data = reader.raw_data(src_lang, tar_lang, vocab_prefix,
train_data_prefix, eval_data_prefix, train_data_prefix, eval_data_prefix,
test_data_prefix, args.max_len) test_data_prefix, args.max_len)
print("finished load data") print("finished load data")
train_data, valid_data, test_data, _ = raw_data train_data, valid_data, test_data, _ = raw_data
...@@ -128,8 +133,7 @@ def main(): ...@@ -128,8 +133,7 @@ def main():
total_loss = 0.0 total_loss = 0.0
word_count = 0.0 word_count = 0.0
for batch_id, batch in enumerate(eval_data_iter): for batch_id, batch in enumerate(eval_data_iter):
input_data_feed, word_num = prepare_input( input_data_feed, word_num = prepare_input(batch, epoch_id)
batch, epoch_id)
loss = model(input_data_feed) loss = model(input_data_feed)
total_loss += loss * batch_size total_loss += loss * batch_size
...@@ -142,8 +146,9 @@ def main(): ...@@ -142,8 +146,9 @@ def main():
ce_ppl = [] ce_ppl = []
max_epoch = args.max_epoch max_epoch = args.max_epoch
for epoch_id in range(max_epoch): for epoch_id in range(max_epoch):
epoch_start = time.time()
model.train() model.train()
start_time = time.time()
if args.enable_ce: if args.enable_ce:
train_data_iter = reader.get_data_iter( train_data_iter = reader.get_data_iter(
train_data, batch_size, enable_ce=True) train_data, batch_size, enable_ce=True)
...@@ -153,8 +158,11 @@ def main(): ...@@ -153,8 +158,11 @@ def main():
total_loss = 0 total_loss = 0
word_count = 0.0 word_count = 0.0
batch_times = [] batch_times = []
batch_start = time.time()
for batch_id, batch in enumerate(train_data_iter): for batch_id, batch in enumerate(train_data_iter):
batch_start_time = time.time() batch_reader_end = time.time()
input_data_feed, word_num = prepare_input( input_data_feed, word_num = prepare_input(
batch, epoch_id=epoch_id) batch, epoch_id=epoch_id)
word_count += word_num word_count += word_num
...@@ -164,28 +172,28 @@ def main(): ...@@ -164,28 +172,28 @@ def main():
optimizer.minimize(loss) optimizer.minimize(loss)
model.clear_gradients() model.clear_gradients()
total_loss += loss * batch_size total_loss += loss * batch_size
batch_end_time = time.time()
batch_time = batch_end_time - batch_start_time
batch_times.append(batch_time)
train_batch_cost = time.time() - batch_start
batch_times.append(train_batch_cost)
if batch_id > 0 and batch_id % 100 == 0: if batch_id > 0 and batch_id % 100 == 0:
print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f" % print(
(epoch_id, batch_id, batch_time, "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, batch_cost: %.5f s, reader_cost: %.5f s"
np.exp(total_loss.numpy() / word_count))) % (epoch_id, batch_id, np.exp(total_loss.numpy() /
word_count),
train_batch_cost, batch_reader_end - batch_start))
ce_ppl.append(np.exp(total_loss.numpy() / word_count)) ce_ppl.append(np.exp(total_loss.numpy() / word_count))
total_loss = 0.0 total_loss = 0.0
word_count = 0.0 word_count = 0.0
batch_start = time.time()
end_time = time.time() train_epoch_cost = time.time() - epoch_start
epoch_time = end_time - start_time
print( print(
"\nTrain epoch:[%d]; Epoch Time: %.5f; avg_time: %.5f s/step\n" "\nTrain epoch:[%d]; epoch_cost: %.5f s; avg_batch_cost: %.5f s/step\n"
% (epoch_id, epoch_time, sum(batch_times) / len(batch_times))) % (epoch_id, train_epoch_cost,
ce_time.append(epoch_time) sum(batch_times) / len(batch_times)))
ce_time.append(train_epoch_cost)
dir_name = os.path.join(args.model_path, "epoch_" + str(epoch_id))
dir_name = os.path.join(args.model_path,
"epoch_" + str(epoch_id))
print("begin to save", dir_name) print("begin to save", dir_name)
paddle.fluid.save_dygraph(model.state_dict(), dir_name) paddle.fluid.save_dygraph(model.state_dict(), dir_name)
print("save finished") print("save finished")
......
...@@ -396,7 +396,9 @@ class DataProcessor(object): ...@@ -396,7 +396,9 @@ class DataProcessor(object):
def _load_lines(self, fpattern, tar_fname): def _load_lines(self, fpattern, tar_fname):
fpaths = glob.glob(fpattern) fpaths = glob.glob(fpattern)
assert len(fpaths) > 0, "no matching file to the provided data path" assert len(
fpaths
) > 0, "No matching file to the provided data path, for pattern %s." % fpattern
if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]): if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]):
if tar_fname is None: if tar_fname is None:
......
...@@ -33,29 +33,30 @@ from model import Transformer, CrossEntropyCriterion, NoamDecay ...@@ -33,29 +33,30 @@ from model import Transformer, CrossEntropyCriterion, NoamDecay
def do_train(args): def do_train(args):
if args.use_cuda: if args.use_cuda:
trainer_count = fluid.dygraph.parallel.Env().nranks trainer_count = fluid.dygraph.parallel.Env().nranks
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
) if trainer_count > 1 else fluid.CUDAPlace(0) ).dev_id) if trainer_count > 1 else fluid.CUDAPlace(0)
else: else:
trainer_count = 1 trainer_count = 1
place = fluid.CPUPlace() place = fluid.CPUPlace()
# define the data generator # define the data generator
processor = reader.DataProcessor(fpattern=args.training_file, processor = reader.DataProcessor(
src_vocab_fpath=args.src_vocab_fpath, fpattern=args.training_file,
trg_vocab_fpath=args.trg_vocab_fpath, src_vocab_fpath=args.src_vocab_fpath,
token_delimiter=args.token_delimiter, trg_vocab_fpath=args.trg_vocab_fpath,
use_token_batch=args.use_token_batch, token_delimiter=args.token_delimiter,
batch_size=args.batch_size, use_token_batch=args.use_token_batch,
device_count=trainer_count, batch_size=args.batch_size,
pool_size=args.pool_size, device_count=trainer_count,
sort_type=args.sort_type, pool_size=args.pool_size,
shuffle=args.shuffle, sort_type=args.sort_type,
shuffle_batch=args.shuffle_batch, shuffle=args.shuffle,
start_mark=args.special_token[0], shuffle_batch=args.shuffle_batch,
end_mark=args.special_token[1], start_mark=args.special_token[0],
unk_mark=args.special_token[2], end_mark=args.special_token[1],
max_length=args.max_length, unk_mark=args.special_token[2],
n_head=args.n_head) max_length=args.max_length,
n_head=args.n_head)
batch_generator = processor.data_generator(phase="train") batch_generator = processor.data_generator(phase="train")
if args.validation_file: if args.validation_file:
val_processor = reader.DataProcessor( val_processor = reader.DataProcessor(
...@@ -131,31 +132,30 @@ def do_train(args): ...@@ -131,31 +132,30 @@ def do_train(args):
if trainer_count > 1: if trainer_count > 1:
strategy = fluid.dygraph.parallel.prepare_context() strategy = fluid.dygraph.parallel.prepare_context()
transformer = fluid.dygraph.parallel.DataParallel( transformer = fluid.dygraph.parallel.DataParallel(transformer,
transformer, strategy) strategy)
# the best cross-entropy value with label smoothing # the best cross-entropy value with label smoothing
loss_normalizer = -( loss_normalizer = -(
(1. - args.label_smooth_eps) * np.log( (1. - args.label_smooth_eps) * np.log(
(1. - args.label_smooth_eps)) + (1. - args.label_smooth_eps)) + args.label_smooth_eps *
args.label_smooth_eps * np.log(args.label_smooth_eps / np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
(args.trg_vocab_size - 1) + 1e-20))
ce_time = [] ce_time = []
ce_ppl = [] ce_ppl = []
step_idx = 0 step_idx = 0
#NOTE: used for benchmark
total_batch_num = 0
# train loop # train loop
for pass_id in range(args.epoch): for pass_id in range(args.epoch):
pass_start_time = time.time() epoch_start = time.time()
batch_id = 0 batch_id = 0
batch_start = time.time()
for input_data in train_loader(): for input_data in train_loader():
if args.max_iter and total_batch_num == args.max_iter: #NOTE: used for benchmark if args.max_iter and step_idx == args.max_iter: #NOTE: used for benchmark
return return
batch_start = time.time() batch_reader_end = time.time()
(src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, (src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, trg_slf_attn_bias, trg_src_attn_bias, lbl_word,
lbl_weight) = input_data lbl_weight) = input_data
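
The loss_normalizer set up in the hunk above is the entropy of the label-smoothed target distribution, i.e. the lowest cross-entropy reachable once smoothing is applied; the training log later reports total_avg_cost - loss_normalizer as the "normalized loss". A small self-check of that formula (the epsilon and vocabulary size below are illustrative, not taken from the PR):

    import numpy as np

    label_smooth_eps = 0.1   # illustrative
    trg_vocab_size = 10000   # illustrative

    # Closed form used in the diff: the true class keeps 1 - eps of the mass,
    # the remaining eps is spread evenly over the other V - 1 classes.
    loss_normalizer = -(
        (1. - label_smooth_eps) * np.log(1. - label_smooth_eps) +
        label_smooth_eps * np.log(
            label_smooth_eps / (trg_vocab_size - 1) + 1e-20))

    # Direct entropy of the same smoothed distribution for comparison.
    probs = np.full(trg_vocab_size, label_smooth_eps / (trg_vocab_size - 1))
    probs[0] = 1. - label_smooth_eps
    entropy = -np.sum(probs * np.log(probs + 1e-20))

    print(loss_normalizer, entropy)  # the two agree up to the 1e-20 guard
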
...@@ -163,8 +163,8 @@ def do_train(args): ...@@ -163,8 +163,8 @@ def do_train(args):
trg_word, trg_pos, trg_slf_attn_bias, trg_word, trg_pos, trg_slf_attn_bias,
trg_src_attn_bias) trg_src_attn_bias)
sum_cost, avg_cost, token_num = criterion( sum_cost, avg_cost, token_num = criterion(logits, lbl_word,
logits, lbl_word, lbl_weight) lbl_weight)
if trainer_count > 1: if trainer_count > 1:
avg_cost = transformer.scale_loss(avg_cost) avg_cost = transformer.scale_loss(avg_cost)
...@@ -184,19 +184,19 @@ def do_train(args): ...@@ -184,19 +184,19 @@ def do_train(args):
"step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
"normalized loss: %f, ppl: %f" % "normalized loss: %f, ppl: %f" %
(step_idx, pass_id, batch_id, total_avg_cost, (step_idx, pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer, total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)]))) np.exp([min(total_avg_cost, 100)])))
avg_batch_time = time.time()
else: else:
train_avg_batch_cost = args.print_step / (
time.time() - batch_start)
logging.info( logging.info(
"step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
"normalized loss: %f, ppl: %f, speed: %.2f step/s" % "normalized loss: %f, ppl: %f, avg_speed: %.2f step/s"
(step_idx, pass_id, batch_id, total_avg_cost, % (step_idx, pass_id, batch_id, total_avg_cost,
total_avg_cost - loss_normalizer, total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)]), np.exp([min(total_avg_cost, 100)]),
args.print_step / (time.time() - avg_batch_time))) train_avg_batch_cost))
avg_batch_time = time.time() batch_start = time.time()
if step_idx % args.save_step == 0 and step_idx != 0: if step_idx % args.save_step == 0 and step_idx != 0:
# validation # validation
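
The avg_speed printed above divides args.print_step by the time elapsed since batch_start was last reset, so it reports average steps per second over a logging window rather than the cost of a single batch. A stand-alone sketch of that window-averaged throughput computation (the window size and the sleep standing in for a training step are illustrative assumptions):

    import time

    # Sketch of window-averaged throughput logging; print_step and the sleep
    # below are illustrative, not values from this PR.
    print_step = 10
    window_start = time.time()

    for step_idx in range(1, 51):
        time.sleep(0.005)  # stand-in for one training step

        if step_idx % print_step == 0:
            # Steps per second averaged over the last print_step iterations.
            avg_speed = print_step / (time.time() - window_start)
            print("step_idx: %d, avg_speed: %.2f step/s" % (step_idx, avg_speed))
            window_start = time.time()
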
...@@ -208,10 +208,9 @@ def do_train(args): ...@@ -208,10 +208,9 @@ def do_train(args):
(src_word, src_pos, src_slf_attn_bias, trg_word, (src_word, src_pos, src_slf_attn_bias, trg_word,
trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_pos, trg_slf_attn_bias, trg_src_attn_bias,
lbl_word, lbl_weight) = input_data lbl_word, lbl_weight) = input_data
logits = transformer(src_word, src_pos, logits = transformer(
src_slf_attn_bias, trg_word, src_word, src_pos, src_slf_attn_bias, trg_word,
trg_pos, trg_slf_attn_bias, trg_pos, trg_slf_attn_bias, trg_src_attn_bias)
trg_src_attn_bias)
sum_cost, avg_cost, token_num = criterion( sum_cost, avg_cost, token_num = criterion(
logits, lbl_word, lbl_weight) logits, lbl_word, lbl_weight)
total_sum_cost += sum_cost.numpy() total_sum_cost += sum_cost.numpy()
...@@ -225,8 +224,8 @@ def do_train(args): ...@@ -225,8 +224,8 @@ def do_train(args):
transformer.train() transformer.train()
if args.save_model and ( if args.save_model and (
trainer_count == 1 trainer_count == 1 or
or fluid.dygraph.parallel.Env().dev_id == 0): fluid.dygraph.parallel.Env().dev_id == 0):
model_dir = os.path.join(args.save_model, model_dir = os.path.join(args.save_model,
"step_" + str(step_idx)) "step_" + str(step_idx))
if not os.path.exists(model_dir): if not os.path.exists(model_dir):
...@@ -239,11 +238,12 @@ def do_train(args): ...@@ -239,11 +238,12 @@ def do_train(args):
os.path.join(model_dir, "transformer")) os.path.join(model_dir, "transformer"))
batch_id += 1 batch_id += 1
total_batch_num = total_batch_num + 1
step_idx += 1 step_idx += 1
time_consumed = time.time() - pass_start_time train_epoch_cost = time.time() - epoch_start
ce_time.append(time_consumed) ce_time.append(train_epoch_cost)
logging.info("train epoch: %d, epoch_cost: %.5f s" %
(pass_id, train_epoch_cost))
if args.save_model: if args.save_model:
model_dir = os.path.join(args.save_model, "step_final") model_dir = os.path.join(args.save_model, "step_final")
......