Unverified commit b7b0b359, authored by liym27, committed by GitHub

Add unittest for transformer prediction in dygraph_to_static (#23207)

* Add unittest for transformer prediction in dygraph_to_static.

* Fix a bug in the fill_constant API.

* Make transpose support size-0 tensors. test=develop
Parent commit: 738c8464
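At a glance, the three changes serve one goal: the new unittest runs the Transformer's beam-search prediction twice, once eagerly in dygraph mode and once converted to a static graph via @dygraph_to_static_func, and requires the two results to agree. A condensed sketch of that check, using names from the diff below:

```python
# Schematic only; predict_static / predict_dygraph are defined in
# test_transformer.py below and share one util.get_feed_data_reader.
static_res = predict_static(args, batch_generator)    # static-graph path
dygraph_res = predict_dygraph(args, batch_generator)  # eager path
assert np.allclose(static_res, dygraph_res)           # np is numpy
```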
@@ -667,6 +667,9 @@ class TransposeGPUKernel : public framework::OpKernel<T> {
auto* x = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
if (out->numel() == 0) {
return;
}
std::vector<int> axis = context.Attr<std::vector<int>>("axis");
int ndims = axis.size();
@@ -688,6 +691,9 @@ class TransposeGradGPUKernel : public framework::OpKernel<T> {
if (!x_grad) return;
x_grad->mutable_data<T>(context.GetPlace());
if (x_grad->numel() == 0) {
return;
}
std::vector<int> axis = context.Attr<std::vector<int>>("axis");
std::vector<int> reversed_axis(axis);
@@ -64,7 +64,9 @@ class TransposeKernel : public framework::OpKernel<T> {
auto* x = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
if (out->numel() == 0) {
return;
}
std::vector<int> axis = context.Attr<std::vector<int>>("axis");
int ndims = axis.size();
auto& dev_ctx = context.template device_context<DeviceContext>();
@@ -83,6 +85,10 @@ class TransposeGradKernel : public framework::OpKernel<T> {
if (!x_grad) return;
x_grad->mutable_data<T>(context.GetPlace());
if (x_grad->numel() == 0) {
return;
}
std::vector<int> axis = context.Attr<std::vector<int>>("axis");
std::vector<int> reversed_axis(axis);
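Why the transpose kernels need the early return: the beam-search caches added later in this commit are created with a zero-length time axis (shape [batch, beam, n_head, 0, d_key]) and are transposed by merge_batch_beams before the first token is decoded, so the kernels now see tensors with numel() == 0. A minimal repro sketch, assuming the paddle.fluid dygraph API of this era:

```python
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    # A tensor with zero elements, like the decoder caches at step 0.
    x = fluid.dygraph.to_variable(np.zeros([2, 4, 0, 64], dtype='float32'))
    y = fluid.layers.transpose(x, perm=[2, 3, 0, 1])
    print(y.shape)  # [0, 64, 2, 4]: nothing to move, so the kernel returns early
```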
@@ -594,13 +594,12 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
shape))
else:
shape = list(shape.numpy().astype(int))
-dtype = convert_np_dtype_to_dtype_(dtype)
if out is None:
out = _varbase_creator(dtype=dtype)
core.ops.fill_constant(out, 'value',
float(value), 'force_cpu', force_cpu, 'dtype',
-dtype, 'str_value', attrs['str_value'], 'shape',
-shape)
+out.dtype, 'str_value', attrs['str_value'],
+'shape', shape)
out.stop_gradient = True
return out
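The fill_constant fix drops the redundant dtype re-conversion and passes out.dtype to the underlying op, so the attribute always matches the VarBase that was just created, whatever form of dtype the caller supplied. A hypothetical dygraph usage exercising this path, including the Tensor-valued shape argument handled just above:

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers

with fluid.dygraph.guard():
    # `shape` may itself be a Variable; fill_constant converts it via shape.numpy().
    shape = fluid.dygraph.to_variable(np.array([2, 3], dtype='int32'))
    x = layers.fill_constant(shape=shape, dtype='int64', value=7)
    print(x.numpy())  # a 2x3 array of 7s, dtype int64
```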
@@ -21,6 +21,7 @@ import unittest
import paddle.fluid as fluid
import transformer_util as util
from transformer_dygraph_model import position_encoding_init
from transformer_dygraph_model import Transformer
from transformer_dygraph_model import CrossEntropyCriterion
@@ -31,8 +32,8 @@ SEED = 10
def train_static(args, batch_generator):
-train_prog = fluid.default_main_program()
-startup_prog = fluid.default_startup_program()
+train_prog = fluid.Program()
+startup_prog = fluid.Program()
train_prog.random_seed = SEED
startup_prog.random_seed = SEED
with fluid.program_guard(train_prog, startup_prog):
@@ -117,9 +118,9 @@ def train_static(args, batch_generator):
step_idx += 1
total_batch_num = total_batch_num + 1
if step_idx == 10:
-if args.save_model:
-model_path = os.path.join(
-args.save_model, "step_" + str(step_idx), "transformer")
+if args.save_dygraph_model_path:
+model_path = os.path.join(args.save_static_model_path,
+"transformer")
fluid.save(train_prog, model_path)
break
return np.array(avg_loss)
@@ -201,9 +202,8 @@ def train_dygraph(args, batch_generator):
batch_id += 1
step_idx += 1
if step_idx == 10:
-if args.save_model:
-model_dir = os.path.join(args.save_model + '_dygraph',
-"step_" + str(step_idx))
+if args.save_dygraph_model_path:
+model_dir = os.path.join(args.save_dygraph_model_path)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
fluid.save_dygraph(
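Both training paths now save to fixed, mode-specific directories (see ModelHyperParams further down) so that the new predict functions know where to load from. Schematically — the fluid.save_dygraph call is truncated above, so its arguments here are an assumption:

```python
# Static side: program + persistables, restored by util.load in predict_static.
fluid.save(train_prog, os.path.join(args.save_static_model_path, "transformer"))

# Dygraph side: a parameter state dict, restored via util.load_dygraph
# in predict_dygraph.
fluid.save_dygraph(transformer.state_dict(),
                   os.path.join(args.save_dygraph_model_path, "transformer"))
```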
@@ -218,18 +218,143 @@
return np.array(avg_loss)
def predict_dygraph(args, batch_generator):
with fluid.dygraph.guard(place):
fluid.default_main_program().random_seed = SEED
# define data loader
test_loader = fluid.io.DataLoader.from_generator(capacity=10)
test_loader.set_batch_generator(batch_generator, places=place)
# define model
transformer = Transformer(
args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
args.d_inner_hid, args.prepostprocess_dropout,
args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
args.postprocess_cmd, args.weight_sharing, args.bos_idx,
args.eos_idx)
# load the trained model
model_dict, _ = util.load_dygraph(
os.path.join(args.save_dygraph_model_path, "transformer"))
# to avoid a longer length than training, reset the size of position
# encoding to max_length
model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
args.max_length + 1, args.d_model)
model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
args.max_length + 1, args.d_model)
transformer.load_dict(model_dict)
# set evaluate mode
transformer.eval()
step_idx = 0
for input_data in test_loader():
(src_word, src_pos, src_slf_attn_bias, trg_word,
trg_src_attn_bias) = input_data
finished_seq, finished_scores = transformer.beam_search(
src_word,
src_pos,
src_slf_attn_bias,
trg_word,
trg_src_attn_bias,
bos_id=args.bos_idx,
eos_id=args.eos_idx,
beam_size=args.beam_size,
max_len=args.max_out_len)
finished_seq = finished_seq.numpy()
finished_scores = finished_scores.numpy()
step_idx += 1
if step_idx == 10:
break
return finished_seq
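predict_dygraph overwrites the position-encoding weights before loading the checkpoint so that decoding may run up to max_length even if training saw shorter sequences. position_encoding_init (imported at the top of the file) builds the standard sinusoidal table; a numpy sketch of what such an initializer computes, under the usual Transformer formulation:

```python
import numpy as np

def sinusoid_position_encoding(n_position, d_model):
    # angle[pos, i] = pos / 10000^(2i / d_model)
    pos = np.arange(n_position, dtype="float64")[:, None]
    dim = np.arange(d_model // 2, dtype="float64")[None, :]
    angle = pos / np.power(10000.0, 2.0 * dim / d_model)
    table = np.zeros([n_position, d_model])
    table[:, 0::2] = np.sin(angle)  # even channels
    table[:, 1::2] = np.cos(angle)  # odd channels
    return table.astype("float32")
```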
def predict_static(args, batch_generator):
test_prog = fluid.Program()
with fluid.program_guard(test_prog):
test_prog.random_seed = SEED
# define input and reader
input_field_names = util.encoder_data_input_fields + util.fast_decoder_data_input_fields
input_descs = util.get_input_descs(args, 'test')
input_slots = [{
"name": name,
"shape": input_descs[name][0],
"dtype": input_descs[name][1]
} for name in input_field_names]
input_field = util.InputField(input_slots)
feed_list = input_field.feed_list
loader = fluid.io.DataLoader.from_generator(
feed_list=feed_list, capacity=10)
# define model
transformer = Transformer(
args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
args.d_inner_hid, args.prepostprocess_dropout,
args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
args.postprocess_cmd, args.weight_sharing, args.bos_idx,
args.eos_idx)
out_ids, out_scores = transformer.beam_search(
*feed_list,
bos_id=args.bos_idx,
eos_id=args.eos_idx,
beam_size=args.beam_size,
max_len=args.max_out_len)
# This is used here to set dropout to the test mode.
test_prog = test_prog.clone(for_test=True)
# define the executor and program for training
exe = fluid.Executor(place)
util.load(test_prog,
os.path.join(args.save_static_model_path, "transformer"), exe)
loader.set_batch_generator(batch_generator, places=place)
step_idx = 0
for feed_dict in loader:
seq_ids, seq_scores = exe.run(
test_prog,
feed=feed_dict,
fetch_list=[out_ids.name, out_scores.name],
return_numpy=True)
step_idx += 1
if step_idx == 10:
break
return seq_ids
class TestTransformer(unittest.TestCase):
def prepare(self, mode='train'):
args = util.ModelHyperParams()
batch_generator = util.get_feed_data_reader(args, mode)
return args, batch_generator
-def test_train(self):
+def _test_train(self):
args, batch_generator = self.prepare(mode='train')
static_avg_loss = train_static(args, batch_generator)
dygraph_avg_loss = train_dygraph(args, batch_generator)
self.assertTrue(np.allclose(static_avg_loss, dygraph_avg_loss))
def _test_predict(self):
args, batch_generator = self.prepare(mode='test')
static_res = predict_static(args, batch_generator)
dygraph_res = predict_dygraph(args, batch_generator)
self.assertTrue(
np.allclose(static_res, dygraph_res),
msg="static_res: {} \n dygraph_res: {}".format(static_res,
dygraph_res))
def test_check_result(self):
self._test_train()
self._test_predict()
if __name__ == '__main__':
unittest.main()
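test_train and test_predict become private helpers chained inside a single test_check_result, presumably to guarantee that training runs before prediction: _test_predict reloads the checkpoints that _test_train just saved, an ordering that plain unittest discovery would not enforce. In short:

```python
def test_check_result(self):
    self._test_train()    # trains both modes and saves their checkpoints
    self._test_predict()  # reloads them and compares beam-search outputs
```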
@@ -18,8 +18,10 @@ import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
-from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
from paddle.fluid.dygraph.jit import dygraph_to_static_func
from paddle.fluid.layers.utils import map_structure
from paddle.fluid.framework import Program, Block, Variable, _dygraph_tracer, dygraph_only, _dygraph_guard, _current_expected_place, in_dygraph_mode
def position_encoding_init(n_position, d_pos_vec):
@@ -486,3 +488,169 @@ class Transformer(Layer):
predict = self.decoder(trg_word, trg_pos, trg_slf_attn_bias,
trg_src_attn_bias, enc_output)
return predict
@dygraph_to_static_func
def beam_search(self,
src_word,
src_pos,
src_slf_attn_bias,
trg_word,
trg_src_attn_bias,
bos_id=0,
eos_id=1,
beam_size=4,
max_len=256):
def expand_to_beam_size(tensor, beam_size):
tensor = layers.reshape(
tensor, [tensor.shape[0], 1] + list(tensor.shape[1:]))
tile_dims = [1] * len(tensor.shape)
tile_dims[1] = beam_size
return layers.expand(tensor, tile_dims)
def merge_batch_beams(tensor):
var_dim_in_state = 2 # count in beam dim
tensor = layers.transpose(
tensor,
list(range(var_dim_in_state, len(tensor.shape))) +
list(range(0, var_dim_in_state)))
tensor = layers.reshape(tensor,
[0] * (len(tensor.shape) - var_dim_in_state
) + [batch_size * beam_size])
res = layers.transpose(
tensor,
list(
range((len(tensor.shape) + 1 - var_dim_in_state),
len(tensor.shape))) +
list(range(0, (len(tensor.shape) + 1 - var_dim_in_state))))
return res
def split_batch_beams(tensor):
var_dim_in_state = 1
tensor = layers.transpose(
tensor,
list(range(var_dim_in_state, len(tensor.shape))) +
list(range(0, var_dim_in_state)))
tensor = layers.reshape(tensor,
[0] * (len(tensor.shape) - var_dim_in_state
) + [batch_size, beam_size])
res = layers.transpose(
tensor,
list(
range((len(tensor.shape) - 1 - var_dim_in_state),
len(tensor.shape))) +
list(range(0, (len(tensor.shape) - 1 - var_dim_in_state))))
return res
def mask_probs(probs, finished, noend_mask_tensor):
finished = layers.cast(finished, dtype=probs.dtype)
probs = layers.elementwise_mul(
layers.expand(
layers.unsqueeze(finished, [2]),
[1, 1, self.trg_vocab_size]),
noend_mask_tensor,
axis=-1) - layers.elementwise_mul(
probs, (finished - 1), axis=0)
return probs
def gather(input, indices, batch_pos):
topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
return layers.gather_nd(input, topk_coordinates)
# run encoder
enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)
batch_size = enc_output.shape[0]
# constant number
inf = float(1. * 1e7)
max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
vocab_size_tensor = layers.fill_constant(
shape=[1], dtype="int64", value=self.trg_vocab_size)
end_token_tensor = to_variable(
np.full(
[batch_size, beam_size], eos_id, dtype="int64"))
noend_array = [-inf] * self.trg_vocab_size
noend_array[eos_id] = 0
noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
batch_pos = layers.expand(
layers.unsqueeze(
to_variable(np.arange(
0, batch_size, 1, dtype="int64")), [1]), [1, beam_size])
predict_ids = []
parent_ids = []
### initialize states of beam search ###
log_probs = to_variable(
np.array(
[[0.] + [-inf] * (beam_size - 1)] * batch_size,
dtype="float32"))
finished = fluid.layers.fill_constant(
shape=[batch_size, beam_size], value=0, dtype="bool")
trg_word = layers.fill_constant(
shape=[batch_size * beam_size, 1], dtype="int64", value=bos_id)
trg_src_attn_bias = merge_batch_beams(
expand_to_beam_size(trg_src_attn_bias, beam_size))
enc_output = merge_batch_beams(
expand_to_beam_size(enc_output, beam_size))
# init states (caches) for transformer, need to be updated according to selected beam
caches = [{
"k": layers.fill_constant(
shape=[batch_size, beam_size, self.n_head, 0, self.d_key],
dtype=enc_output.dtype,
value=0),
"v": layers.fill_constant(
shape=[batch_size, beam_size, self.n_head, 0, self.d_value],
dtype=enc_output.dtype,
value=0),
} for i in range(self.n_layer)]
for i in range(max_len):
trg_pos = layers.zeros_like(
trg_word) + i # TODO: modified for dygraph2static
caches = map_structure(merge_batch_beams,
caches) # TODO: modified for dygraph2static
logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
enc_output, caches)
caches = map_structure(split_batch_beams, caches)
step_log_probs = split_batch_beams(
fluid.layers.log(fluid.layers.softmax(logits)))
step_log_probs = mask_probs(step_log_probs, finished,
noend_mask_tensor)
log_probs = layers.elementwise_add(
x=step_log_probs, y=log_probs, axis=0)
log_probs = layers.reshape(log_probs,
[-1, beam_size * self.trg_vocab_size])
scores = log_probs
topk_scores, topk_indices = fluid.layers.topk(
input=scores, k=beam_size)
beam_indices = fluid.layers.elementwise_floordiv(topk_indices,
vocab_size_tensor)
token_indices = fluid.layers.elementwise_mod(topk_indices,
vocab_size_tensor)
# update states
caches = map_structure(lambda x: gather(x, beam_indices, batch_pos),
caches)
log_probs = gather(log_probs, topk_indices, batch_pos)
finished = gather(finished, beam_indices, batch_pos)
finished = layers.logical_or(
finished, layers.equal(token_indices, end_token_tensor))
trg_word = layers.reshape(token_indices, [-1, 1])
predict_ids.append(token_indices)
parent_ids.append(beam_indices)
if layers.reduce_all(finished).numpy():
break
predict_ids = layers.stack(predict_ids, axis=0)
parent_ids = layers.stack(parent_ids, axis=0)
finished_seq = layers.transpose(
layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
finished_scores = topk_scores
return finished_seq, finished_scores
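Two pieces of the new beam_search are worth unpacking. First, merge_batch_beams and split_batch_beams exist because the decoder consumes caches in a flat [batch * beam, ...] layout while the beam bookkeeping (gathering states by parent-beam index) needs [batch, beam, ...]; stripped of the transpose-and-[0]-reshape gymnastics that dygraph_to_static requires, they amount to this numpy round trip (shapes hypothetical):

```python
import numpy as np

batch, beam = 2, 4
state = np.zeros([batch, beam, 8, 0, 64])   # [batch, beam, n_head, t, d_key]

merged = state.reshape([batch * beam, *state.shape[2:]])  # merge_batch_beams
split = merged.reshape([batch, beam, *merged.shape[1:]])  # split_batch_beams
assert split.shape == state.shape
```

Second, the loop only records which token each beam emitted (predict_ids) and which parent beam it extended (parent_ids); layers.gather_tree reconstructs full sequences afterwards by walking the parent pointers backwards from the last step. A numpy sketch of that backtrace semantics:

```python
import numpy as np

def gather_tree(ids, parents):
    # ids, parents: [max_time, batch, beam]
    T, B, K = ids.shape
    out = np.empty_like(ids)
    for b in range(B):
        for k in range(K):
            cur = k  # start from the final beam slot
            for t in range(T - 1, -1, -1):
                out[t, b, k] = ids[t, b, cur]
                cur = parents[t, b, cur]
    return out
```

mask_probs, meanwhile, forces finished beams to keep emitting <eos> with log-probability 0 and everything else at -inf, so their accumulated scores stop changing once they finish.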
@@ -23,13 +23,13 @@ import paddle.fluid as fluid
import paddle.dataset.wmt16 as wmt16
-def get_input_descs(args):
+def get_input_descs(args, mode="train"):
batch_size = args.batch_size # TODO None(before)
seq_len = None
n_head = getattr(args, "n_head", 8)
d_model = getattr(args, "d_model", 512)
-input_descs = {
+input_descs_train = {
"src_word": [(batch_size, seq_len), "int64", 2],
"src_pos": [(batch_size, seq_len), "int64"],
"src_slf_attn_bias":
@@ -46,7 +46,24 @@ def get_input_descs(args):
"init_score": [(batch_size, 1), "float32", 2],
"init_idx": [(batch_size, ), "int32"],
}
-return input_descs
+input_descs_predict = {
+"src_word": [(batch_size, seq_len), "int64", 2],
+"src_pos": [(batch_size, seq_len), "int64"],
+"src_slf_attn_bias":
+[(batch_size, n_head, seq_len, seq_len), "float32"],
+"trg_word": [(batch_size, seq_len), "int64", 2],
+"trg_pos": [(batch_size, seq_len), "int64"],
+"trg_slf_attn_bias":
+[(batch_size, n_head, seq_len, seq_len), "float32"],
+"trg_src_attn_bias": [(batch_size, n_head, 1, seq_len), "float32"],
+"enc_output": [(batch_size, seq_len, d_model), "float32"],
+"lbl_word": [(None, 1), "int64"],
+"lbl_weight": [(None, 1), "float32"],
+"init_score": [(batch_size, 1), "float32", 2],
+"init_idx": [(batch_size, ), "int32"],
+}
+return input_descs_train if mode == "train" else input_descs_predict
encoder_data_input_fields = (
@@ -69,8 +86,8 @@ fast_decoder_data_input_fields = (
class ModelHyperParams(object):
print_step = 2
init_from_params = "trained_models/step_10/"
-save_model = "trained_models"
+save_dygraph_model_path = "dygraph_trained_models"
+save_static_model_path = "static_trained_models"
inference_model_dir = "infer_model"
output_file = "predict.txt"
batch_size = 5
@@ -82,10 +99,10 @@ class ModelHyperParams(object):
warmup_steps = 8000
label_smooth_eps = 0.1
beam_size = 5
-max_out_len = 256
+max_out_len = 5 # small number to avoid the unittest timeout
n_best = 1
-src_vocab_size = 10000
-trg_vocab_size = 10000
+src_vocab_size = 36556
+trg_vocab_size = 36556
bos_idx = 0 # index for <bos> token
eos_idx = 1 # index for <eos> token
unk_idx = 2 # index for <unk> token
@@ -214,7 +231,7 @@ def get_feed_data_reader(args, mode='train'):
def __for_test__():
test_reader = paddle.batch(
-wmt16.train(args.src_vocab_size, args.trg_vocab_size),
+wmt16.test(args.src_vocab_size, args.trg_vocab_size),
batch_size=args.batch_size)
for batch in test_reader():
tensors = prepare_infer_input(batch, args.eos_idx, args.eos_idx,
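Finally, get_feed_data_reader's test branch previously fed prediction from the wmt16 training split; it now reads the actual test split. A hypothetical sanity check, assuming paddle.dataset.wmt16's reader API:

```python
import paddle
import paddle.dataset.wmt16 as wmt16

# Vocab sizes must match ModelHyperParams above (36556 after this change).
reader = paddle.batch(wmt16.test(36556, 36556), batch_size=5)
batch = next(reader())  # a list of (src_ids, trg_ids, trg_next_ids) tuples
```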