Unverified commit b7b0b359, authored by liym27, committed by GitHub

Add unittest for transformer prediction in dygraph_to_static (#23207)

* Add unittest for transformer prediction in dygraph_to_static.

* fix bug in fill_constant api. 

* Make transpose support size 0. test=develop
Parent 738c8464
@@ -667,6 +667,9 @@ class TransposeGPUKernel : public framework::OpKernel<T> {
    auto* x = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());
+   if (out->numel() == 0) {
+     return;
+   }
    std::vector<int> axis = context.Attr<std::vector<int>>("axis");
    int ndims = axis.size();
@@ -688,6 +691,9 @@ class TransposeGradGPUKernel : public framework::OpKernel<T> {
    if (!x_grad) return;
    x_grad->mutable_data<T>(context.GetPlace());
+   if (x_grad->numel() == 0) {
+     return;
+   }
    std::vector<int> axis = context.Attr<std::vector<int>>("axis");
    std::vector<int> reversed_axis(axis);
...
@@ -64,7 +64,9 @@ class TransposeKernel : public framework::OpKernel<T> {
    auto* x = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());
+   if (out->numel() == 0) {
+     return;
+   }
    std::vector<int> axis = context.Attr<std::vector<int>>("axis");
    int ndims = axis.size();
    auto& dev_ctx = context.template device_context<DeviceContext>();
@@ -83,6 +85,10 @@ class TransposeGradKernel : public framework::OpKernel<T> {
    if (!x_grad) return;
    x_grad->mutable_data<T>(context.GetPlace());
+   if (x_grad->numel() == 0) {
+     return;
+   }
    std::vector<int> axis = context.Attr<std::vector<int>>("axis");
    std::vector<int> reversed_axis(axis);
...
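For context on why these early returns are needed: the new beam_search below initializes its attention caches with a zero-length time axis (shape [..., 0, d_key]), and a transpose over such a tensor was not handled before this change. A minimal sketch of the now-supported case, assuming the dygraph API of this Paddle version:

import numpy as np
import paddle.fluid as fluid

# Sketch: transposing a tensor with numel() == 0 should now be a no-op
# instead of an error; the shapes below are illustrative only.
with fluid.dygraph.guard():
    x = fluid.dygraph.to_variable(np.zeros([5, 0, 4], dtype="float32"))
    y = fluid.layers.transpose(x, perm=[2, 1, 0])
    print(y.shape)  # expected: [4, 0, 5]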
@@ -594,13 +594,12 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
                shape))
        else:
            shape = list(shape.numpy().astype(int))
-        dtype = convert_np_dtype_to_dtype_(dtype)
        if out is None:
            out = _varbase_creator(dtype=dtype)
        core.ops.fill_constant(out, 'value',
                               float(value), 'force_cpu', force_cpu, 'dtype',
-                              dtype, 'str_value', attrs['str_value'], 'shape',
-                              shape)
+                              out.dtype, 'str_value', attrs['str_value'],
+                              'shape', shape)
        out.stop_gradient = True
        return out
...
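This hunk changes the dygraph fast path so the kernel's dtype attribute is read from the output variable itself (out.dtype) rather than from a re-converted dtype argument, keeping the result consistent with the variable that actually receives it. A hedged sketch of the path being fixed, assuming a Variable-valued shape as the diff's shape.numpy() handling implies:

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

# Sketch: in dygraph mode the shape may arrive as a Variable and is
# converted via shape.numpy(); the dtype attr then comes from `out`.
with fluid.dygraph.guard():
    shape = fluid.dygraph.to_variable(np.array([2, 3], dtype="int32"))
    x = layers.fill_constant(shape=shape, dtype="int64", value=7)
    print(x.numpy().dtype)  # expected: int64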
@@ -21,6 +21,7 @@ import unittest
import paddle.fluid as fluid
import transformer_util as util
+from transformer_dygraph_model import position_encoding_init
from transformer_dygraph_model import Transformer
from transformer_dygraph_model import CrossEntropyCriterion
@@ -31,8 +32,8 @@ SEED = 10
def train_static(args, batch_generator):
-    train_prog = fluid.default_main_program()
-    startup_prog = fluid.default_startup_program()
+    train_prog = fluid.Program()
+    startup_prog = fluid.Program()
    train_prog.random_seed = SEED
    startup_prog.random_seed = SEED
    with fluid.program_guard(train_prog, startup_prog):
@@ -117,9 +118,9 @@ def train_static(args, batch_generator):
            step_idx += 1
            total_batch_num = total_batch_num + 1
            if step_idx == 10:
-                if args.save_model:
-                    model_path = os.path.join(
-                        args.save_model, "step_" + str(step_idx), "transformer")
+                if args.save_dygraph_model_path:
+                    model_path = os.path.join(args.save_static_model_path,
+                                              "transformer")
                    fluid.save(train_prog, model_path)
                break
    return np.array(avg_loss)
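The switch from the process-wide default programs to fresh fluid.Program() objects keeps the test hermetic: the seeds set here cannot leak into (or from) other tests running in the same process. A minimal sketch of the pattern, with a toy network standing in for the Transformer:

import paddle.fluid as fluid

# Sketch: seed fresh programs, then build the graph under program_guard so
# nothing is appended to fluid.default_main_program().
train_prog = fluid.Program()
startup_prog = fluid.Program()
train_prog.random_seed = 10
startup_prog.random_seed = 10
with fluid.program_guard(train_prog, startup_prog):
    x = fluid.data(name="x", shape=[None, 8], dtype="float32")
    y = fluid.layers.fc(input=x, size=4)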
@@ -201,9 +202,8 @@ def train_dygraph(args, batch_generator):
                batch_id += 1
                step_idx += 1
                if step_idx == 10:
-                    if args.save_model:
-                        model_dir = os.path.join(args.save_model + '_dygraph',
-                                                 "step_" + str(step_idx))
+                    if args.save_dygraph_model_path:
+                        model_dir = os.path.join(args.save_dygraph_model_path)
                        if not os.path.exists(model_dir):
                            os.makedirs(model_dir)
                        fluid.save_dygraph(
@@ -218,18 +218,143 @@ def train_dygraph(args, batch_generator):
    return np.array(avg_loss)
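The dygraph branch now saves directly into args.save_dygraph_model_path so that predict_dygraph below can reload it. A sketch of the save/load round trip the test relies on, using a toy layer in place of the Transformer and a hypothetical path:

import os
import paddle.fluid as fluid

with fluid.dygraph.guard():
    model = fluid.dygraph.Linear(4, 2)
    path = os.path.join("dygraph_trained_models", "transformer")
    # state_dict -> <path>.pdparams on disk
    fluid.save_dygraph(model.state_dict(), path)
    # load_dygraph returns (param_dict, optimizer_dict)
    state_dict, _ = fluid.load_dygraph(path)
    model.set_dict(state_dict)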
def predict_dygraph(args, batch_generator):
    with fluid.dygraph.guard(place):
        fluid.default_main_program().random_seed = SEED

        # define data loader
        test_loader = fluid.io.DataLoader.from_generator(capacity=10)
        test_loader.set_batch_generator(batch_generator, places=place)

        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
            args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)

        # load the trained model
        model_dict, _ = util.load_dygraph(
            os.path.join(args.save_dygraph_model_path, "transformer"))
        # to avoid a longer length than training, reset the size of position
        # encoding to max_length
        model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        transformer.load_dict(model_dict)

        # set evaluate mode
        transformer.eval()

        step_idx = 0
        for input_data in test_loader():
            (src_word, src_pos, src_slf_attn_bias, trg_word,
             trg_src_attn_bias) = input_data
            finished_seq, finished_scores = transformer.beam_search(
                src_word,
                src_pos,
                src_slf_attn_bias,
                trg_word,
                trg_src_attn_bias,
                bos_id=args.bos_idx,
                eos_id=args.eos_idx,
                beam_size=args.beam_size,
                max_len=args.max_out_len)
            finished_seq = finished_seq.numpy()
            finished_scores = finished_scores.numpy()
            step_idx += 1
            if step_idx == 10:
                break
    return finished_seq
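predict_dygraph resets the position-encoding weights so that decoding up to max_length + 1 positions works even if the saved model trained on shorter sequences. position_encoding_init is imported above; a plausible sketch of what such a helper computes, assuming the standard sinusoidal table (the real helper in transformer_dygraph_model.py may differ in detail):

import numpy as np

def sinusoid_position_encoding(n_position, d_pos_vec):
    # channel pairs share a frequency: 0, 0, 2, 2, 4, 4, ...
    channels = np.arange(d_pos_vec) // 2 * 2
    rates = np.power(10000.0, -channels / d_pos_vec)
    table = np.arange(n_position)[:, None] * rates[None, :]
    table[:, 0::2] = np.sin(table[:, 0::2])  # even channels: sine
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd channels: cosine
    return table.astype("float32")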
def predict_static(args, batch_generator):
    test_prog = fluid.Program()
    with fluid.program_guard(test_prog):
        test_prog.random_seed = SEED

        # define input and reader
        input_field_names = util.encoder_data_input_fields + \
            util.fast_decoder_data_input_fields
        input_descs = util.get_input_descs(args, 'test')
        input_slots = [{
            "name": name,
            "shape": input_descs[name][0],
            "dtype": input_descs[name][1]
        } for name in input_field_names]
        input_field = util.InputField(input_slots)
        feed_list = input_field.feed_list
        loader = fluid.io.DataLoader.from_generator(
            feed_list=feed_list, capacity=10)

        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
            args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)

        out_ids, out_scores = transformer.beam_search(
            *feed_list,
            bos_id=args.bos_idx,
            eos_id=args.eos_idx,
            beam_size=args.beam_size,
            max_len=args.max_out_len)

    # This is used here to set dropout to the test mode.
    test_prog = test_prog.clone(for_test=True)

    # define the executor and program for prediction
    exe = fluid.Executor(place)
    util.load(test_prog,
              os.path.join(args.save_static_model_path, "transformer"), exe)

    loader.set_batch_generator(batch_generator, places=place)

    step_idx = 0
    for feed_dict in loader:
        seq_ids, seq_scores = exe.run(
            test_prog,
            feed=feed_dict,
            fetch_list=[out_ids.name, out_scores.name],
            return_numpy=True)
        step_idx += 1
        if step_idx == 10:
            break
    return seq_ids
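util.load is a test helper; presumably it wraps fluid.load to restore the persistables that train_static stored with fluid.save(train_prog, model_path). A hedged sketch of that assumption:

import paddle.fluid as fluid

# Hypothetical wrapper: restore program parameters saved under the given
# prefix; the real helper in transformer_util.py may add error handling.
def load(program, model_path, executor):
    fluid.load(program=program, model_path=model_path, executor=executor)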
class TestTransformer(unittest.TestCase):
    def prepare(self, mode='train'):
        args = util.ModelHyperParams()
        batch_generator = util.get_feed_data_reader(args, mode)
        return args, batch_generator

-    def test_train(self):
+    def _test_train(self):
        args, batch_generator = self.prepare(mode='train')
        static_avg_loss = train_static(args, batch_generator)
        dygraph_avg_loss = train_dygraph(args, batch_generator)
        self.assertTrue(np.allclose(static_avg_loss, dygraph_avg_loss))

    def _test_predict(self):
        args, batch_generator = self.prepare(mode='test')
        static_res = predict_static(args, batch_generator)
        dygraph_res = predict_dygraph(args, batch_generator)
        self.assertTrue(
            np.allclose(static_res, dygraph_res),
            msg="static_res: {} \n dygraph_res: {}".format(static_res,
                                                           dygraph_res))

    def test_check_result(self):
        self._test_train()
        self._test_predict()


if __name__ == '__main__':
    unittest.main()
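The rename from test_train to _test_train matters: the underscore keeps the two phases out of unittest's auto-discovery, so test_check_result alone controls the order, and prediction only runs after training has saved the checkpoints it loads. The same pattern in isolation:

import unittest

# Underscore-prefixed methods are not collected by unittest; the single
# public test runs them in a fixed order with shared state.
class OrderedSuite(unittest.TestCase):
    def _step_one(self):
        self.value = 1

    def _step_two(self):
        self.assertEqual(self.value, 1)

    def test_all_steps(self):
        self._step_one()
        self._step_two()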
@@ -18,8 +18,10 @@ import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
-from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
from paddle.fluid.dygraph.jit import dygraph_to_static_func
+from paddle.fluid.layers.utils import map_structure
+from paddle.fluid.framework import Program, Block, Variable, _dygraph_tracer, dygraph_only, _dygraph_guard, _current_expected_place, in_dygraph_mode


def position_encoding_init(n_position, d_pos_vec):
@@ -486,3 +488,169 @@ class Transformer(Layer):
        predict = self.decoder(trg_word, trg_pos, trg_slf_attn_bias,
                               trg_src_attn_bias, enc_output)
        return predict
    @dygraph_to_static_func
    def beam_search(self,
                    src_word,
                    src_pos,
                    src_slf_attn_bias,
                    trg_word,
                    trg_src_attn_bias,
                    bos_id=0,
                    eos_id=1,
                    beam_size=4,
                    max_len=256):
        def expand_to_beam_size(tensor, beam_size):
            tensor = layers.reshape(
                tensor, [tensor.shape[0], 1] + list(tensor.shape[1:]))
            tile_dims = [1] * len(tensor.shape)
            tile_dims[1] = beam_size
            return layers.expand(tensor, tile_dims)

        def merge_batch_beams(tensor):
            var_dim_in_state = 2  # count in beam dim
            tensor = layers.transpose(
                tensor,
                list(range(var_dim_in_state, len(tensor.shape))) +
                list(range(0, var_dim_in_state)))
            tensor = layers.reshape(
                tensor, [0] * (len(tensor.shape) - var_dim_in_state) +
                [batch_size * beam_size])
            res = layers.transpose(
                tensor,
                list(
                    range((len(tensor.shape) + 1 - var_dim_in_state),
                          len(tensor.shape))) +
                list(range(0, (len(tensor.shape) + 1 - var_dim_in_state))))
            return res

        def split_batch_beams(tensor):
            var_dim_in_state = 1
            tensor = layers.transpose(
                tensor,
                list(range(var_dim_in_state, len(tensor.shape))) +
                list(range(0, var_dim_in_state)))
            tensor = layers.reshape(
                tensor, [0] * (len(tensor.shape) - var_dim_in_state) +
                [batch_size, beam_size])
            res = layers.transpose(
                tensor,
                list(
                    range((len(tensor.shape) - 1 - var_dim_in_state),
                          len(tensor.shape))) +
                list(range(0, (len(tensor.shape) - 1 - var_dim_in_state))))
            return res

        def mask_probs(probs, finished, noend_mask_tensor):
            finished = layers.cast(finished, dtype=probs.dtype)
            probs = layers.elementwise_mul(
                layers.expand(
                    layers.unsqueeze(finished, [2]),
                    [1, 1, self.trg_vocab_size]),
                noend_mask_tensor,
                axis=-1) - layers.elementwise_mul(
                    probs, (finished - 1), axis=0)
            return probs

        def gather(input, indices, batch_pos):
            topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
            return layers.gather_nd(input, topk_coordinates)

        # run encoder
        enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)
        batch_size = enc_output.shape[0]

        # constant number
        inf = float(1. * 1e7)
        max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
        vocab_size_tensor = layers.fill_constant(
            shape=[1], dtype="int64", value=self.trg_vocab_size)
        end_token_tensor = to_variable(
            np.full(
                [batch_size, beam_size], eos_id, dtype="int64"))
        noend_array = [-inf] * self.trg_vocab_size
        noend_array[eos_id] = 0
        noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
        batch_pos = layers.expand(
            layers.unsqueeze(
                to_variable(np.arange(
                    0, batch_size, 1, dtype="int64")), [1]), [1, beam_size])
        predict_ids = []
        parent_ids = []

        ### initialize states of beam search ###
        log_probs = to_variable(
            np.array(
                [[0.] + [-inf] * (beam_size - 1)] * batch_size,
                dtype="float32"))
        finished = fluid.layers.fill_constant(
            shape=[batch_size, beam_size], value=0, dtype="bool")
        trg_word = layers.fill_constant(
            shape=[batch_size * beam_size, 1], dtype="int64", value=bos_id)
        trg_src_attn_bias = merge_batch_beams(
            expand_to_beam_size(trg_src_attn_bias, beam_size))
        enc_output = merge_batch_beams(
            expand_to_beam_size(enc_output, beam_size))
        # init states (caches) for transformer, need to be updated according to selected beam
        caches = [{
            "k": layers.fill_constant(
                shape=[batch_size, beam_size, self.n_head, 0, self.d_key],
                dtype=enc_output.dtype,
                value=0),
            "v": layers.fill_constant(
                shape=[batch_size, beam_size, self.n_head, 0, self.d_value],
                dtype=enc_output.dtype,
                value=0),
        } for i in range(self.n_layer)]

        for i in range(max_len):
            trg_pos = layers.zeros_like(
                trg_word) + i  # TODO: modified for dygraph2static
            caches = map_structure(merge_batch_beams,
                                   caches)  # TODO: modified for dygraph2static
            logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                                  enc_output, caches)
            caches = map_structure(split_batch_beams, caches)
            step_log_probs = split_batch_beams(
                fluid.layers.log(fluid.layers.softmax(logits)))
            step_log_probs = mask_probs(step_log_probs, finished,
                                        noend_mask_tensor)
            log_probs = layers.elementwise_add(
                x=step_log_probs, y=log_probs, axis=0)
            log_probs = layers.reshape(log_probs,
                                       [-1, beam_size * self.trg_vocab_size])
            scores = log_probs
            topk_scores, topk_indices = fluid.layers.topk(
                input=scores, k=beam_size)
            beam_indices = fluid.layers.elementwise_floordiv(topk_indices,
                                                             vocab_size_tensor)
            token_indices = fluid.layers.elementwise_mod(topk_indices,
                                                         vocab_size_tensor)

            # update states
            caches = map_structure(
                lambda x: gather(x, beam_indices, batch_pos), caches)
            log_probs = gather(log_probs, topk_indices, batch_pos)
            finished = gather(finished, beam_indices, batch_pos)
            finished = layers.logical_or(
                finished, layers.equal(token_indices, end_token_tensor))
            trg_word = layers.reshape(token_indices, [-1, 1])

            predict_ids.append(token_indices)
            parent_ids.append(beam_indices)

            if layers.reduce_all(finished).numpy():
                break

        predict_ids = layers.stack(predict_ids, axis=0)
        parent_ids = layers.stack(parent_ids, axis=0)
        finished_seq = layers.transpose(
            layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
        finished_scores = topk_scores

        return finished_seq, finished_scores
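A worked example of the flattened top-k bookkeeping in the loop above: scores are reshaped to [batch, beam_size * vocab_size], top-k picks flat indices, and floor division and modulo recover the source beam and the emitted token (toy sizes for illustration):

import numpy as np

vocab_size, beam_size = 5, 2
log_probs = np.array([[-0.1, -9., -9., -9., -9.,     # beam 0
                       -9., -0.2, -9., -0.3, -9.]])  # beam 1
flat_idx = np.argsort(-log_probs[0])[:beam_size]  # [0, 6]
beam_indices = flat_idx // vocab_size             # [0, 1]: source beams
token_indices = flat_idx % vocab_size             # [0, 1]: emitted tokens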
@@ -23,13 +23,13 @@ import paddle.fluid as fluid
import paddle.dataset.wmt16 as wmt16


-def get_input_descs(args):
+def get_input_descs(args, mode="train"):
    batch_size = args.batch_size  # TODO None(before)
    seq_len = None
    n_head = getattr(args, "n_head", 8)
    d_model = getattr(args, "d_model", 512)

-    input_descs = {
+    input_descs_train = {
        "src_word": [(batch_size, seq_len), "int64", 2],
        "src_pos": [(batch_size, seq_len), "int64"],
        "src_slf_attn_bias":
@@ -46,7 +46,24 @@ def get_input_descs(args):
        "init_score": [(batch_size, 1), "float32", 2],
        "init_idx": [(batch_size, ), "int32"],
    }
-    return input_descs
+    input_descs_predict = {
+        "src_word": [(batch_size, seq_len), "int64", 2],
+        "src_pos": [(batch_size, seq_len), "int64"],
+        "src_slf_attn_bias":
+        [(batch_size, n_head, seq_len, seq_len), "float32"],
+        "trg_word": [(batch_size, seq_len), "int64", 2],
+        "trg_pos": [(batch_size, seq_len), "int64"],
+        "trg_slf_attn_bias":
+        [(batch_size, n_head, seq_len, seq_len), "float32"],
+        "trg_src_attn_bias": [(batch_size, n_head, 1, seq_len), "float32"],
+        "enc_output": [(batch_size, seq_len, d_model), "float32"],
+        "lbl_word": [(None, 1), "int64"],
+        "lbl_weight": [(None, 1), "float32"],
+        "init_score": [(batch_size, 1), "float32", 2],
+        "init_idx": [(batch_size, ), "int32"],
+    }
+
+    return input_descs_train if mode == "train" else input_descs_predict
encoder_data_input_fields = (
@@ -69,8 +86,8 @@ fast_decoder_data_input_fields = (
class ModelHyperParams(object):
    print_step = 2
-    init_from_params = "trained_models/step_10/"
-    save_model = "trained_models"
+    save_dygraph_model_path = "dygraph_trained_models"
+    save_static_model_path = "static_trained_models"
    inference_model_dir = "infer_model"
    output_file = "predict.txt"
    batch_size = 5
@@ -82,10 +99,10 @@ class ModelHyperParams(object):
    warmup_steps = 8000
    label_smooth_eps = 0.1
    beam_size = 5
-    max_out_len = 256
+    max_out_len = 5  # small number to avoid the unittest timeout
    n_best = 1
-    src_vocab_size = 10000
-    trg_vocab_size = 10000
+    src_vocab_size = 36556
+    trg_vocab_size = 36556
    bos_idx = 0  # index for <bos> token
    eos_idx = 1  # index for <eos> token
    unk_idx = 2  # index for <unk> token
@@ -214,7 +231,7 @@ def get_feed_data_reader(args, mode='train'):
    def __for_test__():
        test_reader = paddle.batch(
-            wmt16.train(args.src_vocab_size, args.trg_vocab_size),
+            wmt16.test(args.src_vocab_size, args.trg_vocab_size),
            batch_size=args.batch_size)
        for batch in test_reader():
            tensors = prepare_infer_input(batch, args.eos_idx, args.eos_idx,
...