Unverified commit e15f205a, authored by Guo Sheng, committed by GitHub

Update PaddleMT to Paddle-1.7 API (#4302)

* Update PaddleMT/transformer to use new while_loop API.

* Update PaddleMT/transformer to the new save/load API.

* Change d_key to d_value in cached keys.

* Update PaddleMT/transformer inference_model to use new load API.

* Make Transformer support loading py2-saved models.
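For context, a minimal self-contained sketch of the Paddle 1.7 `layers.while_loop` style this commit migrates to (the loop itself is illustrative, not the Transformer decode loop):

```python
# Minimal sketch of layers.while_loop (Paddle 1.7 static graph), illustrative only.
import paddle.fluid as fluid
import paddle.fluid.layers as layers

limit = layers.fill_constant(shape=[1], dtype='int64', value=10)
step = layers.fill_constant(shape=[1], dtype='int64', value=0)

def cond(step):
    # keep looping while step < limit; cond receives the loop vars
    return layers.less_than(x=step, y=limit)

def body(step):
    # body must return a list matching the structure of loop_vars
    return [layers.increment(x=step, value=1, in_place=False)]

out = layers.while_loop(cond=cond, body=body, loop_vars=[step])

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
print(exe.run(fetch_list=[out[0]]))  # [array([10])]
```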
Parent e17aa944
......@@ -106,7 +106,7 @@ python -u main.py \
--prepostprocess_dropout 0.3
```
Training uses all GPUs by default; the number of GPUs used can be set through the `CUDA_VISIBLE_DEVICES` environment variable. Training can also run on CPU only (set with `--use_cuda False`), which is comparatively slow. If `save_param` and `save_checkpoint` are provided (defaulting to trained_params and trained_ckpts), the current parameter values and a checkpoint are saved to the corresponding directories every certain number of iterations (set with `save_step`, default 10000), and every certain number of iterations (set with `print_step`, default 100) a log like the following is printed to standard output:
Training uses all GPUs by default; the number of GPUs used can be set through the `CUDA_VISIBLE_DEVICES` environment variable. Training can also run on CPU only (set with `--use_cuda False`), which is comparatively slow. If `save_model_path` is provided (default saved_models), a checkpoint of the current training state is saved to that directory every certain number of iterations (set with `save_step`, default 10000); each checkpoint consists of two files, `transformer.pdparams` and `transformer.pdopt`, recording the model parameters and the optimizer state respectively. Every certain number of iterations (set with `print_step`, default 100), a log like the following is printed to standard output:
```txt
[2019-08-02 15:30:51,656 INFO train.py:262] step_idx: 150100, epoch: 32, batch: 1364, avg loss: 2.880427, normalized loss: 1.504687, ppl: 17.821888, speed: 3.34 step/s
......@@ -195,7 +195,7 @@ BLEU = 26.35, 57.7/32.1/20.0/13.0 (BP=1.000, ratio=1.013, hyp_len=63903, ref_len
### Pretrained models
Model parameters for the [base model](https://transformer-res.bj.bcebos.com/base_model_params.tar.gz) and [big model](https://transformer-res.bj.bcebos.com/big_model_params.tar.gz) matching the BLEU scores above are provided for download (note that the models were trained and tested on the downloadable data provided here).
Model parameters for the [base model](https://transformer-res.bj.bcebos.com/base_model_graph.tar.gz) and [big model](https://transformer-res.bj.bcebos.com/big_model_graph.tar.gz) matching the BLEU scores above are provided for download (note that the models were trained and tested on the downloadable data provided here).
## Advanced usage
......
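A hedged sketch of the checkpoint cycle the updated README describes, using a toy network in place of the Transformer; `fluid.save` writes the `.pdparams`/`.pdopt` pair and `fluid.load` restores it:

```python
# Illustrative checkpoint round trip with the Paddle 1.7 save/load API.
import os
import paddle.fluid as fluid

# toy network standing in for the Transformer program
x = fluid.data(name="x", shape=[None, 4], dtype="float32")
loss = fluid.layers.reduce_mean(fluid.layers.fc(input=x, size=2))
fluid.optimizer.Adam(learning_rate=1e-3).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

model_path = os.path.join("saved_models", "step_10000", "transformer")
# writes model_path + ".pdparams" (parameters) and ".pdopt" (optimizer state)
fluid.save(fluid.default_main_program(), model_path)
# restores both files, resuming parameters and optimizer state
fluid.load(fluid.default_main_program(), model_path, exe)
```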
......@@ -24,6 +24,7 @@ import paddle.fluid as fluid
from utils.input_field import InputField
from utils.configure import PDConfig
from utils.load import load
# include task-specific libs
import desc
......@@ -31,51 +32,6 @@ import reader
from transformer import create_net
def init_from_pretrain_model(args, exe, program):

    assert isinstance(args.init_from_pretrain_model, str)

    if not os.path.exists(args.init_from_pretrain_model):
        raise Warning("The pretrained params do not exist.")
        return False

    def existed_params(var):
        if not isinstance(var, fluid.framework.Parameter):
            return False
        return os.path.exists(
            os.path.join(args.init_from_pretrain_model, var.name))

    fluid.io.load_vars(
        exe,
        args.init_from_pretrain_model,
        main_program=program,
        predicate=existed_params)

    print("finish initing model from pretrained params from %s" %
          (args.init_from_pretrain_model))

    return True


def init_from_params(args, exe, program):

    assert isinstance(args.init_from_params, str)

    if not os.path.exists(args.init_from_params):
        raise Warning("the params path does not exist.")
        return False

    fluid.io.load_params(
        executor=exe,
        dirname=args.init_from_params,
        main_program=program,
        filename="params.pdparams")

    print("finish init model from params from %s" % (args.init_from_params))

    return True
def do_save_inference_model(args):
    if args.use_cuda:
        dev_count = fluid.core.get_cuda_device_count()
......@@ -84,6 +40,11 @@ def do_save_inference_model(args):
        dev_count = int(os.environ.get('CPU_NUM', 1))
        place = fluid.CPUPlace()

    src_vocab = reader.DataProcessor.load_dict(args.src_vocab_fpath)
    trg_vocab = reader.DataProcessor.load_dict(args.trg_vocab_fpath)
    args.src_vocab_size = len(src_vocab)
    args.trg_vocab_size = len(trg_vocab)

    test_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()
......@@ -119,24 +80,20 @@ def do_save_inference_model(args):
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    assert (args.init_from_params) or (args.init_from_pretrain_model)

    if args.init_from_params:
        init_from_params(args, exe, test_prog)

    elif args.init_from_pretrain_model:
        init_from_pretrain_model(args, exe, test_prog)

    assert (
        args.init_from_params), "must set init_from_params to load parameters"
    load(test_prog, os.path.join(args.init_from_params, "transformer"), exe)
    print("finish initing model from params from %s" % (args.init_from_params))

    # saving inference model
    fluid.io.save_inference_model(
        args.inference_model_dir,
        feeded_var_names=input_field_names,
        target_vars=[out_ids, out_scores],
        executor=exe,
        main_program=test_prog,
        model_filename="model.pdmodel",
        params_filename="params.pdparams")
    fluid.io.save_inference_model(args.inference_model_dir,
                                  feeded_var_names=list(input_field_names),
                                  target_vars=[out_ids, out_scores],
                                  executor=exe,
                                  main_program=test_prog,
                                  model_filename="model.pdmodel",
                                  params_filename="params.pdparams")

    print("save inference model at %s" % (args.inference_model_dir))
......
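For completeness, a hedged sketch of how the saved inference model would be consumed, using the `model.pdmodel`/`params.pdparams` filenames set above (the feed data is illustrative):

```python
# Illustrative: deploying the inference model saved by do_save_inference_model.
import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
[inference_program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    dirname="infer_model",  # args.inference_model_dir
    executor=exe,
    model_filename="model.pdmodel",
    params_filename="params.pdparams")
# feed_names matches input_field_names; fetch_targets are [out_ids, out_scores]
# results = exe.run(inference_program,
#                   feed=dict(zip(feed_names, feed_data)),  # feed_data: your inputs
#                   fetch_list=fetch_targets)
```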
......@@ -25,6 +25,7 @@ import paddle.fluid as fluid
from utils.input_field import InputField
from utils.configure import PDConfig
from utils.check import check_gpu, check_version
from utils.load import load
# include task-specific libs
import desc
......@@ -32,51 +33,6 @@ import reader
from transformer import create_net, position_encoding_init
def init_from_pretrain_model(args, exe, program):

    assert isinstance(args.init_from_pretrain_model, str)

    if not os.path.exists(args.init_from_pretrain_model):
        raise Warning("The pretrained params do not exist.")
        return False

    def existed_params(var):
        if not isinstance(var, fluid.framework.Parameter):
            return False
        return os.path.exists(
            os.path.join(args.init_from_pretrain_model, var.name))

    fluid.io.load_vars(
        exe,
        args.init_from_pretrain_model,
        main_program=program,
        predicate=existed_params)

    print("finish initing model from pretrained params from %s" %
          (args.init_from_pretrain_model))

    return True


def init_from_params(args, exe, program):

    assert isinstance(args.init_from_params, str)

    if not os.path.exists(args.init_from_params):
        raise Warning("the params path does not exist.")
        return False

    fluid.io.load_params(
        executor=exe,
        dirname=args.init_from_params,
        main_program=program,
        filename="params.pdparams")

    print("finish init model from params from %s" % (args.init_from_params))

    return True
def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, output_eos=False):
    """
    Post-process the beam-search decoded sequence. Truncate from the first
......@@ -160,13 +116,10 @@ def do_predict(args):
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    assert (args.init_from_params) or (args.init_from_pretrain_model)

    if args.init_from_params:
        init_from_params(args, exe, test_prog)

    elif args.init_from_pretrain_model:
        init_from_pretrain_model(args, exe, test_prog)

    assert (
        args.init_from_params), "must set init_from_params to load parameters"
    load(test_prog, os.path.join(args.init_from_params, "transformer"), exe)
    print("finish initing model from params from %s" % (args.init_from_params))
    # to avoid a longer length than training, reset the size of position encoding to max_length
    for pos_enc_param_name in desc.pos_enc_param_names:
......
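The truncated hunk above resets the position-encoding tables so prediction can exceed the training length. A hedged sketch of that step, relying on names from this file (`desc`, `args`, and `position_encoding_init`, which in this repo returns an `[n_position, d_model]` numpy array):

```python
# Illustrative: overwrite each position-encoding parameter with a larger table.
place = fluid.CPUPlace()
for pos_enc_param_name in desc.pos_enc_param_names:
    pos_enc = fluid.global_scope().find_var(pos_enc_param_name).get_tensor()
    # regenerate encodings up to the prediction-time maximum length
    pos_enc.set(
        position_encoding_init(args.max_length + 1, args.d_model), place)
```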
......@@ -27,6 +27,7 @@ import utils.dist_utils as dist_utils
from utils.input_field import InputField
from utils.configure import PDConfig
from utils.check import check_gpu, check_version
from utils.load import load
# include task-specific libs
import desc
......@@ -39,91 +40,6 @@ if os.environ.get('FLAGS_eager_delete_tensor_gb', None) is None:
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
def init_from_pretrain_model(args, exe, program):

    assert isinstance(args.init_from_pretrain_model, str)

    if not os.path.exists(args.init_from_pretrain_model):
        raise Warning("The pretrained params do not exist.")
        return False

    def existed_params(var):
        if not isinstance(var, fluid.framework.Parameter):
            return False
        return os.path.exists(
            os.path.join(args.init_from_pretrain_model, var.name))

    fluid.io.load_vars(
        exe,
        args.init_from_pretrain_model,
        main_program=program,
        predicate=existed_params)

    print("finish initing model from pretrained params from %s" %
          (args.init_from_pretrain_model))

    return True


def init_from_checkpoint(args, exe, program):

    assert isinstance(args.init_from_checkpoint, str)

    if not os.path.exists(args.init_from_checkpoint):
        raise Warning("the checkpoint path does not exist.")
        return False

    fluid.io.load_persistables(
        executor=exe,
        dirname=args.init_from_checkpoint,
        main_program=program,
        filename="checkpoint.pdckpt")

    print("finish initing model from checkpoint from %s" %
          (args.init_from_checkpoint))

    return True


def save_checkpoint(args, exe, program, dirname):

    assert isinstance(args.save_model_path, str)

    checkpoint_dir = os.path.join(args.save_model_path, args.save_checkpoint)

    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    fluid.io.save_persistables(
        exe,
        os.path.join(checkpoint_dir, dirname),
        main_program=program,
        filename="checkpoint.pdparams")

    print("save checkpoint at %s" % (os.path.join(checkpoint_dir, dirname)))

    return True


def save_param(args, exe, program, dirname):

    assert isinstance(args.save_model_path, str)

    param_dir = os.path.join(args.save_model_path, args.save_param)

    if not os.path.exists(param_dir):
        os.mkdir(param_dir)

    fluid.io.save_params(
        exe,
        os.path.join(param_dir, dirname),
        main_program=program,
        filename="params.pdparams")

    print("save parameters at %s" % (os.path.join(param_dir, dirname)))

    return True
def do_train(args):
    if args.use_cuda:
        if num_trainers > 1:  # for multi-process gpu training
......@@ -226,11 +142,17 @@ def do_train(args):
    ## init from some checkpoint, to resume the previous training
    if args.init_from_checkpoint:
        init_from_checkpoint(args, exe, train_prog)
        load(train_prog, os.path.join(args.init_from_checkpoint, "transformer"),
             exe)
        print("finish initing model from checkpoint from %s" %
              (args.init_from_checkpoint))

    ## init from some pretrain models, to better solve the current task
    if args.init_from_pretrain_model:
        init_from_pretrain_model(args, exe, train_prog)
        load(train_prog,
             os.path.join(args.init_from_pretrain_model, "transformer"), exe)
        print("finish initing model from pretrained params from %s" %
              (args.init_from_pretrain_model))
    build_strategy = fluid.compiler.BuildStrategy()
    build_strategy.enable_inplace = True
......@@ -293,14 +215,12 @@ def do_train(args):
                avg_batch_time = time.time()

            if step_idx % args.save_step == 0 and step_idx != 0:
                if args.save_model_path:
                    model_path = os.path.join(args.save_model_path,
                                              "step_" + str(step_idx),
                                              "transformer")
                    fluid.save(train_prog, model_path)
                if args.save_checkpoint:
                    save_checkpoint(args, exe, train_prog,
                                    "step_" + str(step_idx))

                if args.save_param:
                    save_param(args, exe, train_prog,
                               "step_" + str(step_idx))

            batch_id += 1
            step_idx += 1
......@@ -319,11 +239,10 @@ def do_train(args):
        time_consumed = time.time() - pass_start_time

    if args.save_checkpoint:
        save_checkpoint(args, exe, train_prog, "step_final")

    if args.save_param:
        save_param(args, exe, train_prog, "step_final")
    if args.save_model_path:
        model_path = os.path.join(args.save_model_path, "step_final",
                                  "transformer")
        fluid.save(train_prog, model_path)

    if args.enable_ce:  # For CE
        print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost))
......
......@@ -17,6 +17,7 @@ import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layers.utils import map_structure
from desc import *
......@@ -24,6 +25,7 @@ from desc import *
dropout_seed = None
def wrap_layer_with_block(layer, block_idx):
    """
    Make layer define support indicating block, by which we can add layers
......@@ -90,7 +92,6 @@ def multi_head_attention(queries,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         gather_idx=None,
                         static_kv=False):
    """
    Multi-Head Attention. Note that attn_bias is added to the logit before
......@@ -161,30 +162,28 @@ def multi_head_attention(queries,
        v = transpose_layer(x=reshaped_v, perm=[0, 2, 1, 3])

        if cache is not None:  # only for faster inference
            cache_, i = cache
            if static_kv:  # For encoder-decoder attention in inference
                cache_k, cache_v = cache["static_k"], cache["static_v"]
                # To init the static_k and static_v in cache.
                # Maybe we can use condition_op(if_else) to do these at the first
                # step in while loop to replace these, however it might be less
                # efficient.
                cache_k, cache_v = cache_["static_k"], cache_["static_v"]
                # To init the static_k and static_v in global block.
                static_cache_init = wrap_layer_with_block(
                    layers.assign,
                    fluid.default_main_program().current_block().parent_idx)
                static_cache_init(k, cache_k)
                static_cache_init(v, cache_v)
                static_cache_init(
                    k,
                    fluid.default_main_program().global_block().var(
                        "static_k_%d" % i))
                static_cache_init(
                    v,
                    fluid.default_main_program().global_block().var(
                        "static_v_%d" % i))
                k, v = cache_k, cache_v
            else:  # For decoder self-attention in inference
                cache_k, cache_v = cache["k"], cache["v"]
                # gather cell states corresponding to selected parent
                select_k = layers.gather(cache_k, index=gather_idx)
                select_v = layers.gather(cache_v, index=gather_idx)
                if not static_kv:
                    # For self attention in inference, use cache and concat time steps.
                    select_k = layers.concat([select_k, k], axis=2)
                    select_v = layers.concat([select_v, v], axis=2)
                # update cell states(caches) cached in global block
                layers.assign(select_k, cache_k)
                layers.assign(select_v, cache_v)
                return q, select_k, select_v
                # use cache and concat time steps.
                cache_k, cache_v = cache_["k"], cache_["v"]
                k = layers.concat([cache_k, k], axis=2)
                v = layers.concat([cache_v, v], axis=2)
                cache_["k"], cache_["v"] = (k, v)

        return q, k, v
    def __combine_heads(x):
......@@ -405,8 +404,7 @@ def decoder_layer(dec_input,
                  relu_dropout,
                  preprocess_cmd,
                  postprocess_cmd,
                  cache=None,
                  gather_idx=None):
                  cache=None):
    """ The layer to be stacked in decoder part.
    The structure of this module is similar to that in the encoder part except
    a multi-head attention is added to implement encoder-decoder attention.
......@@ -421,8 +419,7 @@ def decoder_layer(dec_input,
        d_model,
        n_head,
        attention_dropout,
        cache=cache,
        gather_idx=gather_idx)
        cache=cache)
    slf_attn_output = post_process_layer(
        dec_input,
        slf_attn_output,
......@@ -440,7 +437,6 @@ def decoder_layer(dec_input,
        n_head,
        attention_dropout,
        cache=cache,
        gather_idx=gather_idx,
        static_kv=True)
    enc_attn_output = post_process_layer(
        slf_attn_output,
......@@ -476,29 +472,27 @@ def decoder(dec_input,
            relu_dropout,
            preprocess_cmd,
            postprocess_cmd,
            caches=None,
            gather_idx=None):
            caches=None):
    """
    The decoder is composed of a stack of identical decoder_layer layers.
    """
    for i in range(n_layer):
        dec_output = decoder_layer(
            dec_input,
            enc_output,
            dec_slf_attn_bias,
            dec_enc_attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            preprocess_cmd,
            postprocess_cmd,
            cache=None if caches is None else caches[i],
            gather_idx=gather_idx)
        dec_output = decoder_layer(dec_input,
                                   enc_output,
                                   dec_slf_attn_bias,
                                   dec_enc_attn_bias,
                                   n_head,
                                   d_key,
                                   d_value,
                                   d_model,
                                   d_inner_hid,
                                   prepostprocess_dropout,
                                   attention_dropout,
                                   relu_dropout,
                                   preprocess_cmd,
                                   postprocess_cmd,
                                   cache=None if caches is None else
                                   (caches[i], i))
        dec_input = dec_output
    dec_output = pre_process_layer(dec_output, preprocess_cmd,
                                   prepostprocess_dropout)
......@@ -654,7 +648,6 @@ def wrap_decoder(dec_inputs,
                 weight_sharing,
                 enc_output=None,
                 caches=None,
                 gather_idx=None,
                 bos_idx=0):
    """
    The wrapper assembles together all needed layers for the decoder.
......@@ -687,8 +680,7 @@ def wrap_decoder(dec_inputs,
        relu_dropout,
        preprocess_cmd,
        postprocess_cmd,
        caches=caches,
        gather_idx=gather_idx)
        caches=caches)
    # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
    dec_output = layers.reshape(
        dec_output, shape=[-1, dec_output.shape[-1]], inplace=True)
......@@ -748,8 +740,6 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
        force_cpu=True)
    step_idx = layers.fill_constant(
        shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    # array states will be stored for each step.
    ids = layers.array_write(
        layers.reshape(start_tokens, (-1, 1)), step_idx)
......@@ -773,21 +763,25 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
                dtype=enc_output.dtype,
                value=0),
            "static_k":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_output.dtype),
            fluid.data(shape=[None, n_head, 0, d_key],
                       dtype=enc_output.dtype,
                       name=("static_k_%d" % i)),
            "static_v":  # for encoder-decoder attention
            layers.create_tensor(dtype=enc_output.dtype)
            fluid.data(shape=[None, n_head, 0, d_value],
                       dtype=enc_output.dtype,
                       name=("static_v_%d" % i)),
        } for i in range(n_layer)
    ]
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        # Since beam_search_op doesn't enforce pre_ids' shape, we can do
        # inplace reshape here which actually changes the shape of pre_ids.
        # pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)

    def cond_func(step_idx, selected_ids, selected_scores, gather_idx,
                  caches, trg_src_attn_bias):
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        return layers.logical_and(x=length_cond, y=finish_cond)

    def body_func(step_idx, pre_ids, pre_scores, gather_idx, caches,
                  trg_src_attn_bias):
        # gather cell states corresponding to selected parent
        pre_src_attn_bias = layers.gather(
            trg_src_attn_bias, index=parent_idx)
        pre_caches = map_structure(
            lambda x: layers.gather(x, index=gather_idx), caches)
        pre_src_attn_bias = layers.gather(trg_src_attn_bias,
                                          index=gather_idx)
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_attn_bias,  # can't use lod tensor here
......@@ -812,14 +806,14 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
            postprocess_cmd,
            weight_sharing,
            enc_output=enc_output,
            caches=caches,
            gather_idx=parent_idx,
            caches=pre_caches,
            bos_idx=bos_idx)
        # intra-beam topK
        topk_scores, topk_indices = layers.topk(
            input=layers.softmax(logits), k=beam_size)
        accu_scores = layers.elementwise_add(
            x=layers.log(topk_scores), y=pre_scores, axis=0)
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores,
                                             axis=0)
        # beam_search op uses lod to differentiate branches.
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        # topK reduction across beams, also contains special handling of
......@@ -832,16 +826,19 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
            beam_size=beam_size,
            end_id=eos_idx,
            return_parent_idx=True)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        # cell states (caches) have been updated in wrap_decoder,
        # only need to update beam search states here.
        step_idx = layers.increment(x=step_idx, value=1.0, in_place=False)
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        layers.assign(pre_src_attn_bias, trg_src_attn_bias)
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
        return (step_idx, selected_ids, selected_scores, gather_idx,
                pre_caches, pre_src_attn_bias)

    _ = layers.while_loop(cond=cond_func,
                          body=body_func,
                          loop_vars=[
                              step_idx, start_tokens, init_scores,
                              parent_idx, caches, trg_src_attn_bias
                          ],
                          is_test=True)
    finished_ids, finished_scores = layers.beam_search_decode(
        ids, scores, beam_size=beam_size, end_id=eos_idx)
......
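The cache rework above replaces the gather/assign bookkeeping with plain concatenation along the time axis. A minimal numpy sketch of that self-attention KV-cache pattern (shapes `[batch, n_head, seq_len, d_per_head]`; all names are illustrative):

```python
# Illustrative KV cache: keys/values grow along axis 2, one step at a time.
import numpy as np

batch, n_head, d_key = 2, 8, 64
cache = {"k": np.zeros((batch, n_head, 0, d_key), dtype="float32"),
         "v": np.zeros((batch, n_head, 0, d_key), dtype="float32")}

def append_step(cache, k_t, v_t):
    # k_t, v_t: projections for the current step, shape [batch, n_head, 1, d]
    cache["k"] = np.concatenate([cache["k"], k_t], axis=2)
    cache["v"] = np.concatenate([cache["v"], v_t], axis=2)
    return cache["k"], cache["v"]

k, v = append_step(cache, np.ones((batch, n_head, 1, d_key), "float32"),
                   np.ones((batch, n_head, 1, d_key), "float32"))
print(k.shape)  # (2, 8, 1, 64)
```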
......@@ -11,10 +11,11 @@ init_from_checkpoint: ""
init_from_pretrain_model: ""
# path of trained parameters, to make predictions
init_from_params: "trained_params/step_100000"
save_model_path: ""
# the directory for saving checkpoints.
# the directory for saving models.
save_model_path: "saved_models"
# deprecated, the directory for saving checkpoints.
save_checkpoint: "trained_ckpts"
# the directory for saving trained parameters.
# deprecated, the directory for saving trained parameters.
save_param: "trained_params"
# the directory for saving inference model.
inference_model_dir: "infer_model"
......
import pickle
import six
import warnings
from functools import partial

import paddle.fluid as fluid


def load(program, model_path, executor=None, var_list=None):
    """
    To load python2 saved models in python3.
    """
    try:
        fluid.load(program, model_path, executor, var_list)
    except UnicodeDecodeError:
        warnings.warn(
            "A UnicodeDecodeError was caught, which might be caused by loading "
            "a python2 saved model. The encoding of pickle.load will be set "
            "and loading will be retried automatically.")
        if six.PY3:
            load_bak = pickle.load
            pickle.load = partial(load_bak, encoding="latin1")
            fluid.load(program, model_path, executor, var_list)
            pickle.load = load_bak
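A hedged usage sketch of this helper, assuming the default main program already defines the Transformer graph and the path prefix points at files written by `fluid.save` (possibly under python2):

```python
# Illustrative: restore a checkpoint, transparently handling py2 pickles.
import os
import paddle.fluid as fluid
from utils.load import load

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# reads "<prefix>.pdparams" (and "<prefix>.pdopt" when present)
load(fluid.default_main_program(),
     os.path.join("saved_models", "step_final", "transformer"), exe)
```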