Commit d4b0ac19 authored by J Joejiong

Update pre-trained language models for Paddle 1.8. test=develop

Parent 4d7ec517
......@@ -5,7 +5,7 @@
## 1. 任务说明
This section describes the implementation of an LSTM-based language model: given an input word sequence (word-segmented Chinese or tokenized English), the model computes its PPL (perplexity, a measure of how fluent the sentence is). For background on RNN-based language models, see [this paper](https://arxiv.org/abs/1409.2329). Compared with traditional approaches, RNN-based methods handle sparse (rare) words better.
**The language model currently requires PaddlePaddle 1.7 or later, or a suitable develop build.**
**The language model currently requires PaddlePaddle 1.8 or later, or a suitable develop build.**
Users are also encouraged to refer to the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122290).
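To make the PPL metric above concrete, here is a standalone NumPy sketch (not part of this repo; the per-token probabilities are invented) computing perplexity as the exponential of the average per-token negative log-likelihood:

```
import numpy as np

# Hypothetical probabilities a language model assigns to the reference
# tokens of one sentence.
token_probs = np.array([0.20, 0.05, 0.40, 0.10])

# Average negative log-likelihood (cross-entropy) over the tokens ...
avg_nll = -np.mean(np.log(token_probs))

# ... and perplexity is its exponential; lower PPL means the model finds
# the sentence more fluent / less surprising.
ppl = np.exp(avg_nll)
print("avg loss %.4f, ppl %.2f" % (avg_nll, ppl))
```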
......@@ -53,7 +53,7 @@ cd data; sh download_data.sh
### Training or fine-tuning
Launch the training task with:
```
sh run.sh
bash run.sh
```
You need to specify the data directory and the model size (small by default; medium and large are also available).
......
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
function run_train() {
function run_train ()
{
echo "training"
python train.py \
--data_path data/simple-examples/data/ \
......
......@@ -137,9 +137,8 @@ def main():
res_vars = res_vars[:-1]
loss, last_hidden, last_cell, feed_order = res_vars
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(
clip_norm=config.max_grad_norm))
clip1 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=config.max_grad_norm)
learning_rate = fluid.layers.create_global_var(
name="learning_rate",
......@@ -148,7 +147,8 @@ def main():
dtype='float32',
persistable=True)
optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
optimizer = fluid.optimizer.SGD(learning_rate=learning_rate,
grad_clip=clip1)
optimizer.minimize(loss)
# define inference program
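The hunk above follows the Paddle 1.8 convention of attaching gradient clipping to a specific optimizer through its `grad_clip` argument instead of calling the global `fluid.clip.set_gradient_clip`. A minimal self-contained sketch of the pattern on a toy regression network (the network and hyper-parameters are illustrative only):

```
import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[None, 13], dtype='float32')
    y = fluid.data(name='y', shape=[None, 1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.reduce_mean(fluid.layers.square_error_cost(pred, y))

    # 1.8 style: build the clip object and hand it to the optimizer.
    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
    sgd = fluid.optimizer.SGD(learning_rate=0.01, grad_clip=clip)
    sgd.minimize(loss)
```

Scoping the clip to one optimizer avoids mutating global program state, which is why the same rewrite shows up again in the `optimization` hunks further down.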
......@@ -471,7 +471,7 @@ def main():
mkpath(save_model_dir)
save_model_dir = os.path.join(save_model_dir, 'params')
fluid.save(main_program, save_model_dir)
fluid.save(program=main_program, model_path=save_model_dir)
print("Saved model to: %s.\n" % save_model_dir)
with profile_context(args.profile, args.profiler_path):
......
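`fluid.save`, used above in place of the older `fluid.io` helpers, writes a program's persistable variables to files derived from `model_path`. A minimal sketch of the save side (toy program; the path is illustrative):

```
import os
import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[None, 8], dtype='float32')
    fluid.layers.fc(input=x, size=2)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)  # initialize the fc parameters

os.makedirs('checkpoint', exist_ok=True)
# Persistables are written under checkpoint/params (.pdparams plus
# .pdopt / .pdmodel companions as applicable).
fluid.save(program=main_prog, model_path=os.path.join('checkpoint', 'params'))
```

The matching `fluid.load` usage appears in the checkpoint-initialization hunks near the end of this diff.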
......@@ -72,7 +72,7 @@
```
## Installation
This project requires Paddle Fluid **1.7.1** or later; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start). Converting TensorFlow model parameters to Paddle Fluid additionally requires TensorFlow 1.12.
This project requires Paddle Fluid **1.8.0** or later; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start). Converting TensorFlow model parameters to Paddle Fluid additionally requires TensorFlow 1.12.
## Pre-training
......
......@@ -183,7 +183,7 @@ def convert(args):
param]).get_tensor().set(value, place)
print(param, ' --> ', tf_fluid_param_name_map[param], ' ', value.shape)
fluid.io.save_params(exe, args.fluid_params_dir, main_program=program)
fluid.save(program=program, model_path=args.fluid_params_dir)
if __name__ == '__main__':
......
......@@ -102,9 +102,10 @@ def optimization(loss,
raise ValueError("Unknown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
clip1 = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
optimizer = fluid.optimizer.Adam(
learning_rate=scheduled_lr, grad_clip=clip1)
def exclude_from_weight_decay(param):
name = param.name.rstrip(".master")
......
......@@ -619,7 +619,8 @@ def transformer_xl(inp_k,
attr=fluid.ParamAttr(
name=name + '_word_embedding', initializer=initializer),
is_bias=False)
word_emb_k = fluid.layers.embedding(
inp_k = fluid.layers.reshape(inp_k, shape=[inp_k.shape[0], -1])
word_emb_k = fluid.embedding(
input=inp_k,
size=[n_token, d_model],
dtype=data_type,
......@@ -693,8 +694,7 @@ def transformer_xl(inp_k,
dtype='int64')
seg_mat = fluid.layers.transpose(seg_mat, perm=[1, 2, 0])
seg_mat = fluid.layers.unsqueeze(seg_mat, [-1])
seg_mat = fluid.layers.one_hot(seg_mat, 2)
seg_mat = fluid.one_hot(seg_mat, 2)
seg_mat.stop_gradient = True
else:
seg_mat = None
......@@ -899,7 +899,7 @@ def classification_loss(hidden,
initializer=initializer),
bias_attr=name + '_logit_bias')
one_hot_target = fluid.layers.one_hot(labels, depth=n_class)
one_hot_target = fluid.one_hot(labels, depth=n_class)
loss = -1.0 * fluid.layers.reduce_sum(
log_softmax(logits) * one_hot_target, dim=-1)
......
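Throughout this diff `fluid.layers.one_hot` is swapped for `fluid.one_hot`, with label tensors squeezed first where needed. The reason is a shape convention change: the older op requires the input's last dimension to be 1 and replaces it with `depth`, whereas `fluid.one_hot` appends a `depth`-sized dimension to whatever shape it receives. A small static-shape sketch of the two forms (using `fluid.layers.squeeze` for the squeeze step):

```
import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    # labels carry a trailing dimension of 1, as the old op expects
    labels = fluid.data(name='labels', shape=[None, 1], dtype='int64')

    old_style = fluid.layers.one_hot(labels, depth=4)    # [-1, 1] -> [-1, 4]
    squeezed = fluid.layers.squeeze(labels, axes=[1])    # [-1, 1] -> [-1]
    new_style = fluid.one_hot(squeezed, depth=4)         # [-1]    -> [-1, 4]

print(old_style.shape, new_style.shape)  # both (-1, 4)
```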
......@@ -110,10 +110,10 @@ def optimization(loss,
return True
return False
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
clip1 = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
optimizer = fluid.optimizer.Adam(
learning_rate=scheduled_lr, grad_clip=clip1)
param_list = dict()
......
......@@ -69,7 +69,7 @@ init_g.add_arg("init_std", str, 0.02, "Initialization std when init is norm
init_g.add_arg("init_range", str, 0.1, "Initialization std when init is uniform.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 1000, "Number of epoches for fine-tuning.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
......@@ -415,7 +415,7 @@ def main(args):
if steps % args.save_steps == 0:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
fluid.save(model_path=save_path, program=train_program)
if steps % args.validation_steps == 0:
print("Average throughtput: %s" % (np.average(throughput)))
......@@ -427,7 +427,7 @@ def main(args):
args.eval_split, processor.get_num_examples(phase=args.eval_split))
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints, "step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
fluid.save(model_path=save_path, program=train_program)
train_data_loader.reset()
break
if args.enable_ce:
......
......@@ -155,7 +155,9 @@ def get_qa_outputs(xlnet_config, features, is_training=False):
# logit of the end position
if is_training:
start_positions = features['start_positions']
start_index = fluid.layers.one_hot(start_positions, depth=args.max_seq_length)
start_positions = fluid.squeeze(start_positions, [-1])
start_index = fluid.one_hot(start_positions, depth=args.max_seq_length)
# lbh,bl->bh
trans_out = fluid.layers.transpose(output, perm=[1, 2, 0])
start_index = fluid.layers.unsqueeze(start_index, axes=[2])
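The `# lbh,bl->bh` comment describes what the transpose plus one-hot matmul computes: for each example, gather the hidden vector at the start position out of the `[seq_len, batch, hidden]` output. A NumPy check of that equivalence (shapes and positions chosen arbitrarily):

```
import numpy as np

L, B, H = 5, 2, 3                       # seq_len, batch, hidden
output = np.random.rand(L, B, H)        # "lbh"
start = np.array([1, 4])                # start position per example
start_index = np.eye(L)[start]          # "bl", one-hot over positions

# einsum form of the comment above
gathered = np.einsum('lbh,bl->bh', output, start_index)

# matmul form used in the code: output -> [b, h, l], one-hot -> [b, l, 1]
trans_out = output.transpose(1, 2, 0)
matmul = np.matmul(trans_out, start_index[:, :, None]).squeeze(-1)

assert np.allclose(gathered, matmul)
print(gathered.shape)  # (2, 3)
```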
......@@ -193,8 +195,7 @@ def get_qa_outputs(xlnet_config, features, is_training=False):
end_log_probs = log_softmax(end_logits_masked)
else:
start_top_log_probs, start_top_index = fluid.layers.topk(start_log_probs, k=args.start_n_top)
start_top_index = fluid.layers.unsqueeze(start_top_index, [-1])
start_index = fluid.layers.one_hot(start_top_index, seq_len)
start_index = fluid.one_hot(start_top_index, seq_len)
# lbh,bkl->bkh
trans_out = fluid.layers.transpose(output, perm=[1, 2, 0])
trans_start_index = fluid.layers.transpose(start_index, [0, 2, 1])
......@@ -249,7 +250,8 @@ def get_qa_outputs(xlnet_config, features, is_training=False):
return_dict["end_top_log_probs"] = end_top_log_probs
return_dict["end_top_index"] = end_top_index
cls_index = fluid.layers.one_hot(cls_index, seq_len)
cls_index = fluid.squeeze(cls_index, [-1])
cls_index = fluid.one_hot(cls_index, seq_len)
cls_index = fluid.layers.unsqueeze(cls_index, axes=[2])
cls_feature = fluid.layers.matmul(x=trans_out, y=cls_index)
......@@ -335,8 +337,8 @@ def create_model(xlnet_config, is_training=False):
seq_len = input_ids.shape[1]
def compute_loss(log_probs, positions):
one_hot_positions = fluid.layers.one_hot(positions, depth=seq_len)
one_hot_positions = fluid.squeeze(positions,[-1])
one_hot_positions = fluid.one_hot(positions, depth=seq_len)
loss = -1 * fluid.layers.reduce_sum(one_hot_positions * log_probs, dim=-1)
loss = fluid.layers.reduce_mean(loss)
return loss
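`compute_loss` multiplies the log-probabilities by a one-hot encoding of the gold position and sums over the sequence axis, which is simply the negative log-probability of the correct position averaged over the batch. A quick NumPy check of that reading (random logits, made-up positions):

```
import numpy as np

batch, seq_len = 2, 6
logits = np.random.rand(batch, seq_len)
log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))  # log-softmax
positions = np.array([3, 0])

one_hot = np.eye(seq_len)[positions]                   # [batch, seq_len]
loss = np.mean(-np.sum(one_hot * log_probs, axis=-1))  # as in compute_loss

# Equivalent: negative log-prob of the gold position, averaged over the batch.
assert np.isclose(loss, np.mean(-log_probs[np.arange(batch), positions]))
```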
......@@ -581,11 +583,11 @@ def train(args):
if steps % args.save_steps == 0 or steps == args.train_steps:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps))
fluid.io.save_persistables(exe, save_path, train_program)
fluid.save(model_path=save_path, program=train_program)
except fluid.core.EOFException:
save_path = os.path.join(args.checkpoints,
"step_" + str(steps) + "_final")
fluid.io.save_persistables(exe, save_path, train_program)
fluid.save(model_path=save_path, program=train_program)
train_data_loader.reset()
break
print("Finish model training ...")
......
......@@ -54,11 +54,8 @@ def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
print("SKIP %s" % var.name)
return False
fluid.io.load_vars(
exe,
init_checkpoint_path,
main_program=main_program,
predicate=existed_persitables)
fluid.load(
model_path=init_checkpoint_path, program=main_program, executor=exe)
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
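`init_checkpoint` now restores variables with a single `fluid.load` call instead of `fluid.io.load_vars` plus a persistable-filtering predicate. A minimal sketch of the load side, assuming a checkpoint was previously written to the same path by `fluid.save` for a program with matching variables:

```
import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[None, 8], dtype='float32')
    fluid.layers.fc(input=x, size=2)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)

# Overwrites the freshly initialized persistables of main_prog with the
# values stored by fluid.save under checkpoint/params.
fluid.load(program=main_prog, model_path='checkpoint/params', executor=exe)
```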
......@@ -83,11 +80,8 @@ def init_pretraining_params(exe,
print("SKIP %s" % var.name)
return False
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
fluid.io.load(
model_path=pretraining_params_path, program=main_program, executor=exe)
if use_fp16:
cast_fp32_to_fp16(exe, main_program)