Commit 04504173 authored by R root

test=develop

Parent: ccdbfe77
@@ -5,7 +5,7 @@
 ## 1. Task Description
 This document describes an LSTM-based language model. Given an input token sequence (word-segmented Chinese or tokenized English), the model computes its perplexity (ppl, a measure of how fluent a sentence is). An introduction to RNN-based language models can be found in [this paper](https://arxiv.org/abs/1409.2329). Compared with traditional approaches, RNN-based models handle rare words better.
-**The language model currently requires PaddlePaddle 1.7 or later, or an appropriate develop build.**
+**The language model currently requires PaddlePaddle 1.8 or later, or an appropriate develop build.**
 Users are also encouraged to refer to the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122290)
@@ -53,7 +53,7 @@ cd data; sh download_data.sh
 ### Training or fine-tuning
 The training job is launched with:
 ```
-sh run.sh
+bash run.sh
 ```
 You need to specify the data directory and the model size (small by default; medium and large are also available).
......
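A quick note on the ppl metric mentioned in the README above: perplexity is the exponential of the average per-token cross-entropy (negative log-likelihood). A minimal sketch, with illustrative variable names that are not part of this repository:

```
import numpy as np

def perplexity(total_nll, total_tokens):
    """ppl = exp(mean per-token cross-entropy).

    total_nll: summed negative log-likelihood over all predicted tokens
    total_tokens: number of predicted tokens in the evaluated corpus
    """
    return float(np.exp(total_nll / total_tokens))

# e.g. a summed loss of 460.5 over 100 tokens gives ppl ~= exp(4.605) ~= 100
print(perplexity(460.5, 100))
```

Lower perplexity means the model assigns higher probability to the held-out text.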
 #!/bin/bash
-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7
-function run_train() {
+function run_train ()
+{
     echo "training"
-    python train.py \
+    python3 train.py \
         --data_path data/simple-examples/data/ \
         --model_type small \
         --use_gpu True \
......
@@ -137,9 +137,8 @@ def main():
         res_vars = res_vars[:-1]
     loss, last_hidden, last_cell, feed_order = res_vars
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=config.max_grad_norm))
+    clip1 = fluid.clip.GradientClipByGlobalNorm(
+        clip_norm=config.max_grad_norm)
     learning_rate = fluid.layers.create_global_var(
         name="learning_rate",
@@ -148,7 +147,8 @@ def main():
         dtype='float32',
         persistable=True)
-    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
+    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate,
+                                    grad_clip=clip1)
     optimizer.minimize(loss)
     # define inference program
@@ -471,7 +471,7 @@ def main():
     mkpath(save_model_dir)
     save_model_dir = os.path.join(save_model_dir, 'params')
-    fluid.save(main_program, save_model_dir)
+    fluid.save(program=main_program, model_path=save_model_dir)
     print("Saved model to: %s.\n" % save_model_dir)
     with profile_context(args.profile, args.profiler_path):
......
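For context on the change above: starting with PaddlePaddle 1.8, a gradient-clipping strategy is passed directly to the optimizer via its `grad_clip` argument instead of being registered globally with `fluid.clip.set_gradient_clip`. A minimal sketch of the new pattern (a standalone toy program, not code from this repo):

```
import paddle.fluid as fluid

# toy regression network, for illustration only
x = fluid.data(name="x", shape=[None, 13], dtype="float32")
y = fluid.data(name="y", shape=[None, 1], dtype="float32")
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.reduce_mean(fluid.layers.square_error_cost(input=pred, label=y))

# Paddle 1.8 style: build the clip object and hand it to the optimizer.
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
opt = fluid.optimizer.SGD(learning_rate=0.01, grad_clip=clip)
opt.minimize(loss)
```

Attaching the clip to the optimizer keeps the clipping strategy local to the optimizer that uses it, rather than mutating global program state.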
@@ -72,7 +72,7 @@
 ```
 ## Installation
-This project requires Paddle Fluid **1.7.1** or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start). Converting a TensorFlow model to Paddle Fluid parameters additionally requires TensorFlow 1.12.
+This project requires Paddle Fluid **1.8.0** or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start). Converting a TensorFlow model to Paddle Fluid parameters additionally requires TensorFlow 1.12.
 ## Pre-training
......
@@ -183,7 +183,7 @@ def convert(args):
             param]).get_tensor().set(value, place)
         print(param, ' --> ', tf_fluid_param_name_map[param], ' ', value.shape)
-    fluid.io.save_params(exe, args.fluid_params_dir, main_program=program)
+    fluid.save(model_path=args.fluid_params_dir, main_program=program)
 if __name__ == '__main__':
......
@@ -102,9 +102,10 @@ def optimization(loss,
         raise ValueError("Unkown learning rate scheduler, should be "
                          "'noam_decay' or 'linear_warmup_decay'")
-    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+    clip1 = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=scheduled_lr, grad_clip=clip1)
     def exclude_from_weight_decay(param):
         name = param.name.rstrip(".master")
......
@@ -32,7 +32,7 @@ VALIDATION_DATA_DIR=data/validation
 CONFIG_PATH=data/demo_config/bert_config.json
 VOCAB_PATH=data/demo_config/vocab.txt
 # Change your train arguments:
-python -u ./train.py ${is_distributed}\
+python3 -u ./train.py ${is_distributed}\
         --use_cuda true\
         --weight_sharing true\
         --batch_size ${BATCH_SIZE} \
......
@@ -619,7 +619,8 @@ def transformer_xl(inp_k,
         attr=fluid.ParamAttr(
             name=name + '_word_embedding', initializer=initializer),
         is_bias=False)
-    word_emb_k = fluid.layers.embedding(
+    inp_k = fluid.layers.reshape(inp_k, shape=[inp_k.shape[0], -1])
+    word_emb_k = fluid.embedding(
         input=inp_k,
         size=[n_token, d_model],
         dtype=data_type,
@@ -693,8 +694,7 @@ def transformer_xl(inp_k,
             dtype='int64')
         seg_mat = fluid.layers.transpose(seg_mat, perm=[1, 2, 0])
-        seg_mat = fluid.layers.unsqueeze(seg_mat, [-1])
-        seg_mat = fluid.layers.one_hot(seg_mat, 2)
+        seg_mat = fluid.one_hot(seg_mat, 2)
         seg_mat.stop_gradient = True
     else:
         seg_mat = None
@@ -899,7 +899,7 @@ def classification_loss(hidden,
             initializer=initializer),
         bias_attr=name + '_logit_bias')
-    one_hot_target = fluid.layers.one_hot(labels, depth=n_class)
+    one_hot_target = fluid.one_hot(labels, depth=n_class)
     loss = -1.0 * fluid.layers.reduce_sum(
         log_softmax(logits) * one_hot_target, dim=-1)
......
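The one_hot and embedding edits above follow the same API migration: as far as I can tell, `fluid.layers.one_hot` and `fluid.layers.embedding` expect the id tensor to carry a trailing dimension of size 1, while the newer `fluid.one_hot` / `fluid.embedding` accept plain integer id tensors and append the new dimension themselves, which is why the surrounding `unsqueeze`/`reshape` calls are dropped or turned into a squeeze. A small illustrative sketch with toy shapes (not repository code):

```
import paddle.fluid as fluid

# token ids shaped [batch, seq_len], with no trailing dimension of 1
ids = fluid.data(name="ids", shape=[None, 3], dtype="int64")

# fluid.one_hot appends the depth dimension: output shape [batch, 3, 4]
one_hot = fluid.one_hot(ids, depth=4)

# fluid.embedding likewise appends the embedding dimension: [batch, 3, 16]
emb = fluid.embedding(input=ids, size=[4, 16], dtype="float32")
```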
@@ -110,10 +110,10 @@ def optimization(loss,
             return True
         return False
-    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+    clip1 = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=scheduled_lr, grad_clip=clip1)
     param_list = dict()
......
@@ -69,7 +69,7 @@ init_g.add_arg("init_std", str, 0.02, "Initialization std when init is norm
 init_g.add_arg("init_range", str, 0.1, "Initialization std when init is uniform.")
 train_g = ArgumentGroup(parser, "training", "training options.")
-train_g.add_arg("epoch", int, 1000, "Number of epoches for fine-tuning.")
+train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
 train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
 train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
                 "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
@@ -415,7 +415,7 @@ def main(args):
             if steps % args.save_steps == 0:
                 save_path = os.path.join(args.checkpoints,
                                          "step_" + str(steps))
-                fluid.io.save_persistables(exe, save_path, train_program)
+                fluid.save(model_path=save_path, program=train_program)
             if steps % args.validation_steps == 0:
                 print("Average throughtput: %s" % (np.average(throughput)))
@@ -427,7 +427,7 @@ def main(args):
             args.eval_split, processor.get_num_examples(phase=args.eval_split))
     except fluid.core.EOFException:
         save_path = os.path.join(args.checkpoints, "step_" + str(steps))
-        fluid.io.save_persistables(exe, save_path, train_program)
+        fluid.save(model_path=save_path, program=train_program)
         train_data_loader.reset()
         break
     if args.enable_ce:
......
@@ -155,7 +155,9 @@ def get_qa_outputs(xlnet_config, features, is_training=False):
     # logit of the end position
     if is_training:
         start_positions = features['start_positions']
-        start_index = fluid.layers.one_hot(start_positions, depth=args.max_seq_length)
+        start_positions = fluid.squeeze(start_positions, [-1])
+        start_index = fluid.one_hot(start_positions, depth=args.max_seq_length)
         # lbh,bl->bh
         trans_out = fluid.layers.transpose(output, perm=[1, 2, 0])
         start_index = fluid.layers.unsqueeze(start_index, axes=[2])
@@ -193,8 +195,7 @@ def get_qa_outputs(xlnet_config, features, is_training=False):
         end_log_probs = log_softmax(end_logits_masked)
     else:
         start_top_log_probs, start_top_index = fluid.layers.topk(start_log_probs, k=args.start_n_top)
-        start_top_index = fluid.layers.unsqueeze(start_top_index, [-1])
-        start_index = fluid.layers.one_hot(start_top_index, seq_len)
+        start_index = fluid.one_hot(start_top_index, seq_len)
         # lbh,bkl->bkh
         trans_out = fluid.layers.transpose(output, perm=[1, 2, 0])
         trans_start_index = fluid.layers.transpose(start_index, [0, 2, 1])
@@ -249,7 +250,8 @@ def get_qa_outputs(xlnet_config, features, is_training=False):
         return_dict["end_top_log_probs"] = end_top_log_probs
         return_dict["end_top_index"] = end_top_index
-    cls_index = fluid.layers.one_hot(cls_index, seq_len)
+    cls_index = fluid.squeeze(cls_index, [-1])
+    cls_index = fluid.one_hot(cls_index, seq_len)
     cls_index = fluid.layers.unsqueeze(cls_index, axes=[2])
     cls_feature = fluid.layers.matmul(x=trans_out, y=cls_index)
@@ -335,8 +337,8 @@ def create_model(xlnet_config, is_training=False):
     seq_len = input_ids.shape[1]
     def compute_loss(log_probs, positions):
-        one_hot_positions = fluid.layers.one_hot(positions, depth=seq_len)
+        one_hot_positions = fluid.squeeze(positions,[-1])
+        one_hot_positions = fluid.one_hot(positions, depth=seq_len)
         loss = -1 * fluid.layers.reduce_sum(one_hot_positions * log_probs, dim=-1)
         loss = fluid.layers.reduce_mean(loss)
         return loss
@@ -581,11 +583,11 @@ def train(args):
                 if steps % args.save_steps == 0 or steps == args.train_steps:
                     save_path = os.path.join(args.checkpoints,
                                              "step_" + str(steps))
-                    fluid.io.save_persistables(exe, save_path, train_program)
+                    fluid.save(model_path=save_path, program=train_program)
     except fluid.core.EOFException:
         save_path = os.path.join(args.checkpoints,
                                  "step_" + str(steps) + "_final")
-        fluid.io.save_persistables(exe, save_path, train_program)
+        fluid.save(model_path=save_path, program=train_program)
         train_data_loader.reset()
         break
     print("Finish model training ...")
......
@@ -54,11 +54,8 @@ def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
             print("SKIP %s" % var.name)
             return False
-    fluid.io.load_vars(
-        exe,
-        init_checkpoint_path,
-        main_program=main_program,
-        predicate=existed_persitables)
+    fluid.load(
+        model_path=init_checkpoint_path, program=main_program, executor=exe)
     if use_fp16:
         cast_fp32_to_fp16(exe, main_program)
@@ -83,11 +80,8 @@ def init_pretraining_params(exe,
             print("SKIP %s" % var.name)
             return False
-    fluid.io.load_vars(
-        exe,
-        pretraining_params_path,
-        main_program=main_program,
-        predicate=existed_params)
+    fluid.io.load(
+        model_path=pretraining_params_path, program=main_program, executor=exe)
     if use_fp16:
         cast_fp32_to_fp16(exe, main_program)
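The save/load changes throughout this commit replace `fluid.io.save_persistables` / `fluid.io.load_vars` with the `fluid.save` / `fluid.load` pair, which serialize a program's persistable variables under a single path prefix and restore them again. A minimal round-trip sketch under those assumptions (toy network and illustrative paths, not repository code):

```
import paddle.fluid as fluid

# toy network, for illustration only
x = fluid.data(name="x", shape=[None, 8], dtype="float32")
y = fluid.layers.fc(input=x, size=2)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

main_prog = fluid.default_main_program()

# save all persistables of the program under the prefix "./checkpoint/model"
fluid.save(main_prog, "./checkpoint/model")

# ...later, restore them into the same program with an executor
fluid.load(main_prog, "./checkpoint/model", executor=exe)
```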