diff --git a/PaddleNLP/language_model/README.md b/PaddleNLP/language_model/README.md
index 8bf919a341c7b95eb58827c46c2111439164fcf1..b752fac50152f3cf7c37d210eb5ce137bcdf51b9 100644
--- a/PaddleNLP/language_model/README.md
+++ b/PaddleNLP/language_model/README.md
@@ -5,7 +5,7 @@
 ## 1. Task description
 This document describes an LSTM-based language model: given an input word sequence (word-segmented for Chinese, tokenized for English), it computes the perplexity (ppl), a measure of sentence fluency. An introduction to recurrent neural network language models can be found in [this paper](https://arxiv.org/abs/1409.2329). Compared with traditional approaches, RNN-based methods handle rare words better.
 
-**The language model currently requires PaddlePaddle 1.7 or later, or an appropriate develop build.**
+**The language model currently requires PaddlePaddle 1.8 or later, or an appropriate develop build.**
 
 Users are also encouraged to refer to the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122290)
 
@@ -53,7 +53,7 @@ cd data; sh download_data.sh
 
 ### Training or fine-tuning
 Start training with:
 ```
-sh run.sh
+bash run.sh
 ```
 Specify the data directory and the model size (small by default; medium and large are also available).
diff --git a/PaddleNLP/language_model/run.sh b/PaddleNLP/language_model/run.sh
index f010d910233ffcd59c62d63553dc803b19e06c36..0fb71feb560e2ea0ed4da496019d4ed9b0a5f942 100644
--- a/PaddleNLP/language_model/run.sh
+++ b/PaddleNLP/language_model/run.sh
@@ -1,9 +1,10 @@
 #!/bin/bash
-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7
 
-function run_train() {
+function run_train ()
+{
     echo "training"
-    python train.py \
+    python3 train.py \
         --data_path data/simple-examples/data/ \
         --model_type small \
         --use_gpu True \
diff --git a/PaddleNLP/language_model/train.py b/PaddleNLP/language_model/train.py
index 3cc04364d79e8c9f8b02fe028554054666fea32f..f12e9431f105f4ab05817294e90831d00fbe1416 100644
--- a/PaddleNLP/language_model/train.py
+++ b/PaddleNLP/language_model/train.py
@@ -137,9 +137,8 @@ def main():
         res_vars = res_vars[:-1]
     loss, last_hidden, last_cell, feed_order = res_vars
 
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=config.max_grad_norm))
+    clip1 = fluid.clip.GradientClipByGlobalNorm(
+        clip_norm=config.max_grad_norm)
 
     learning_rate = fluid.layers.create_global_var(
         name="learning_rate",
@@ -148,7 +147,8 @@
         dtype='float32',
         persistable=True)
 
-    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
+    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate,
+                                    grad_clip=clip1)
     optimizer.minimize(loss)
 
     # define inference program
@@ -471,7 +471,7 @@
             mkpath(save_model_dir)
 
         save_model_dir = os.path.join(save_model_dir, 'params')
-        fluid.save(main_program, save_model_dir)
+        fluid.save(program=main_program, model_path=save_model_dir)
         print("Saved model to: %s.\n" % save_model_dir)
 
     with profile_context(args.profile, args.profiler_path):
diff --git a/PaddleNLP/pretrain_language_models/BERT/README.md b/PaddleNLP/pretrain_language_models/BERT/README.md
index 7280a09cbd9f92c6a6b2fc253e44173b6cbd8712..cb96f293e4d601c0be69049d3149e5ffe4100cb9 100644
--- a/PaddleNLP/pretrain_language_models/BERT/README.md
+++ b/PaddleNLP/pretrain_language_models/BERT/README.md
@@ -72,7 +72,7 @@
 ```
 
 ## Installation
-This project depends on Paddle Fluid **1.7.1** or later; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start). Converting TensorFlow model parameters to Paddle Fluid additionally requires TensorFlow 1.12.
+This project depends on Paddle Fluid **1.8.0** or later; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start). Converting TensorFlow model parameters to Paddle Fluid additionally requires TensorFlow 1.12.
 
 ## Pretraining
 
diff --git a/PaddleNLP/pretrain_language_models/BERT/convert_params.py b/PaddleNLP/pretrain_language_models/BERT/convert_params.py
index e2930defa01c6ab9be6e3f24e5956ce2dbecf4da..c0f13720110836089d3d42c61c965bd30245025e 100644
--- a/PaddleNLP/pretrain_language_models/BERT/convert_params.py
+++ b/PaddleNLP/pretrain_language_models/BERT/convert_params.py
@@ -183,7 +183,7 @@ def convert(args):
             param]).get_tensor().set(value, place)
         print(param, ' --> ', tf_fluid_param_name_map[param], ' ', value.shape)
 
-    fluid.io.save_params(exe, args.fluid_params_dir, main_program=program)
+    fluid.save(program=program, model_path=args.fluid_params_dir)
 
 
 if __name__ == '__main__':
diff --git a/PaddleNLP/pretrain_language_models/BERT/optimization.py b/PaddleNLP/pretrain_language_models/BERT/optimization.py
index 82ade38974e40cd46dd2f0bf2e37e99e412dd7e2..7a6dc3d366d4c4e18fd83d0d4046ad88d86dc2fd 100644
--- a/PaddleNLP/pretrain_language_models/BERT/optimization.py
+++ b/PaddleNLP/pretrain_language_models/BERT/optimization.py
@@ -102,9 +102,10 @@ def optimization(loss,
         raise ValueError("Unkown learning rate scheduler, should be "
                          "'noam_decay' or 'linear_warmup_decay'")
 
-    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+    clip1 = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=scheduled_lr, grad_clip=clip1)
 
     def exclude_from_weight_decay(param):
         name = param.name.rstrip(".master")
diff --git a/PaddleNLP/pretrain_language_models/BERT/train.sh b/PaddleNLP/pretrain_language_models/BERT/train.sh
index 2ed44e4c7291f502c70bfad4422f7ad999439c58..4dd00e0d547845d54a334843551f7c22b471ab13 100755
--- a/PaddleNLP/pretrain_language_models/BERT/train.sh
+++ b/PaddleNLP/pretrain_language_models/BERT/train.sh
@@ -32,7 +32,7 @@ VALIDATION_DATA_DIR=data/validation
 CONFIG_PATH=data/demo_config/bert_config.json
 VOCAB_PATH=data/demo_config/vocab.txt
 # Change your train arguments:
-python -u ./train.py ${is_distributed}\
+python3 -u ./train.py ${is_distributed}\
         --use_cuda true\
         --weight_sharing true\
         --batch_size ${BATCH_SIZE} \
diff --git a/PaddleNLP/pretrain_language_models/XLNet/modeling.py b/PaddleNLP/pretrain_language_models/XLNet/modeling.py
index 032428abf224819f758bfa0d84ddff8be4dc1347..2948896d84744ce8089a4b491aada189a4cb9b0e 100644
--- a/PaddleNLP/pretrain_language_models/XLNet/modeling.py
+++ b/PaddleNLP/pretrain_language_models/XLNet/modeling.py
@@ -619,7 +619,8 @@ def transformer_xl(inp_k,
         attr=fluid.ParamAttr(
             name=name + '_word_embedding', initializer=initializer),
         is_bias=False)
-    word_emb_k = fluid.layers.embedding(
+    inp_k = fluid.layers.reshape(inp_k, shape=[inp_k.shape[0], -1])
+    word_emb_k = fluid.embedding(
         input=inp_k,
         size=[n_token, d_model],
         dtype=data_type,
@@ -693,8 +694,7 @@ def transformer_xl(inp_k,
                                  dtype='int64')
 
         seg_mat = fluid.layers.transpose(seg_mat, perm=[1, 2, 0])
-        seg_mat = fluid.layers.unsqueeze(seg_mat, [-1])
-        seg_mat = fluid.layers.one_hot(seg_mat, 2)
+        seg_mat = fluid.one_hot(seg_mat, 2)
         seg_mat.stop_gradient = True
     else:
         seg_mat = None
@@ -899,7 +899,7 @@ def classification_loss(hidden,
             initializer=initializer),
         bias_attr=name + '_logit_bias')
 
-    one_hot_target = fluid.layers.one_hot(labels, depth=n_class)
+    one_hot_target = fluid.one_hot(labels, depth=n_class)
 
     loss = -1.0 * fluid.layers.reduce_sum(
         log_softmax(logits) * one_hot_target, dim=-1)
diff --git a/PaddleNLP/pretrain_language_models/XLNet/optimization.py b/PaddleNLP/pretrain_language_models/XLNet/optimization.py
index 31cb069f71df58f4db232f1146bf0aea7a5fbb0d..0911b33caa417b934e19d78af87d694120cd773a 100644
--- a/PaddleNLP/pretrain_language_models/XLNet/optimization.py
+++ b/PaddleNLP/pretrain_language_models/XLNet/optimization.py
@@ -110,10 +110,10 @@ def optimization(loss,
             return True
         return False
 
-    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
+    clip1 = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
 
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=scheduled_lr, grad_clip=clip1)
 
     param_list = dict()
 
diff --git a/PaddleNLP/pretrain_language_models/XLNet/run_classifier.py b/PaddleNLP/pretrain_language_models/XLNet/run_classifier.py
index 795eae548cdb0137856b46ab8e43a4361098c9c8..3efeab3e67db961af635a3e6a76ff6f3e3fc0642 100644
--- a/PaddleNLP/pretrain_language_models/XLNet/run_classifier.py
+++ b/PaddleNLP/pretrain_language_models/XLNet/run_classifier.py
@@ -69,7 +69,7 @@ init_g.add_arg("init_std", str, 0.02, "Initialization std when init is norm
 init_g.add_arg("init_range", str, 0.1, "Initialization std when init is uniform.")
 
 train_g = ArgumentGroup(parser, "training", "training options.")
-train_g.add_arg("epoch", int, 1000, "Number of epoches for fine-tuning.")
+train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
 train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
 train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", "scheduler of learning rate.",
                 choices=['linear_warmup_decay', 'noam_decay'])
@@ -415,7 +415,7 @@ def main(args):
             if steps % args.save_steps == 0:
                 save_path = os.path.join(args.checkpoints,
                                          "step_" + str(steps))
-                fluid.io.save_persistables(exe, save_path, train_program)
+                fluid.save(model_path=save_path, program=train_program)
 
             if steps % args.validation_steps == 0:
                 print("Average throughtput: %s" % (np.average(throughput)))
@@ -427,7 +427,7 @@
                     args.eval_split, processor.get_num_examples(phase=args.eval_split))
             except fluid.core.EOFException:
                 save_path = os.path.join(args.checkpoints, "step_" + str(steps))
-                fluid.io.save_persistables(exe, save_path, train_program)
+                fluid.save(model_path=save_path, program=train_program)
                 train_data_loader.reset()
                 break
     if args.enable_ce:
diff --git a/PaddleNLP/pretrain_language_models/XLNet/run_squad.py b/PaddleNLP/pretrain_language_models/XLNet/run_squad.py
index 39ee6ff2fe9d2645e17120555606898a79abd3d7..7b1cd205ff1d6da54800c124cf275509403e0db9 100644
--- a/PaddleNLP/pretrain_language_models/XLNet/run_squad.py
+++ b/PaddleNLP/pretrain_language_models/XLNet/run_squad.py
@@ -155,7 +155,9 @@ def get_qa_outputs(xlnet_config, features, is_training=False):
     # logit of the end position
     if is_training:
         start_positions = features['start_positions']
-        start_index = fluid.layers.one_hot(start_positions, depth=args.max_seq_length)
+        start_positions = fluid.layers.squeeze(start_positions, [-1])
+        start_index = fluid.one_hot(start_positions, depth=args.max_seq_length)
+
         # lbh,bl->bh
         trans_out = fluid.layers.transpose(output, perm=[1, 2, 0])
         start_index = fluid.layers.unsqueeze(start_index, axes=[2])
@@ -193,8 +195,7 @@
             end_log_probs = log_softmax(end_logits_masked)
         else:
             start_top_log_probs, start_top_index = fluid.layers.topk(start_log_probs, k=args.start_n_top)
-            start_top_index = fluid.layers.unsqueeze(start_top_index, [-1])
-            start_index = fluid.layers.one_hot(start_top_index, seq_len)
+            start_index = fluid.one_hot(start_top_index, seq_len)
             # lbh,bkl->bkh
             trans_out = fluid.layers.transpose(output, perm=[1, 2, 0])
             trans_start_index = fluid.layers.transpose(start_index, [0, 2, 1])
@@ -249,7 +250,8 @@ def get_qa_outputs(xlnet_config, features, is_training=False):
         return_dict["end_top_log_probs"] = end_top_log_probs
         return_dict["end_top_index"] = end_top_index
 
-    cls_index = fluid.layers.one_hot(cls_index, seq_len)
+    cls_index = fluid.layers.squeeze(cls_index, [-1])
+    cls_index = fluid.one_hot(cls_index, seq_len)
     cls_index = fluid.layers.unsqueeze(cls_index, axes=[2])
     cls_feature = fluid.layers.matmul(x=trans_out, y=cls_index)
 
@@ -335,8 +337,8 @@ def create_model(xlnet_config, is_training=False):
     seq_len = input_ids.shape[1]
 
     def compute_loss(log_probs, positions):
-        one_hot_positions = fluid.layers.one_hot(positions, depth=seq_len)
-
+        positions = fluid.layers.squeeze(positions, [-1])
+        one_hot_positions = fluid.one_hot(positions, depth=seq_len)
         loss = -1 * fluid.layers.reduce_sum(one_hot_positions * log_probs, dim=-1)
         loss = fluid.layers.reduce_mean(loss)
         return loss
@@ -581,11 +583,11 @@ def train(args):
                 if steps % args.save_steps == 0 or steps == args.train_steps:
                     save_path = os.path.join(args.checkpoints,
                                              "step_" + str(steps))
-                    fluid.io.save_persistables(exe, save_path, train_program)
+                    fluid.save(model_path=save_path, program=train_program)
             except fluid.core.EOFException:
                 save_path = os.path.join(args.checkpoints,
                                          "step_" + str(steps) + "_final")
-                fluid.io.save_persistables(exe, save_path, train_program)
+                fluid.save(model_path=save_path, program=train_program)
                 train_data_loader.reset()
                 break
     print("Finish model training ...")
diff --git a/PaddleNLP/pretrain_language_models/XLNet/utils/init.py b/PaddleNLP/pretrain_language_models/XLNet/utils/init.py
index d495a42edffe644389d393b657d2c2e64bb5df0d..09632a0f3d917e7f670be155a4cb98a7f1078f28 100644
--- a/PaddleNLP/pretrain_language_models/XLNet/utils/init.py
+++ b/PaddleNLP/pretrain_language_models/XLNet/utils/init.py
@@ -54,11 +54,8 @@ def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
             print("SKIP %s" % var.name)
         return False
 
-    fluid.io.load_vars(
-        exe,
-        init_checkpoint_path,
-        main_program=main_program,
-        predicate=existed_persitables)
+    fluid.load(
+        model_path=init_checkpoint_path, program=main_program, executor=exe)
 
     if use_fp16:
         cast_fp32_to_fp16(exe, main_program)
@@ -83,11 +80,8 @@ def init_pretraining_params(exe,
             print("SKIP %s" % var.name)
         return False
 
-    fluid.io.load_vars(
-        exe,
-        pretraining_params_path,
-        main_program=main_program,
-        predicate=existed_params)
+    fluid.io.load(
+        model_path=pretraining_params_path, program=main_program, executor=exe)
 
     if use_fp16:
         cast_fp32_to_fp16(exe, main_program)
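Note: the hunks above all apply the same two PaddlePaddle 1.8 migrations: gradient clipping moves from the removed global `fluid.clip.set_gradient_clip` call to the optimizer's `grad_clip` argument, and checkpointing moves from `fluid.io.save_persistables` / `fluid.io.load_vars` to the unified `fluid.save` / `fluid.load` API. The snippet below is a minimal, self-contained sketch of that target pattern, not code from this repository; the toy regression network, the learning rate, and the `./checkpoint/params` path are illustrative assumptions.

```
# Minimal sketch (illustrative, not repo code) of the 1.8-style pattern the
# patch migrates to: grad_clip attached to the optimizer + fluid.save/load.
import os

import paddle.fluid as fluid

main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
    x = fluid.data(name="x", shape=[None, 13], dtype="float32")
    y = fluid.data(name="y", shape=[None, 1], dtype="float32")
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.reduce_mean(fluid.layers.square_error_cost(pred, y))

    # Clip gradients by global norm via the optimizer's grad_clip argument
    # (replaces the removed fluid.clip.set_gradient_clip global setting).
    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
    optimizer = fluid.optimizer.SGD(learning_rate=0.01, grad_clip=clip)
    optimizer.minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_program)

# Unified checkpoint API: fluid.save / fluid.load replace
# fluid.io.save_persistables / fluid.io.load_vars.
os.makedirs("./checkpoint", exist_ok=True)  # hypothetical output directory
fluid.save(program=main_program, model_path="./checkpoint/params")
fluid.load(program=main_program, model_path="./checkpoint/params", executor=exe)
```

Attaching the clip object to a specific optimizer scopes clipping to that optimizer's update step instead of relying on a process-wide setting, which is why each `optimization()` helper above constructs its own `GradientClipByGlobalNorm` instance.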