From 5a41865d3b2a597ba703f0c57c8520f79ff74656 Mon Sep 17 00:00:00 2001 From: 0YuanZhang0 <953963890@qq.com> Date: Fri, 8 May 2020 20:56:39 +0800 Subject: [PATCH] Upgrade dialogue models to paddle 1.8 (#4594) * upgrade_dialogue_models * fix_upgrade_models_comments --- .../dialogue_domain_classification/README.MD | 4 +- .../dialogue_domain_classification/run.sh | 0 .../run_classifier.py | 2 +- .../dialogue_domain_classification/utils.py | 3 +- .../auto_dialogue_evaluation/README.md | 13 ++---- .../ade/prepare_data_and_model.py | 2 +- .../auto_dialogue_evaluation/ade_net.py | 4 +- .../data/config/ade.yaml | 3 -- .../inference_model.py | 7 ++- .../auto_dialogue_evaluation/predict.py | 12 +++-- .../auto_dialogue_evaluation/run.sh | 8 ++-- .../auto_dialogue_evaluation/train.py | 44 +++++++++---------- .../dialogue_general_understanding/README.md | 12 ++--- .../data/config/dgu.yaml | 3 -- .../dgu/bert.py | 6 +-- .../dgu/define_paradigm.py | 4 +- .../dgu/define_predict_pack.py | 1 - .../dgu/optimization.py | 26 ++++++----- .../dgu/prepare_data_and_model.py | 2 +- .../inference_model.py | 7 +-- .../dialogue_general_understanding/main.py | 1 - .../dialogue_general_understanding/predict.py | 15 ++----- .../dialogue_general_understanding/run.sh | 8 ++-- .../dialogue_general_understanding/train.py | 44 +++++++------------ 24 files changed, 92 insertions(+), 139 deletions(-) mode change 100755 => 100644 PaddleNLP/dialogue_domain_classification/run.sh diff --git a/PaddleNLP/dialogue_domain_classification/README.MD b/PaddleNLP/dialogue_domain_classification/README.MD index c5753194..2eb031e1 100755 --- a/PaddleNLP/dialogue_domain_classification/README.MD +++ b/PaddleNLP/dialogue_domain_classification/README.MD @@ -10,11 +10,11 @@ ## 快速开始 -**目前模型要求使用PaddlePaddle 1.6及以上版本或适当的develop版本运行。** +**目前模型要求使用PaddlePaddle 1.8及以上版本或适当的develop版本运行。** ### 1. Paddle版本安装 -本项目训练模块兼容Python2.7.x以及Python3.7.x, 依赖PaddlePaddle 1.6版本以及CentOS系统环境, 安装请参考官网 [快速安装](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/index_cn.html)。 +本项目训练模块兼容Python2.7.x以及Python3.7.x, 依赖PaddlePaddle 1.8版本以及CentOS系统环境, 安装请参考官网 [快速安装](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/index_cn.html)。 注意:该模型同时支持cpu和gpu训练和预测,用户可以根据自身需求,选择安装对应的paddlepaddle-gpu或paddlepaddle版本。 diff --git a/PaddleNLP/dialogue_domain_classification/run.sh b/PaddleNLP/dialogue_domain_classification/run.sh old mode 100755 new mode 100644 diff --git a/PaddleNLP/dialogue_domain_classification/run_classifier.py b/PaddleNLP/dialogue_domain_classification/run_classifier.py index af22ffbf..072e7da2 100755 --- a/PaddleNLP/dialogue_domain_classification/run_classifier.py +++ b/PaddleNLP/dialogue_domain_classification/run_classifier.py @@ -452,7 +452,7 @@ def main(args): if args.use_cuda: test_place = fluid.cuda_places(0) place = fluid.cuda_places() - DEV_COUNT = fluid.core.get_cuda_device_count() + DEV_COUNT = len(place) else: test_place = fluid.cpu_places(1) os.environ['CPU_NUM'] = str(args.cpu_num) diff --git a/PaddleNLP/dialogue_domain_classification/utils.py b/PaddleNLP/dialogue_domain_classification/utils.py index ac32225a..cf4382ae 100755 --- a/PaddleNLP/dialogue_domain_classification/utils.py +++ b/PaddleNLP/dialogue_domain_classification/utils.py @@ -130,12 +130,11 @@ class DataReader(object): assert os.path.exists(data_path), "The given data file does not exist." if mode == "train": train_reader = fluid.io.batch( - paddle.reader.shuffle( + fluid.io.shuffle( self.data_reader( data_path, self.max_len, shuffle=True), buf_size=batch_size * 100), batch_size) - # train_reader = fluid.io.batch(self.data_reader(data_path), batch_size) return train_reader else: test_reader = fluid.io.batch( diff --git a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/README.md b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/README.md index dd9dbff6..fa5ca4df 100644 --- a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/README.md +++ b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/README.md @@ -30,7 +30,7 @@ - cuda >= 9.0 - cudnn >= 7.0 - pandas >= 0.20.1 -- PaddlePaddle >= 1.7.0,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装, 本模块使用bert作为pretrain model进行模型的finetuning训练,训练速度较慢,建议安装GPU版本的PaddlePaddle +- PaddlePaddle >= 1.8.0,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装, 本模块使用bert作为pretrain model进行模型的finetuning训练,训练速度较慢,建议安装GPU版本的PaddlePaddle ####   b、下载代码 @@ -119,13 +119,10 @@ emb_size: embedding层大小 vocab_size: 词表大小 sample_pro: 采样比率 output_prediction_file: 输出的预测文件 -init_from_checkpoint: 加载断点模型 init_from_params: 训练好的模型参数文件,一般用于预测 init_from_pretrain_model: 预训练模型路径,如bert的模型参数 inference_model_dir: inference model的保存路径 save_model_path: 训练产出模型的输出路径 -save_checkpoint: 调用paddle的io接口save_persistables(把传入的层中所有参数以及优化器进行保存)来保存模型参数 -save_param: 调用paddle的io接口save_params(从main_program中取出所有参数然后保存到文件中)来保存模型参数 evaluation_file: 参与评估的inference 文件 vocab_path: 词表路径 max_seq_len: 输入最大序列长度 @@ -199,7 +196,6 @@ python -u main.py \ --loss_type="CLS" \ --max_seq_len=50 \ --save_model_path="data/saved_models/matching_pretrained" \ - --save_param="params" \ --training_file="data/input/data/unlabel_data/train.ids" \ --epoch=20 \ --print_step=1 \ @@ -217,7 +213,7 @@ python -u main.py \ #### windows环境下: 训练: ``` -python -u main.py --do_train=true --use_cuda=false --loss_type=CLS --max_seq_len=50 --save_model_path=data\saved_models\matching_pretrained --save_param=params --training_file=data\input\data\unlabel_data\train.ids --epoch=20 --print_step=1 --save_step=400 --batch_size=256 --hidden_size=256 --emb_size=256 --vocab_size=484016 --learning_rate=0.001 --sample_pro=0.1 +python -u main.py --do_train=true --use_cuda=false --loss_type=CLS --max_seq_len=50 --save_model_path=data\saved_models\matching_pretrained --training_file=data\input\data\unlabel_data\train.ids --epoch=20 --print_step=1 --save_step=400 --batch_size=256 --hidden_size=256 --emb_size=256 --vocab_size=484016 --learning_rate=0.001 --sample_pro=0.1 ``` #### 2、第二阶段finetuning模型的训练: @@ -271,9 +267,8 @@ python -u main.py \ --use_cuda=${use_cuda} \ --loss_type="L2" \ --max_seq_len=50 \ - --init_from_pretrain_model="data/saved_models/trained_models/matching_pretrained/params" \ + --init_from_pretrain_model="data/saved_models/trained_models/matching_pretrained/params/params" \ --save_model_path="data/saved_models/human_finetuned" \ - --save_param="params" \ --training_file="data/input/data/label_data/human/train.ids" \ --epoch=50 \ --print_step=1 \ @@ -288,7 +283,7 @@ python -u main.py \ #### windows环境下: ``` -python -u main.py --do_train=true --use_cuda=false --loss_type=L2 --max_seq_len=50 --save_model_path=data\saved_models\human_finetuned --save_param=params --training_file=data\input\data\label_data\human\train.ids --epoch=50 --print_step=1 --save_step=400 --batch_size=256 --hidden_size=256 --emb_size=256 --vocab_size=484016 --learning_rate=0.001 --sample_pro=0.1 +python -u main.py --do_train=true --use_cuda=false --loss_type=L2 --max_seq_len=50 --save_model_path=data\saved_models\human_finetuned --training_file=data\input\data\label_data\human\train.ids --epoch=50 --print_step=1 --save_step=400 --batch_size=256 --hidden_size=256 --emb_size=256 --vocab_size=484016 --learning_rate=0.001 --sample_pro=0.1 ``` ### 模型预测 diff --git a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade/prepare_data_and_model.py b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade/prepare_data_and_model.py index 539a1a1f..8406582c 100644 --- a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade/prepare_data_and_model.py +++ b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade/prepare_data_and_model.py @@ -29,7 +29,7 @@ DATA_MODEL_PATH = { "DATA_PATH": "https://baidu-nlp.bj.bcebos.com/auto_dialogue_evaluation_dataset-1.0.0.tar.gz", "TRAINED_MODEL": - "https://baidu-nlp.bj.bcebos.com/auto_dialogue_evaluation_models.2.0.0.tar.gz" + "https://baidu-nlp.bj.bcebos.com/auto_dialogue_evaluation_models.3.0.0.tar.gz" } PATH_MAP = {'DATA_PATH': "./data/input", 'TRAINED_MODEL': './data/saved_models'} diff --git a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade_net.py b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade_net.py index 10db9185..f3eb76af 100755 --- a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade_net.py +++ b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade_net.py @@ -34,7 +34,7 @@ def create_net(is_training, label = model_input.labels #emb - context_emb = fluid.input.embedding( + context_emb = fluid.embedding( input=context_wordseq, size=[args.vocab_size, args.emb_size], is_sparse=True, @@ -42,7 +42,7 @@ def create_net(is_training, name=word_emb_name, initializer=fluid.initializer.Normal(scale=0.1))) - response_emb = fluid.input.embedding( + response_emb = fluid.embedding( input=response_wordseq, size=[args.vocab_size, args.emb_size], is_sparse=True, diff --git a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/data/config/ade.yaml b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/data/config/ade.yaml index a901e091..24ac23de 100644 --- a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/data/config/ade.yaml +++ b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/data/config/ade.yaml @@ -14,13 +14,10 @@ emb_size: 256 vocab_size: 484016 sample_pro: 1.0 output_prediction_file: "" -init_from_checkpoint: "" init_from_params: "" init_from_pretrain_model: "" inference_model_dir: "" save_model_path: "" -save_checkpoint: "" -save_param: "" evaluation_file: "" vocab_path: "" max_seq_len: 128 diff --git a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/inference_model.py b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/inference_model.py index 5ef7bfc1..6c902d5f 100644 --- a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/inference_model.py +++ b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/inference_model.py @@ -27,7 +27,6 @@ from ade_net import create_net from ade.utils.configure import PDConfig from ade.utils.input_field import InputField from ade.utils.model_check import check_cuda -import ade.utils.save_load_io as save_load_io def do_save_inference_model(args): @@ -55,7 +54,7 @@ def do_save_inference_model(args): input_inst = [context_wordseq, response_wordseq, labels] input_field = InputField(input_inst) - data_reader = fluid.io.PyReader( + data_reader = fluid.io.DataLoader.from_generator( feed_list=input_inst, capacity=4, iterable=False) logits = create_net( @@ -72,9 +71,9 @@ def do_save_inference_model(args): assert (args.init_from_params) or (args.init_from_pretrain_model) if args.init_from_params: - save_load_io.init_from_params(args, exe, test_prog) + fluid.load(test_prog, args.init_from_params) elif args.init_from_pretrain_model: - save_load_io.init_from_pretrain_model(args, exe, test_prog) + fluid.load(test_prog, args.init_from_pretrain_model) # saving inference model fluid.io.save_inference_model( diff --git a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/predict.py b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/predict.py index 6f5a081f..75ad2c7a 100644 --- a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/predict.py +++ b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/predict.py @@ -29,7 +29,6 @@ from ade_net import create_net from ade.utils.configure import PDConfig from ade.utils.input_field import InputField from ade.utils.model_check import check_cuda -import ade.utils.save_load_io as save_load_io def do_predict(args): @@ -59,12 +58,11 @@ def do_predict(args): input_inst = [context_wordseq, response_wordseq, labels] input_field = InputField(input_inst) - data_reader = fluid.io.PyReader( + data_reader = fluid.io.DataLoader.from_generator( feed_list=input_inst, capacity=4, iterable=False) logits = create_net( is_training=False, model_input=input_field, args=args) - logits.persistable = True fetch_list = [logits.name] #for_test is True if change the is_test attribute of operators to True @@ -79,9 +77,9 @@ def do_predict(args): assert (args.init_from_params) or (args.init_from_pretrain_model) if args.init_from_params: - save_load_io.init_from_params(args, exe, test_prog) + fluid.load(test_prog, args.init_from_params, executor=exe) if args.init_from_pretrain_model: - save_load_io.init_from_pretrain_model(args, exe, test_prog) + fluid.load(test_prog, args.init_from_pretrain_model, executor=exe) compiled_test_prog = fluid.CompiledProgram(test_prog) @@ -94,7 +92,7 @@ def do_predict(args): place=place, phase="test", shuffle=False, sample_pro=1) num_test_examples = processor.get_num_examples(phase='test') - data_reader.decorate_batch_generator(batch_generator) + data_reader.set_batch_generator(batch_generator, places=place) data_reader.start() scores = [] @@ -110,7 +108,7 @@ def do_predict(args): print("Write the predicted results into the output_prediction_file") fw = io.open(args.output_prediction_file, 'w', encoding="utf8") for index, score in enumerate(scores): - fw.write("%s\t%s\n" % (index, score)) + fw.write(u"%s\t%s\n" % (index, score[0])) print("finish........................................") diff --git a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/run.sh b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/run.sh index 592fc6ce..d5610633 100755 --- a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/run.sh +++ b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/run.sh @@ -67,7 +67,6 @@ function pretrain_train() --loss_type="CLS" \ --max_seq_len=50 \ --save_model_path=${pretrain_model_path} \ - --save_param="params" \ --training_file="${INPUT_PATH}/unlabel_data/train.ids" \ --epoch=20 \ --print_step=1 \ @@ -99,9 +98,8 @@ function finetuning_train() --use_cuda=${1} \ --loss_type="L2" \ --max_seq_len=50 \ - --init_from_pretrain_model="${SAVED_MODELS}/matching_pretrained/params/step_final" \ + --init_from_pretrain_model="${SAVED_MODELS}/matching_pretrained/step_final" \ --save_model_path=${save_model_path} \ - --save_param="params" \ --training_file="${INPUT_PATH}/label_data/${2}/train.ids" \ --epoch=50 \ --print_step=1 \ @@ -121,7 +119,7 @@ function pretrain_predict() --do_predict=true \ --use_cuda=${1} \ --predict_file="${INPUT_PATH}/unlabel_data/test.ids" \ - --init_from_params="${SAVED_MODELS}/trained_models/matching_pretrained/params" \ + --init_from_params="${SAVED_MODELS}/trained_models/matching_pretrained/params/params" \ --loss_type="CLS" \ --output_prediction_file="${OUTPUT_PATH}/pretrain_matching_predict" \ --max_seq_len=50 \ @@ -137,7 +135,7 @@ function finetuning_predict() --do_predict=true \ --use_cuda=${1} \ --predict_file="${INPUT_PATH}/label_data/${2}/test.ids" \ - --init_from_params=${SAVED_MODELS}/trained_models/${2}_finetuned/params \ + --init_from_params="${SAVED_MODELS}/trained_models/${2}_finetuned/params/params" \ --loss_type="L2" \ --output_prediction_file="${OUTPUT_PATH}/finetuning_${2}_predict" \ --max_seq_len=50 \ diff --git a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/train.py b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/train.py index c1939866..3fffdbdc 100755 --- a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/train.py +++ b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/train.py @@ -29,7 +29,6 @@ from ade_net import create_net, set_word_embedding from ade.utils.configure import PDConfig from ade.utils.input_field import InputField from ade.utils.model_check import check_cuda -import ade.utils.save_load_io as save_load_io try: import cPickle as pickle #python 2 @@ -62,24 +61,27 @@ def do_train(args): input_inst = [context_wordseq, response_wordseq, labels] input_field = InputField(input_inst) - data_reader = fluid.io.PyReader( + data_reader = fluid.io.DataLoader.from_generator( feed_list=input_inst, capacity=4, iterable=False) loss = create_net( is_training=True, model_input=input_field, args=args) - loss.persistable = True + # gradient clipping - fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByValue( - max=1.0, min=-1.0)) - optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=args.learning_rate, + grad_clip=fluid.clip.GradientClipByValue( + max=1.0, min=-1.0)) optimizer.minimize(loss) if args.use_cuda: - dev_count = fluid.core.get_cuda_device_count() + places = fluid.cuda_places() + dev_count = len(places) place = fluid.CUDAPlace( int(os.getenv('FLAGS_selected_gpus', '0'))) else: - dev_count = int(os.environ.get('CPU_NUM', 1)) + places = fluid.cpu_places() + dev_count = len(places) place = fluid.CPUPlace() processor = reader.DataProcessor( @@ -99,20 +101,20 @@ def do_train(args): print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) - data_reader.decorate_batch_generator(batch_generator) + data_reader.set_batch_generator(batch_generator, places=place) exe = fluid.Executor(place) exe.run(startup_prog) - assert (args.init_from_checkpoint == "") or ( + assert (args.init_from_params == "") or ( args.init_from_pretrain_model == "") #init from some checkpoint, to resume the previous training - if args.init_from_checkpoint: - save_load_io.init_from_checkpoint(args, exe, train_prog) + if args.init_from_params: + fluid.load(train_prog, args.init_from_params, exe) #init from some pretrain models, to better solve the current task if args.init_from_pretrain_model: - save_load_io.init_from_pretrain_model(args, exe, train_prog) + fluid.load(train_prog, args.init_from_pretrain_model, exe) if args.word_emb_init: print("start loading word embedding init ...") @@ -163,21 +165,17 @@ def do_train(args): time_begin = time.time() if steps % args.save_steps == 0: - if args.save_checkpoint: - save_load_io.save_checkpoint(args, exe, train_prog, - "step_" + str(steps)) - if args.save_param: - save_load_io.save_param(args, exe, train_prog, - "step_" + str(steps)) + model_path = os.path.join(args.save_model_path, + "step_" + str(steps)) + fluid.save(train_prog, model_path) + steps += 1 except fluid.core.EOFException: data_reader.reset() break - if args.save_checkpoint: - save_load_io.save_checkpoint(args, exe, train_prog, "step_final") - if args.save_param: - save_load_io.save_param(args, exe, train_prog, "step_final") + model_path = os.path.join(args.save_model_path, "step_final") + fluid.save(train_prog, model_path) def get_cards(): num = 0 diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/README.md b/PaddleNLP/dialogue_system/dialogue_general_understanding/README.md index 7bc45c08..0bbb11ec 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/README.md +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/README.md @@ -23,7 +23,7 @@ - Python >= 2.7 - cuda >= 9.0 - cudnn >= 7.0 -- PaddlePaddle >= 1.7.0,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装, 由于模块内模型基于bert做finetuning, 训练速度较慢, 建议用户安装GPU版本PaddlePaddle进行训练。 +- PaddlePaddle >= 1.8.0,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装, 由于模块内模型基于bert做finetuning, 训练速度较慢, 建议用户安装GPU版本PaddlePaddle进行训练。 ####   b、下载代码 @@ -123,13 +123,10 @@ format:conversation_content \t question \1 answer \t state1 state2 state3..... task_name: 任务名称,可选udc、swda、mrda、atis_intent、atis_slot、dstc2 data_dir: 数据路径,如./data/input/data/udc bert_config_path: 预训练模型bert的网络配置./data/pretrain_model/uncased_L-12_H-768_A-12/bert_config.json -init_from_checkpoint: 加载断点模型 init_from_params: 训练好的模型参数文件,一般用于预测 init_from_pretrain_model: 预训练模型路径,如bert的模型参数 inference_model_dir: inference model的保存路径 save_model_path: 训练产出模型的输出路径 -save_checkpoint: 调用paddle的io接口save_persistables(把传入的层中所有参数以及优化器进行保存)来保存模型参数 -save_param: 调用paddle的io接口save_params(从main_program中取出所有参数然后保存到文件中)来保存模型参数 lr_scheduler: learning rate scheduler weight_decay: learning rate 权重衰减因子 warmup_proportion: warmup比率 @@ -221,7 +218,6 @@ python -u main.py \ --vocab_path="${BERT_BASE_PATH}/vocab.txt" \ --init_from_pretrain_model="${BERT_BASE_PATH}/params" \ --save_model_path="./data/saved_models/${TASK_NAME}" \ - --save_param="params" \ --save_steps=100 \ --learning_rate=2e-5 \ --weight_decay=0.01 \ @@ -235,7 +231,7 @@ python -u main.py \ #### windows环境下 ``` -python -u main.py --task_name=atis_intent --use_cuda=false --do_train=true --epoch=20 --batch_size=32 --do_lower_case=true --data_dir=data\input\data\atis\atis_intent --bert_config_path=data\pretrain_model\uncased_L-12_H-768_A-12\bert_config.json --vocab_path=data\pretrain_model\uncased_L-12_H-768_A-12\vocab.txt --init_from_pretrain_model=data\pretrain_model\uncased_L-12_H-768_A-12\params --save_model_path=data\saved_models\atis_intent --save_param=params --save_steps=100 --learning_rate=2e-5 --weight_decay=0.01 --max_seq_len=128 --print_steps=10 +python -u main.py --task_name=atis_intent --use_cuda=false --do_train=true --epoch=20 --batch_size=32 --do_lower_case=true --data_dir=data\input\data\atis\atis_intent --bert_config_path=data\pretrain_model\uncased_L-12_H-768_A-12\bert_config.json --vocab_path=data\pretrain_model\uncased_L-12_H-768_A-12\vocab.txt --init_from_pretrain_model=data\pretrain_model\uncased_L-12_H-768_A-12\params --save_model_path=data\saved_models\atis_intent --save_steps=100 --learning_rate=2e-5 --weight_decay=0.01 --max_seq_len=128 --print_steps=10 ``` ### 模型预测 @@ -294,7 +290,7 @@ python -u main.py \ --batch_size=32 \ --do_lower_case=true \ --data_dir="./data/input/data/atis/${TASK_NAME}" \ - --init_from_params="./data/saved_models/trained_models/${TASK_NAME}/params" \ + --init_from_params="./data/saved_models/trained_models/${TASK_NAME}/params/params" \ --bert_config_path="${BERT_BASE_PATH}/bert_config.json" \ --vocab_path="${BERT_BASE_PATH}/vocab.txt" \ --output_prediction_file="./data/output/pred_${TASK_NAME}" \ @@ -305,7 +301,7 @@ python -u main.py \ #### windows环境下 ``` -python -u main.py --task_name=atis_intent --use_cuda=false --do_predict=true --batch_size=32 --do_lower_case=true --data_dir=data\input\data\atis\atis_intent --init_from_params=data\saved_models\trained_models\atis_intent\params --bert_config_path=data\pretrain_model\uncased_L-12_H-768_A-12\bert_config.json --vocab_path=data\pretrain_model\uncased_L-12_H-768_A-12\vocab.txt --output_prediction_file=data\output\pred_atis_intent --max_seq_len=128 +python -u main.py --task_name=atis_intent --use_cuda=false --do_predict=true --batch_size=32 --do_lower_case=true --data_dir=data\input\data\atis\atis_intent --init_from_params=data\saved_models\trained_models\atis_intent\params\params --bert_config_path=data\pretrain_model\uncased_L-12_H-768_A-12\bert_config.json --vocab_path=data\pretrain_model\uncased_L-12_H-768_A-12\vocab.txt --output_prediction_file=data\output\pred_atis_intent --max_seq_len=128 ``` ### 模型评估 diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/data/config/dgu.yaml b/PaddleNLP/dialogue_system/dialogue_general_understanding/data/config/dgu.yaml index 3857dab2..fa3114cb 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/data/config/dgu.yaml +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/data/config/dgu.yaml @@ -1,13 +1,10 @@ task_name: "" data_dir: "" bert_config_path: "" -init_from_checkpoint: "" init_from_params: "" init_from_pretrain_model: "" inference_model_dir: "" save_model_path: "" -save_checkpoint: "" -save_param: "" lr_scheduler: "linear_warmup_decay" weight_decay: 0.01 warmup_proportion: 0.1 diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/bert.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/bert.py index 6bab7466..0e62ffa7 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/bert.py +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/bert.py @@ -87,21 +87,21 @@ class BertModel(object): def _build_model(self, src_ids, position_ids, sentence_ids, input_mask): # padding id in vocabulary must be set to 0 - emb_out = fluid.input.embedding( + emb_out = fluid.embedding( input=src_ids, size=[self._voc_size, self._emb_size], dtype=self._dtype, param_attr=fluid.ParamAttr( name=self._word_emb_name, initializer=self._param_initializer), is_sparse=False) - position_emb_out = fluid.input.embedding( + position_emb_out = fluid.embedding( input=position_ids, size=[self._max_position_seq_len, self._emb_size], dtype=self._dtype, param_attr=fluid.ParamAttr( name=self._pos_emb_name, initializer=self._param_initializer)) - sent_emb_out = fluid.input.embedding( + sent_emb_out = fluid.embedding( sentence_ids, size=[self._sent_types, self._emb_size], dtype=self._dtype, diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/define_paradigm.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/define_paradigm.py index df4cecf8..7a112c34 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/define_paradigm.py +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/define_paradigm.py @@ -48,8 +48,8 @@ class Paradigm(object): initializer=fluid.initializer.TruncatedNormal(scale=0.02)), bias_attr=fluid.ParamAttr( name="cls_out_b", initializer=fluid.initializer.Constant(0.))) - - if not params['is_training']: + + if not params['is_training']: probs = fluid.layers.softmax(logits) results = {"probs": probs} return results diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/define_predict_pack.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/define_predict_pack.py index 13b14f81..9053381e 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/define_predict_pack.py +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/define_predict_pack.py @@ -17,7 +17,6 @@ import re import sys import numpy as np -import paddle import paddle.fluid as fluid diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/optimization.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/optimization.py index 63cfc9a8..ce9da965 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/optimization.py +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/optimization.py @@ -59,7 +59,13 @@ def optimization(loss, weight_decay, scheduler='linear_warmup_decay', use_fp16=False, - loss_scaling=1.0): + loss_scaling=1.0, + clip_norm_thres=1.0): + # When using mixed precision training, scale the gradient clip threshold + # by loss_scaling + if use_fp16 and loss_scaling > 1.0: + clip_norm_thres *= loss_scaling + if warmup_steps > 0: if scheduler == 'noam_decay': scheduled_lr = fluid.layers.learning_rate_scheduler\ @@ -71,19 +77,17 @@ def optimization(loss, else: raise ValueError("Unkown learning rate scheduler, should be " "'noam_decay' or 'linear_warmup_decay'") - optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=scheduled_lr, + grad_clip=fluid.clip.GradientClipByGlobalNorm( + clip_norm=clip_norm_thres)) else: - optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=learning_rate, + grad_clip=fluid.clip.GradientClipByGlobalNorm( + clip_norm=clip_norm_thres)) scheduled_lr = learning_rate - clip_norm_thres = 1.0 - # When using mixed precision training, scale the gradient clip threshold - # by loss_scaling - if use_fp16 and loss_scaling > 1.0: - clip_norm_thres *= loss_scaling - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres)) - def exclude_from_weight_decay(name): if name.find("layer_norm") > -1: return True diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/prepare_data_and_model.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/prepare_data_and_model.py index 83c72064..aea3c947 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/prepare_data_and_model.py +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/prepare_data_and_model.py @@ -29,7 +29,7 @@ DATA_MODEL_PATH = { "DATA_PATH": "https://baidu-nlp.bj.bcebos.com/dmtk_data_1.0.0.tar.gz", "PRETRAIN_MODEL": "https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz", - "TRAINED_MODEL": "https://baidu-nlp.bj.bcebos.com/dgu_models_2.0.0.tar.gz" + "TRAINED_MODEL": "https://baidu-nlp.bj.bcebos.com/dgu_models_3.0.0.tar.gz" } PATH_MAP = { diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/inference_model.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/inference_model.py index 438ebcc7..d1786b32 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/inference_model.py +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/inference_model.py @@ -25,7 +25,6 @@ import paddle.fluid as fluid from dgu.utils.configure import PDConfig from dgu.utils.input_field import InputField from dgu.utils.model_check import check_cuda -import dgu.utils.save_load_io as save_load_io import dgu.reader as reader from dgu_net import create_net @@ -97,12 +96,10 @@ def do_save_inference_model(args): exe = fluid.Executor(place) exe.run(startup_prog) - assert (args.init_from_params) or (args.init_from_pretrain_model) + assert (args.init_from_params) if args.init_from_params: - save_load_io.init_from_params(args, exe, test_prog) - elif args.init_from_pretrain_model: - save_load_io.init_from_pretrain_model(args, exe, test_prog) + fluid.load(test_prog, args.init_from_params) # saving inference model fluid.io.save_inference_model( diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/main.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/main.py index bf1cf3b2..e63544c3 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/main.py +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/main.py @@ -16,7 +16,6 @@ import os import sys import numpy as np -import paddle import paddle.fluid as fluid from eval import do_eval diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/predict.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/predict.py index 0d781b0f..0c530e99 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/predict.py +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/predict.py @@ -19,7 +19,6 @@ import sys import numpy as np import argparse import collections -import paddle import paddle.fluid as fluid import dgu.reader as reader @@ -30,7 +29,6 @@ import dgu.define_predict_pack as define_predict_pack from dgu.utils.configure import PDConfig from dgu.utils.input_field import InputField from dgu.utils.model_check import check_cuda -import dgu.utils.save_load_io as save_load_io from dgu.utils.py23 import tab_tok, rt_tok @@ -84,7 +82,7 @@ def do_predict(args): input_inst = [src_ids, pos_ids, sent_ids, input_mask, labels] input_field = InputField(input_inst) - data_reader = fluid.io.PyReader( + data_reader = fluid.io.DataLoader.from_generator( feed_list=input_inst, capacity=4, iterable=False) results = create_net( @@ -95,9 +93,6 @@ def do_predict(args): args=args) probs = results.get("probs", None) - - probs.persistable = True - fetch_list = [probs.name] #for_test is True if change the is_test attribute of operators to True @@ -111,12 +106,10 @@ def do_predict(args): exe = fluid.Executor(place) exe.run(startup_prog) - assert (args.init_from_params) or (args.init_from_pretrain_model) + assert (args.init_from_params) if args.init_from_params: - save_load_io.init_from_params(args, exe, test_prog) - if args.init_from_pretrain_model: - save_load_io.init_from_pretrain_model(args, exe, test_prog) + fluid.load(test_prog, args.init_from_params) compiled_test_prog = fluid.CompiledProgram(test_prog) @@ -130,7 +123,7 @@ def do_predict(args): batch_generator = processor.data_generator( batch_size=args.batch_size, phase='test', shuffle=False) - data_reader.decorate_batch_generator(batch_generator) + data_reader.set_batch_generator(batch_generator, places=place) data_reader.start() all_results = [] diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/run.sh b/PaddleNLP/dialogue_system/dialogue_general_understanding/run.sh index 3759d859..e694adfc 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/run.sh +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/run.sh @@ -3,7 +3,7 @@ export FLAGS_sync_nccl_allreduce=0 export FLAGS_eager_delete_tensor_gb=1 -export CUDA_VISIBLE_DEVICES=1 +export CUDA_VISIBLE_DEVICES= if [ ! "$CUDA_VISIBLE_DEVICES" ] then export CPU_NUM=1 @@ -21,7 +21,7 @@ SAVE_MODEL_PATH="./data/saved_models/${TASK_NAME}" TRAIN_MODEL_PATH="./data/saved_models/trained_models" OUTPUT_PATH="./data/output" INFERENCE_MODEL="data/inference_models" -PYTHON_PATH="python3" +PYTHON_PATH="python" if [ -f ${SAVE_MODEL_PATH} ]; then rm ${SAVE_MODEL_PATH} @@ -94,7 +94,6 @@ else exit 255 fi - #training function train() { @@ -110,7 +109,6 @@ function train() --vocab_path=${BERT_BASE_PATH}/vocab.txt \ --init_from_pretrain_model=${BERT_BASE_PATH}/params \ --save_model_path=${SAVE_MODEL_PATH} \ - --save_param="params" \ --save_steps=${save_steps} \ --learning_rate=${learning_rate} \ --weight_decay=0.01 \ @@ -128,7 +126,7 @@ function predict() --batch_size=${batch_size} \ --data_dir=${INPUT_PATH} \ --do_lower_case=true \ - --init_from_params=${TRAIN_MODEL_PATH}/${TASK_NAME}/params \ + --init_from_params=${TRAIN_MODEL_PATH}/${TASK_NAME}/params/params \ --bert_config_path=${BERT_BASE_PATH}/bert_config.json \ --vocab_path=${BERT_BASE_PATH}/vocab.txt \ --output_prediction_file=${OUTPUT_PATH}/pred_${TASK_NAME} \ diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/train.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/train.py index 5d9d14ec..4744428b 100644 --- a/PaddleNLP/dialogue_system/dialogue_general_understanding/train.py +++ b/PaddleNLP/dialogue_system/dialogue_general_understanding/train.py @@ -22,7 +22,6 @@ import sys import time import numpy as np -import paddle import paddle.fluid as fluid from dgu_net import create_net @@ -32,7 +31,6 @@ import dgu.define_paradigm as define_paradigm from dgu.utils.configure import PDConfig from dgu.utils.input_field import InputField from dgu.utils.model_check import check_cuda -import dgu.utils.save_load_io as save_load_io def do_train(args): @@ -80,8 +78,9 @@ def do_train(args): input_inst = [src_ids, pos_ids, sent_ids, input_mask, labels] input_field = InputField(input_inst) - - data_reader = fluid.io.DataLoader.from_generator(feed_list=input_inst, capacity=4, iterable=False) + + data_reader = fluid.io.DataLoader.from_generator( + feed_list=input_inst, capacity=4, iterable=False) processor = processors[task_name](data_dir=args.data_dir, vocab_path=args.vocab_path, @@ -103,13 +102,8 @@ def do_train(args): accuracy = results.get("accuracy", None) num_seqs = results.get("num_seqs", None) - loss.persistable = True - probs.persistable = True - if accuracy: - accuracy.persistable = True - num_seqs.persistable = True - - places = fluid.cuda_places() if args.use_cuda else fluid.cpu_places() + places = fluid.cuda_places() if args.use_cuda else fluid.cpu_places( + ) dev_count = len(places) batch_generator = processor.data_generator( @@ -149,16 +143,13 @@ def do_train(args): exe = fluid.Executor(place) exe.run(startup_prog) - assert (args.init_from_checkpoint == "") or ( - args.init_from_pretrain_model == "") + assert args.init_from_params or args.init_from_pretrain_model # init from some checkpoint, to resume the previous training - if args.init_from_checkpoint: - save_load_io.init_from_checkpoint(args, exe, train_prog) - - # init from some pretrain models, to better solve the current task + if args.init_from_params: + fluid.load(train_prog, args.init_from_params, exe) if args.init_from_pretrain_model: - save_load_io.init_from_pretrain_model(args, exe, train_prog) + fluid.load(train_prog, args.init_from_pretrain_model, exe) build_strategy = fluid.compiler.BuildStrategy() build_strategy.enable_inplace = True @@ -234,21 +225,16 @@ def do_train(args): time_begin = time.time() if steps % args.save_steps == 0: - save_path = "step_" + str(steps) - if args.save_checkpoint: - save_load_io.save_checkpoint(args, exe, train_prog, - save_path) - if args.save_param: - save_load_io.save_param(args, exe, train_prog, - save_path) + model_path = os.path.join(args.save_model_path, + "step_" + str(steps)) + fluid.save(train_prog, model_path) except fluid.core.EOFException: data_reader.reset() break - if args.save_checkpoint: - save_load_io.save_checkpoint(args, exe, train_prog, "step_final") - if args.save_param: - save_load_io.save_param(args, exe, train_prog, "step_final") + + model_path = os.path.join(args.save_model_path, "step_final") + fluid.save(train_prog, model_path) def get_cards(): num = 0 -- GitLab