diff --git a/PaddleNLP/README.md b/PaddleNLP/README.md index 5274a054a5114bc1b04cb90180fb98290dea8e25..4307bf57a301291df1e772d72b83f5fc480f3f25 100644 --- a/PaddleNLP/README.md +++ b/PaddleNLP/README.md @@ -84,7 +84,7 @@ cd models/PaddleNLP/sentiment_classification - [机器翻译](https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/neural_machine_translation/transformer) ### 语义表示与语言模型 - - [语言表示工具箱](https://github.com/PaddlePaddle/LARK/tree/develop) + - [语言表示工具箱](https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/language_representations_kit) - [语言模型](https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/language_model) ### 复杂任务 diff --git a/PaddleNLP/language_representations_kit/BERT/README.md b/PaddleNLP/language_representations_kit/BERT/README.md index 8bdd4fb9a1230c82e75614f9cbe7fc7b0b5642c2..2f38c2f1eb6b7e52558a6ced67250270469a5c53 100644 --- a/PaddleNLP/language_representations_kit/BERT/README.md +++ b/PaddleNLP/language_representations_kit/BERT/README.md @@ -18,6 +18,8 @@ | Model | Layers | Hidden size | Heads |Parameters | | :------| :------: | :------: |:------: |:------: | +| [BERT-Large, Uncased (Whole Word Masking)](https://bert-models.bj.bcebos.com/wwm_uncased_L-24_H-1024_A-16.tar.gz)| 24 | 1024 | 16 | 340M | +| [BERT-Large, Cased (Whole Word Masking)](https://bert-models.bj.bcebos.com/wwm_cased_L-24_H-1024_A-16.tar.gz)| 24 | 1024 | 16 | 340M | | [BERT-Base, Uncased](https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz) | 12 | 768 |12 |110M | | [BERT-Large, Uncased](https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz) | 24 | 1024 |16 |340M | |[BERT-Base, Cased](https://bert-models.bj.bcebos.com/cased_L-12_H-768_A-12.tar.gz)|12|768|12|110M| @@ -46,7 +48,7 @@ - [inference 接口调用示例](#inference-接口调用示例) ## 安装 -本项目依赖于 Paddle Fluid **1.3.1**,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。如果需要进行 TensorFlow 模型到 Paddle Fluid 参数的转换,则需要同时安装 TensorFlow 1.12。 +本项目依赖于 Paddle Fluid **1.5.1**,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。如果需要进行 TensorFlow 模型到 Paddle Fluid 参数的转换,则需要同时安装 TensorFlow 1.12。 ## 预训练 @@ -138,22 +140,24 @@ python -u run_classifier.py --task_name ${TASK_NAME} \ --do_train true \ --do_val true \ --do_test true \ - --batch_size 8192 \ - --in_tokens true \ + --batch_size 32 \ + --in_tokens false \ --init_pretraining_params ${BERT_BASE_PATH}/params \ --data_dir ${DATA_PATH} \ --vocab_path ${BERT_BASE_PATH}/vocab.txt \ --checkpoints ${CKPT_PATH} \ --save_steps 1000 \ --weight_decay 0.01 \ - --warmup_proportion 0.0 \ - --validation_steps 25 \ + --warmup_proportion 0.1 \ + --validation_steps 100 \ --epoch 3 \ - --max_seq_len 512 \ + --max_seq_len 128 \ --bert_config_path ${BERT_BASE_PATH}/bert_config.json \ - --learning_rate 1e-4 \ + --learning_rate 5e-5 \ --skip_steps 10 \ - --random_seed 1 + --num_iteration_per_drop_scope 10 \ + --use_fp16 true \ + --verbose true ``` 这里的 `chinese_L-12_H-768_A-12` 即是转换后的中文预训练模型。需要注意的是,BERT on PaddlePaddle 支持按两种方式构建一个 batch 的数据,`in_tokens` 参数影响 `batch_size` 参数的意义,如果 `in_tokens` 为 `true` 则按照 token 个数构建 batch, 如不设定则按照 example 个数来构建 batch. 训练过程中会输出训练误差、训练速度等信息,训练结束后会输出如下所示的在验证集上的测试结果: diff --git a/PaddleNLP/language_representations_kit/BERT/model/classifier.py b/PaddleNLP/language_representations_kit/BERT/model/classifier.py index daaf75e4b12f3457727c5deda05a811916c63e75..8df82a34f4cc76c1309ac69f98db33d8967ae991 100644 --- a/PaddleNLP/language_representations_kit/BERT/model/classifier.py +++ b/PaddleNLP/language_representations_kit/BERT/model/classifier.py @@ -22,22 +22,27 @@ import paddle.fluid as fluid from model.bert import BertModel -def create_model(args, - pyreader_name, - bert_config, - num_labels, - is_prediction=False): - pyreader = fluid.layers.py_reader( - capacity=50, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]], - dtypes=['int64', 'int64', 'int64', 'float32', 'int64'], - lod_levels=[0, 0, 0, 0, 0], - name=pyreader_name, - use_double_buffer=True) +def create_model(args, bert_config, num_labels, is_prediction=False): + input_fields = { + 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'labels'], + 'shapes': + [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]], + 'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64'], + 'lod_levels': [0, 0, 0, 0, 0], + } - (src_ids, pos_ids, sent_ids, input_mask, - labels) = fluid.layers.read_file(pyreader) + inputs = [ + fluid.layers.data( + name=input_fields['names'][i], + shape=input_fields['shapes'][i], + dtype=input_fields['dtypes'][i], + lod_level=input_fields['lod_levels'][i]) + for i in range(len(input_fields['names'])) + ] + (src_ids, pos_ids, sent_ids, input_mask, labels) = inputs + + pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False) bert = BertModel( src_ids=src_ids, diff --git a/PaddleNLP/language_representations_kit/BERT/predict_classifier.py b/PaddleNLP/language_representations_kit/BERT/predict_classifier.py index 6b4f37ac9249f6d1903632ed19ff83f56b388a70..0f0f842865549b6c6c2ba97dce37f9007e39c697 100644 --- a/PaddleNLP/language_representations_kit/BERT/predict_classifier.py +++ b/PaddleNLP/language_representations_kit/BERT/predict_classifier.py @@ -84,7 +84,6 @@ def main(args): with fluid.unique_name.guard(): predict_pyreader, probs, feed_target_names = create_model( args, - pyreader_name='predict_reader', bert_config=bert_config, num_labels=num_labels, is_prediction=True) @@ -103,7 +102,7 @@ def main(args): exe.run(predict_startup) if args.init_checkpoint: - init_pretraining_params(exe, args.init_checkpoint, predict_prog) + init_pretraining_params(exe, args.init_checkpoint, predict_prog, args.use_fp16) else: raise ValueError("args 'init_checkpoint' should be set for prediction!") @@ -113,7 +112,7 @@ def main(args): predict_exe = fluid.ParallelExecutor( use_cuda=args.use_cuda, main_program=predict_prog) - predict_pyreader.decorate_tensor_provider( + predict_pyreader.decorate_batch_generator( processor.data_generator( batch_size=args.batch_size, phase='test', epoch=1, shuffle=False)) diff --git a/PaddleNLP/language_representations_kit/BERT/run_classifier.py b/PaddleNLP/language_representations_kit/BERT/run_classifier.py index e8583587e64b6d7bf67bdbfaf6150ee1be33502e..1fa1df6f3e2c93a2fbe79f2fe73998bee56ed7db 100644 --- a/PaddleNLP/language_representations_kit/BERT/run_classifier.py +++ b/PaddleNLP/language_representations_kit/BERT/run_classifier.py @@ -193,7 +193,6 @@ def main(args): with fluid.unique_name.guard(): train_pyreader, loss, probs, accuracy, num_seqs = create_model( args, - pyreader_name='train_reader', bert_config=bert_config, num_labels=num_labels) scheduled_lr = optimization( @@ -219,17 +218,41 @@ def main(args): print("Theoretical memory usage in training: %.3f - %.3f %s" % (lower_mem, upper_mem, unit)) - if args.do_val or args.do_test: + if args.do_val: + dev_prog = fluid.Program() + with fluid.program_guard(dev_prog, startup_prog): + with fluid.unique_name.guard(): + dev_pyreader, loss, probs, accuracy, num_seqs = create_model( + args, + bert_config=bert_config, + num_labels=num_labels) + + dev_prog = dev_prog.clone(for_test=True) + dev_pyreader.decorate_batch_generator( + processor.data_generator( + batch_size=args.batch_size, + phase='dev', + epoch=1, + dev_count=1, + shuffle=False), place) + + if args.do_test: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, loss, probs, accuracy, num_seqs = create_model( args, - pyreader_name='test_reader', bert_config=bert_config, num_labels=num_labels) test_prog = test_prog.clone(for_test=True) + test_pyreader.decorate_batch_generator( + processor.data_generator( + batch_size=args.batch_size, + phase='test', + epoch=1, + dev_count=1, + shuffle=False), place) exe.run(startup_prog) @@ -276,7 +299,7 @@ def main(args): train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy) - train_pyreader.decorate_tensor_provider(train_data_generator) + train_pyreader.decorate_batch_generator(train_data_generator, place) if args.do_train: @@ -350,25 +373,11 @@ def main(args): throughput = [] # evaluate dev set if args.do_val: - test_pyreader.decorate_tensor_provider( - processor.data_generator( - batch_size=args.batch_size, - phase='dev', - epoch=1, - dev_count=1, - shuffle=False)) - evaluate(exe, test_prog, test_pyreader, + evaluate(exe, dev_prog, dev_pyreader, [loss.name, accuracy.name, num_seqs.name], "dev") # evaluate test set if args.do_test: - test_pyreader.decorate_tensor_provider( - processor.data_generator( - batch_size=args.batch_size, - phase='test', - epoch=1, - dev_count=1, - shuffle=False)) evaluate(exe, test_prog, test_pyreader, [loss.name, accuracy.name, num_seqs.name], "test") @@ -398,23 +407,12 @@ def main(args): # final eval on dev set if args.do_val: - test_pyreader.decorate_tensor_provider( - processor.data_generator( - batch_size=args.batch_size, phase='dev', epoch=1, dev_count=1, - shuffle=False)) print("Final validation result:") - evaluate(exe, test_prog, test_pyreader, + evaluate(exe, dev_prog, dev_pyreader, [loss.name, accuracy.name, num_seqs.name], "dev") # final eval on test set if args.do_test: - test_pyreader.decorate_tensor_provider( - processor.data_generator( - batch_size=args.batch_size, - phase='test', - epoch=1, - dev_count=1, - shuffle=False)) print("Final test result:") evaluate(exe, test_prog, test_pyreader, [loss.name, accuracy.name, num_seqs.name], "test") diff --git a/PaddleNLP/language_representations_kit/BERT/run_squad.py b/PaddleNLP/language_representations_kit/BERT/run_squad.py index 514b815878cc9862c2b1fa98f69bacebfc176fc0..211a35fb51b5e6700fd84d2926dd6f5f2e33fe30 100644 --- a/PaddleNLP/language_representations_kit/BERT/run_squad.py +++ b/PaddleNLP/language_representations_kit/BERT/run_squad.py @@ -92,31 +92,39 @@ run_type_g.add_arg("do_predict", bool, True, "Whether to pe args = parser.parse_args() # yapf: enable. -def create_model(pyreader_name, bert_config, is_training=False): +def create_model(bert_config, is_training=False): if is_training: - pyreader = fluid.layers.py_reader( - capacity=50, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + input_fields = { + 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'start_positions', 'end_positions'], + 'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]], - dtypes=[ + 'dtypes': [ 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'], - lod_levels=[0, 0, 0, 0, 0, 0], - name=pyreader_name, - use_double_buffer=True) - (src_ids, pos_ids, sent_ids, input_mask, start_positions, - end_positions) = fluid.layers.read_file(pyreader) + 'lod_levels': [0, 0, 0, 0, 0, 0], + } else: - pyreader = fluid.layers.py_reader( - capacity=50, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + input_fields = { + 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'unique_id'], + 'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]], - dtypes=['int64', 'int64', 'int64', 'float32', 'int64'], - lod_levels=[0, 0, 0, 0, 0], - name=pyreader_name, - use_double_buffer=True) - (src_ids, pos_ids, sent_ids, input_mask, unique_id) = fluid.layers.read_file(pyreader) + 'dtypes': [ + 'int64', 'int64', 'int64', 'float32', 'int64'], + 'lod_levels': [0, 0, 0, 0, 0], + } + + inputs = [fluid.layers.data(name=input_fields['names'][i], + shape=input_fields['shapes'][i], + dtype=input_fields['dtypes'][i], + lod_level=input_fields['lod_levels'][i]) for i in range(len(input_fields['names']))] + + pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False) + + if is_training: + (src_ids, pos_ids, sent_ids, input_mask, start_positions, end_positions) = inputs + else: + (src_ids, pos_ids, sent_ids, input_mask, unique_id) = inputs bert = BertModel( src_ids=src_ids, @@ -263,7 +271,6 @@ def train(args): with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, loss, num_seqs = create_model( - pyreader_name='train_reader', bert_config=bert_config, is_training=True) @@ -296,7 +303,6 @@ def train(args): with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, unique_ids, start_logits, end_logits, num_seqs = create_model( - pyreader_name='test_reader', bert_config=bert_config, is_training=False) @@ -341,7 +347,7 @@ def train(args): train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel( loss_name=loss.name, exec_strategy=exec_strategy) - train_pyreader.decorate_tensor_provider(train_data_generator) + train_pyreader.decorate_batch_generator(train_data_generator, place) train_pyreader.start() steps = 0 @@ -402,14 +408,14 @@ def train(args): break if args.do_predict: - test_pyreader.decorate_tensor_provider( + test_pyreader.decorate_batch_generator( processor.data_generator( data_path=args.predict_file, batch_size=args.batch_size, phase='predict', shuffle=False, dev_count=1, - epoch=1)) + epoch=1), place) predict(exe, test_prog, test_pyreader, [ unique_ids.name, start_logits.name, end_logits.name, num_seqs.name diff --git a/PaddleNLP/language_representations_kit/BERT/train.py b/PaddleNLP/language_representations_kit/BERT/train.py index 2d95568e27b446bd10932ef902a4110a32b32f02..85833f57fccce7e2bf84f7c8e067bd88244d2714 100644 --- a/PaddleNLP/language_representations_kit/BERT/train.py +++ b/PaddleNLP/language_representations_kit/BERT/train.py @@ -82,21 +82,24 @@ args = parser.parse_args() # yapf: enable. -def create_model(pyreader_name, bert_config): - pyreader = fluid.layers.py_reader( - capacity=70, - shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], +def create_model(bert_config): + input_fields = { + 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'mask_label', 'mask_pos', 'labels'], + 'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], - [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], - [-1, 1]], - dtypes=[ - 'int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64' - ], - lod_levels=[0, 0, 0, 0, 0, 0, 0], - name=pyreader_name, - use_double_buffer=True) + [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1]], + 'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'], + 'lod_levels': [0, 0, 0, 0, 0, 0, 0], + } - (src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels) = fluid.layers.read_file(pyreader) + inputs = [fluid.layers.data(name=input_fields['names'][i], + shape=input_fields['shapes'][i], + dtype=input_fields['dtypes'][i], + lod_level=input_fields['lod_levels'][i]) for i in range(len(input_fields['names']))] + + (src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels) = inputs + + pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False) bert = BertModel( src_ids=src_ids, @@ -143,7 +146,7 @@ def predict_wrapper(args, def predict(exe=exe, pyreader=pyreader): - pyreader.decorate_tensor_provider(data_reader.data_generator()) + pyreader.decorate_batch_generator(data_reader.data_generator()) pyreader.start() cost = 0 @@ -181,7 +184,7 @@ def test(args): with fluid.program_guard(test_prog, test_startup): with fluid.unique_name.guard(): test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model( - pyreader_name='test_reader', bert_config=bert_config) + bert_config=bert_config) test_prog = test_prog.clone(for_test=True) @@ -216,7 +219,7 @@ def train(args): with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model( - pyreader_name='train_reader', bert_config=bert_config) + bert_config=bert_config) scheduled_lr = optimization( loss=total_loss, warmup_steps=args.warmup_steps, @@ -229,17 +232,11 @@ def train(args): use_fp16=args.use_fp16, loss_scaling=args.loss_scaling) - fluid.memory_optimize( - input_program=train_program, - skip_opt_set=[ - next_sent_acc.name, mask_lm_loss.name, total_loss.name - ]) - test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model( - pyreader_name='test_reader', bert_config=bert_config) + bert_config=bert_config) test_prog = test_prog.clone(for_test=True) @@ -313,18 +310,16 @@ def train(args): exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope + build_strategy = fluid.BuildStrategy() + build_strategy.num_trainers = nccl2_num_trainers + build_strategy.trainer_id = nccl2_trainer_id # use_ngraph is for CPU only, please refer to README_ngraph.md for details use_ngraph = os.getenv('FLAGS_use_ngraph') if not use_ngraph: - train_exe = fluid.ParallelExecutor( - use_cuda=args.use_cuda, - loss_name=total_loss.name, - exec_strategy=exec_strategy, - main_program=train_program, - num_trainers=nccl2_num_trainers, - trainer_id=nccl2_trainer_id) - else: - train_exe = exe + train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel( + loss_name=total_loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) if args.validation_set_dir and args.validation_set_dir != "": predict = predict_wrapper( @@ -337,7 +332,7 @@ def train(args): next_sent_acc.name, mask_lm_loss.name, total_loss.name ]) - train_pyreader.decorate_tensor_provider(data_reader.data_generator()) + train_pyreader.decorate_batch_generator(data_reader.data_generator()) train_pyreader.start() steps = 0 cost = [] @@ -351,28 +346,28 @@ def train(args): if nccl2_trainer_id != 0: if use_ngraph: - train_exe.run(fetch_list=[], program=train_program) + exe.run(fetch_list=[], program=train_program) else: - train_exe.run(fetch_list=[]) + exe.run(fetch_list=[], program=train_compiled_program) continue if steps % skip_steps != 0: if use_ngraph: - train_exe.run(fetch_list=[], program=train_program) + exe.run(fetch_list=[], program=train_program) else: - train_exe.run(fetch_list=[]) + exe.run(fetch_list=[], program=train_compiled_program) else: if use_ngraph: - each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run( + each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = exe.run( fetch_list=[ next_sent_acc.name, mask_lm_loss.name, total_loss.name, scheduled_lr.name], program=train_program) else: - each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run( + each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = exe.run( fetch_list=[ next_sent_acc.name, mask_lm_loss.name, total_loss.name, - scheduled_lr.name]) + scheduled_lr.name], program=train_compiled_program) acc.extend(each_next_acc) lm_cost.extend(each_mask_lm_cost)