diff --git a/BERT/README.md b/BERT/README.md
index a6ef69f3a1f00232c5a6926ae06fb10ea8fb93d5..bf7bdcf4ee8f2b1ac8c7cf6a34f48646a1657d69 100644
--- a/BERT/README.md
+++ b/BERT/README.md
@@ -122,8 +122,8 @@ export current_endpoint=192.168.0.17:9185
 For the [GLUE data](https://gluebenchmark.com/tasks), run this [script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) to download it; for the XNLI task, download the [XNLI dev/test set](https://bert-data.bj.bcebos.com/XNLI-1.0.zip) and the [XNLI machine-translated training set](https://bert-data.bj.bcebos.com/XNLI-MT-1.0.zip) separately, then unpack both into the same directory. Taking the XNLI task as an example, fine-tuning is launched as follows:
 
 ```shell
-export FLAGS_enable_parallel_graph=1
-export FLAGS_sync_nccl_allreduce=1
+export FLAGS_sync_nccl_allreduce=0
+export FLAGS_eager_delete_tensor_gb=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 
 BERT_BASE_PATH="chinese_L-12_H-768_A-12"
@@ -183,8 +183,8 @@ SQuAD v1.1
 For SQuAD v1.1, launch fine-tuning as follows:
 
 ```shell
-export FLAGS_enable_parallel_graph=1
-export FLAGS_sync_nccl_allreduce=1
+export FLAGS_sync_nccl_allreduce=0
+export FLAGS_eager_delete_tensor_gb=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 
 BERT_BASE_PATH="uncased_L-12_H-768_A-12"
@@ -229,6 +229,8 @@ python ${SQUAD_PATH}/evaluate-v1.1.py ${SQUAD_PATH}/dev-v1.1.json ${CHECKPOINT_P
 For SQuAD v2.0, launch fine-tuning as follows:
 
 ```shell
+export FLAGS_sync_nccl_allreduce=0
+export FLAGS_eager_delete_tensor_gb=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 BERT_BASE_PATH="uncased_L-12_H-768_A-12"
 CHECKPOINT_PATH=/path/to/save/checkpoints/
diff --git a/BERT/run_classifier.py b/BERT/run_classifier.py
index fd0c592789baaa9c99cd8964ba6eff4b3076dadc..e8583587e64b6d7bf67bdbfaf6150ee1be33502e 100644
--- a/BERT/run_classifier.py
+++ b/BERT/run_classifier.py
@@ -208,12 +208,6 @@ def main(args):
                     use_fp16=args.use_fp16,
                     loss_scaling=args.loss_scaling)
 
-        fluid.memory_optimize(
-            input_program=train_program,
-            skip_opt_set=[
-                loss.name, probs.name, accuracy.name, num_seqs.name
-            ])
-
         if args.verbose:
             if args.in_tokens:
                 lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
@@ -279,22 +273,11 @@ def main(args):
             train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                   train_data_generator)
 
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=args.use_cuda,
-            loss_name=loss.name,
-            exec_strategy=exec_strategy,
-            build_strategy = build_strategy,
-            main_program=train_program)
+        train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
+                 loss_name=loss.name, build_strategy=build_strategy)
 
         train_pyreader.decorate_tensor_provider(train_data_generator)
-    else:
-        train_exe = None
-
-    if args.do_val or args.do_test:
-        test_exe = fluid.ParallelExecutor(
-            use_cuda=args.use_cuda,
-            main_program=test_prog,
-            share_vars_from=train_exe)
 
     if args.do_train:
         train_pyreader.start()
@@ -317,7 +300,7 @@ def main(args):
                 else:
                     fetch_list = []
 
-                outputs = train_exe.run(fetch_list=fetch_list)
+                outputs = exe.run(train_compiled_program, fetch_list=fetch_list)
 
                 if steps % args.skip_steps == 0:
                     if warmup_steps <= 0:
diff --git a/BERT/run_squad.py b/BERT/run_squad.py
index 3d4a23f913a07a46c5b332cf23bd3cd3e97f18df..514b815878cc9862c2b1fa98f69bacebfc176fc0 100644
--- a/BERT/run_squad.py
+++ b/BERT/run_squad.py
@@ -279,7 +279,6 @@ def train(args):
                     use_fp16=args.use_fp16,
                     loss_scaling=args.loss_scaling)
 
-            fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
 
         if args.verbose:
             if args.in_tokens:
@@ -301,8 +300,6 @@ def train(args):
                     bert_config=bert_config,
                     is_training=False)
 
-            fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
-                                  start_logits.name, end_logits.name, num_seqs.name])
 
         test_prog = test_prog.clone(for_test=True)
 
@@ -341,11 +338,8 @@ def train(args):
     exec_strategy.num_threads = dev_count
     exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
 
-    train_exe = fluid.ParallelExecutor(
-        use_cuda=args.use_cuda,
-        loss_name=loss.name,
-        exec_strategy=exec_strategy,
-        main_program=train_program)
+    train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
+             loss_name=loss.name, exec_strategy=exec_strategy)
 
     train_pyreader.decorate_tensor_provider(train_data_generator)
 
@@ -366,7 +360,7 @@ def train(args):
             else:
                 fetch_list = []
 
-            outputs = train_exe.run(fetch_list=fetch_list)
+            outputs = exe.run(train_compiled_program, fetch_list=fetch_list)
 
             if steps % args.skip_steps == 0:
                 if warmup_steps <= 0:
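Taken together, the hunks drop the `fluid.memory_optimize` passes and the `fluid.ParallelExecutor`, and instead rely on the `FLAGS_eager_delete_tensor_gb` / `FLAGS_sync_nccl_allreduce` environment flags plus `fluid.CompiledProgram(...).with_data_parallel(...)` driven by the ordinary `fluid.Executor`. The snippet below is a minimal, self-contained sketch of that executor pattern, assuming the PaddlePaddle 1.x fluid API; the toy network, tensor names, and batch shapes are illustrative stand-ins for the repo's `train_program` and `loss`, not code from this PR.

```python
# Minimal sketch of the CompiledProgram pattern this diff migrates to.
# Assumes PaddlePaddle 1.x fluid; the one-layer regression below is a
# hypothetical stand-in for the BERT train_program.
import numpy as np
import paddle.fluid as fluid

train_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
    x = fluid.layers.data(name="x", shape=[13], dtype="float32")
    y = fluid.layers.data(name="y", shape=[1], dtype="float32")
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_program)

# Data parallelism now comes from compiling the program, replacing the
# removed fluid.ParallelExecutor construction in run_classifier.py / run_squad.py.
build_strategy = fluid.BuildStrategy()
exec_strategy = fluid.ExecutionStrategy()
train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
    loss_name=loss.name,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)

# The plain Executor runs the compiled program, mirroring the new
# exe.run(train_compiled_program, fetch_list=...) calls in the training loops.
feed = {"x": np.random.rand(8, 13).astype("float32"),
        "y": np.random.rand(8, 1).astype("float32")}
loss_val, = exe.run(train_compiled_program, feed=feed, fetch_list=[loss.name])
print("loss:", np.array(loss_val).mean())
```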