Commit 70db68a5 authored by liuyibing01

Use gc & compiled program for fine-tuning

Parent 87d3c630
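This commit drops the deprecated build-time `fluid.memory_optimize` pass in favor of runtime garbage collection (enabled via the `FLAGS_eager_delete_tensor_gb` flag) and replaces `fluid.ParallelExecutor` with a `fluid.CompiledProgram` run through the plain `fluid.Executor`. A minimal sketch of the new pattern, assuming a `train_program`, `startup_program`, `loss`, and batch reader built the way the fine-tuning scripts build them:

```python
import paddle.fluid as fluid

def compile_and_train(train_program, startup_program, loss, reader):
    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_program)  # initialize parameters once

    # CompiledProgram + with_data_parallel takes over the multi-device
    # scheduling that ParallelExecutor used to do; the same plain
    # Executor then runs the compiled program.
    build_strategy = fluid.BuildStrategy()
    compiled = fluid.CompiledProgram(train_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

    for feed in reader():
        loss_val, = exe.run(compiled, feed=feed, fetch_list=[loss.name])
```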
@@ -122,8 +122,8 @@ export current_endpoint=192.168.0.17:9185
 For the [GLUE data](https://gluebenchmark.com/tasks), run this [script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) to download it; for the XNLI task, download the [XNLI dev/test set](https://bert-data.bj.bcebos.com/XNLI-1.0.zip) and the [XNLI machine-translated training set](https://bert-data.bj.bcebos.com/XNLI-MT-1.0.zip) separately, then unpack them into the same directory. Taking the XNLI task as an example, Fine-tuning is launched as follows:
 ```shell
-export FLAGS_enable_parallel_graph=1
-export FLAGS_sync_nccl_allreduce=1
+export FLAGS_sync_nccl_allreduce=0
+export FLAGS_eager_delete_tensor_gb=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 BERT_BASE_PATH="chinese_L-12_H-768_A-12"
@@ -183,8 +183,8 @@ SQuAD v1.1
 For SQuAD v1.1, launch Fine-tuning as follows:
 ```shell
-export FLAGS_enable_parallel_graph=1
-export FLAGS_sync_nccl_allreduce=1
+export FLAGS_sync_nccl_allreduce=0
+export FLAGS_eager_delete_tensor_gb=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 BERT_BASE_PATH="uncased_L-12_H-768_A-12"
@@ -229,6 +229,8 @@ python ${SQUAD_PATH}/evaluate-v1.1.py ${SQUAD_PATH}/dev-v1.1.json ${CHECKPOINT_P
 For SQuAD v2.0, launch Fine-tuning as follows:
 ```shell
+export FLAGS_sync_nccl_allreduce=0
+export FLAGS_eager_delete_tensor_gb=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 BERT_BASE_PATH="uncased_L-12_H-768_A-12"
 CHECKPOINT_PATH=/path/to/save/checkpoints/
......
@@ -208,12 +208,6 @@ def main(args):
                 use_fp16=args.use_fp16,
                 loss_scaling=args.loss_scaling)
 
-        fluid.memory_optimize(
-            input_program=train_program,
-            skip_opt_set=[
-                loss.name, probs.name, accuracy.name, num_seqs.name
-            ])
-
     if args.verbose:
         if args.in_tokens:
             lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
@@ -279,22 +273,11 @@ def main(args):
             train_data_generator = fluid.contrib.reader.distributed_batch_reader(
                 train_data_generator)
 
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=args.use_cuda,
-            loss_name=loss.name,
-            exec_strategy=exec_strategy,
-            build_strategy=build_strategy,
-            main_program=train_program)
+        train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
 
         train_pyreader.decorate_tensor_provider(train_data_generator)
-    else:
-        train_exe = None
-
-    if args.do_val or args.do_test:
-        test_exe = fluid.ParallelExecutor(
-            use_cuda=args.use_cuda,
-            main_program=test_prog,
-            share_vars_from=train_exe)
 
     if args.do_train:
         train_pyreader.start()
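The removed `test_exe` has no visible replacement in this hunk; a common pattern with compiled programs (an assumption on my part, not shown in the diff) is to compile the test program so it shares parameters with the training one:

```python
# Hypothetical evaluation setup sharing weights with training:
test_compiled_program = fluid.CompiledProgram(test_prog).with_data_parallel(
    share_vars_from=train_compiled_program)

# ...later, inside the eval loop:
outputs = exe.run(test_compiled_program, fetch_list=fetch_list)
```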
@@ -317,7 +300,7 @@ def main(args):
             else:
                 fetch_list = []
 
-            outputs = train_exe.run(fetch_list=fetch_list)
+            outputs = exe.run(train_compiled_program, fetch_list=fetch_list)
 
             if steps % args.skip_steps == 0:
                 if warmup_steps <= 0:
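Putting this file's pieces together, the loop around the changed `run` call follows the usual py_reader pattern; a condensed sketch (the try/except part is my assumption, since it sits outside the shown hunks):

```python
train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.start()
try:
    while True:
        # The started py_reader feeds data, so run() needs no feed dict;
        # fetch_list may be empty on steps where nothing is logged.
        outputs = exe.run(train_compiled_program, fetch_list=fetch_list)
except fluid.core.EOFException:
    train_pyreader.reset()  # generator exhausted; reset before reuse
```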
......
@@ -279,7 +279,6 @@ def train(args):
             use_fp16=args.use_fp16,
             loss_scaling=args.loss_scaling)
 
-        fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])
 
         if args.verbose:
             if args.in_tokens:
@@ -301,8 +300,6 @@ def train(args):
                 bert_config=bert_config,
                 is_training=False)
 
-            fluid.memory_optimize(test_prog, skip_opt_set=[unique_ids.name,
-                start_logits.name, end_logits.name, num_seqs.name])
 
             test_prog = test_prog.clone(for_test=True)
@@ -341,11 +338,8 @@ def train(args):
         exec_strategy.num_threads = dev_count
         exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope
 
-        train_exe = fluid.ParallelExecutor(
-            use_cuda=args.use_cuda,
-            loss_name=loss.name,
-            exec_strategy=exec_strategy,
-            main_program=train_program)
+        train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
+            loss_name=loss.name, exec_strategy=exec_strategy)
 
         train_pyreader.decorate_tensor_provider(train_data_generator)
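Note that this script passes an `exec_strategy` where the classifier script above passed a `build_strategy`; `with_data_parallel` accepts both at once, so a combined form (illustrative, not what either file actually does) would be:

```python
train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
    loss_name=loss.name,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)
```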
@@ -366,7 +360,7 @@ def train(args):
                 else:
                     fetch_list = []
 
-                outputs = train_exe.run(fetch_list=fetch_list)
+                outputs = exe.run(train_compiled_program, fetch_list=fetch_list)
 
                 if steps % args.skip_steps == 0:
                     if warmup_steps <= 0:
......