diff --git a/PaddleNLP/benchmark/transformer/configs/transformer.big.yaml b/PaddleNLP/benchmark/transformer/configs/transformer.big.yaml
index cd0719869a16434006442b0ed30068476f251bc4..27b26606bc3060f3698b08a13eb55facf6a9cbb2 100644
--- a/PaddleNLP/benchmark/transformer/configs/transformer.big.yaml
+++ b/PaddleNLP/benchmark/transformer/configs/transformer.big.yaml
@@ -96,4 +96,11 @@ dropout: 0.1
 # Vocabularies in source and target should be same for weight sharing.
 weight_sharing: True
 
+# Whether to use AMP (automatic mixed precision) training.
+use_amp: False
+scale_loss: 1.0
+
+# Whether to use multi-card/multi-node distributed training.
+is_distributed: True
+
 max_iter: None
diff --git a/PaddleNLP/benchmark/transformer/static/run_pretrain.sh b/PaddleNLP/benchmark/transformer/static/run_pretrain.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8136b0a2abc5818cdd10f51927a23ae98363e702
--- /dev/null
+++ b/PaddleNLP/benchmark/transformer/static/run_pretrain.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+python -m paddle.distributed.launch \
+    --gpus="0,1" \
+    train.py
diff --git a/PaddleNLP/benchmark/transformer/static/train.py b/PaddleNLP/benchmark/transformer/static/train.py
index 8736d9062268e21de42ef407d1546a3e728d6f17..84a00af605c68b39fe9bc1ec878463837b974330 100644
--- a/PaddleNLP/benchmark/transformer/static/train.py
+++ b/PaddleNLP/benchmark/transformer/static/train.py
@@ -10,6 +10,7 @@ from attrdict import AttrDict
 from pprint import pprint
 
 import paddle
+import paddle.distributed.fleet as fleet
 import paddle.distributed as dist
 
 from paddlenlp.transformers import TransformerModel, CrossEntropyCriterion
@@ -36,8 +37,14 @@ def parse_args():
 
 def do_train(args):
     paddle.enable_static()
-    places = paddle.static.cuda_places() if args.use_gpu else paddle.static.cpu_places()
-    trainer_count = len(places)
+    if args.is_distributed:
+        fleet.init(is_collective=True)
+        gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        places = paddle.CUDAPlace(gpu_id) if args.use_gpu else paddle.static.cpu_places()
+        trainer_count = 1 if args.use_gpu else len(places)
+    else:
+        places = paddle.static.cuda_places() if args.use_gpu else paddle.static.cpu_places()
+        trainer_count = len(places)
 
     # Set seed for CE
     random_seed = eval(str(args.random_seed))
@@ -88,19 +95,38 @@ def do_train(args):
             epsilon=float(args.eps),
             parameters=transformer.parameters())
 
+    if args.is_distributed:
+        build_strategy = paddle.static.BuildStrategy()
+        exec_strategy = paddle.static.ExecutionStrategy()
+        dist_strategy = fleet.DistributedStrategy()
+        dist_strategy.build_strategy = build_strategy
+        dist_strategy.execution_strategy = exec_strategy
+        dist_strategy.fuse_grad_size_in_MB = 16
+
+        if args.use_amp:
+            dist_strategy.amp = True
+            dist_strategy.amp_configs = {
+                'custom_white_list': ['softmax', 'layer_norm', 'gelu'],
+                'init_loss_scaling': args.scale_loss,
+            }
+
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
     optimizer.minimize(avg_cost)
 
-    exe = paddle.static.Executor()
+    if args.is_distributed:
+        exe = paddle.static.Executor(places)
+    else:
+        exe = paddle.static.Executor()
+        build_strategy = paddle.static.BuildStrategy()
+        exec_strategy = paddle.static.ExecutionStrategy()
+
+        compiled_train_program = paddle.static.CompiledProgram(
+            train_program).with_data_parallel(
+                loss_name=avg_cost.name,
+                build_strategy=build_strategy,
+                exec_strategy=exec_strategy)
     exe.run(startup_program)
 
-    build_strategy = paddle.static.BuildStrategy()
-    exec_strategy = paddle.static.ExecutionStrategy()
-
-    compiled_train_program = paddle.static.CompiledProgram(
-        train_program).with_data_parallel(
-            loss_name=avg_cost.name,
-            build_strategy=build_strategy,
-            exec_strategy=exec_strategy)
 
     # the best cross-entropy value with label smoothing
     loss_normalizer = -(
@@ -127,13 +153,22 @@ def do_train(args):
                 data = [data]
             train_reader_cost = time.time() - batch_start
 
-            outs = exe.run(compiled_train_program,
-                           feed=[{
-                               'src_word': data[i][0],
-                               'trg_word': data[i][1],
-                               'lbl_word': data[i][2],
-                           } for i in range(trainer_count)],
-                           fetch_list=[sum_cost.name, token_num.name])
+            if args.is_distributed:
+                outs = exe.run(train_program,
+                               feed=[{
+                                   'src_word': data[i][0],
+                                   'trg_word': data[i][1],
+                                   'lbl_word': data[i][2],
+                               } for i in range(trainer_count)],
+                               fetch_list=[sum_cost.name, token_num.name])
+            else:
+                outs = exe.run(compiled_train_program,
+                               feed=[{
+                                   'src_word': data[i][0],
+                                   'trg_word': data[i][1],
+                                   'lbl_word': data[i][2],
+                               } for i in range(trainer_count)],
+                               fetch_list=[sum_cost.name, token_num.name])
             scheduler.step()
 
             train_batch_cost = time.time() - batch_start
@@ -176,7 +211,7 @@ def do_train(args):
                     batch_ips_avg.reset()
 
             if step_idx % args.save_step == 0 and step_idx != 0:
-                if args.save_model:
+                if args.save_model and dist.get_rank() == 0:
                     model_path = os.path.join(
                         args.save_model, "step_" + str(step_idx), "transformer")
                     paddle.static.save(train_program, model_path)
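Note: the snippet below is not part of the patch. It is a minimal standalone sketch of the fleet collective-training plus AMP pattern that this change wires into static/train.py, assuming Paddle 2.0+ in static graph mode with at least one visible GPU. The toy linear model, random feed data, and hyper-parameters are illustrative placeholders only, not the benchmark's Transformer setup.

# Launch the same way as run_pretrain.sh, e.g.:
#   python -m paddle.distributed.launch --gpus="0,1" fleet_sketch.py
import os
import numpy as np
import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)  # one process per GPU, started by paddle.distributed.launch

train_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(train_program, startup_program):
    # Placeholder network; the real benchmark builds TransformerModel here.
    x = paddle.static.data(name="x", shape=[None, 16], dtype="float32")
    y = paddle.static.data(name="y", shape=[None, 1], dtype="float32")
    pred = paddle.static.nn.fc(x, size=1)
    avg_cost = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

    dist_strategy = fleet.DistributedStrategy()
    dist_strategy.fuse_grad_size_in_MB = 16
    dist_strategy.amp = True                 # mirrors use_amp in transformer.big.yaml
    dist_strategy.amp_configs = {
        "custom_white_list": ["softmax", "layer_norm", "gelu"],
        "init_loss_scaling": 1.0,            # mirrors scale_loss in transformer.big.yaml
    }

    optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
    optimizer.minimize(avg_cost)

# Each worker runs on the GPU selected by the launcher (FLAGS_selected_gpus).
gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
exe = paddle.static.Executor(paddle.CUDAPlace(gpu_id))
exe.run(startup_program)

# With fleet, the plain train_program is executed directly; no CompiledProgram
# with_data_parallel wrapper is needed, as in the patched train.py above.
feed = {"x": np.random.rand(8, 16).astype("float32"),
        "y": np.random.rand(8, 1).astype("float32")}
loss, = exe.run(train_program, feed=feed, fetch_list=[avg_cost.name])
print("loss:", float(loss))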