From c265768d9e7ad6bd44ed78633079ff309a6b6f4e Mon Sep 17 00:00:00 2001
From: wawltor
Date: Tue, 8 Dec 2020 22:21:12 +0800
Subject: [PATCH] Add the support of amp for the fleet train (#4996)

* add the support of amp for the fleet train

* remove the attribute for the amp
---
 PaddleNLP/benchmark/bert/run_pretrain.py      | 76 +++++++++++++++----
 .../benchmark/bert/run_pretrain_single.py     | 11 +--
 2 files changed, 63 insertions(+), 24 deletions(-)

diff --git a/PaddleNLP/benchmark/bert/run_pretrain.py b/PaddleNLP/benchmark/bert/run_pretrain.py
index 45e88381..ebbe4e2f 100644
--- a/PaddleNLP/benchmark/bert/run_pretrain.py
+++ b/PaddleNLP/benchmark/bert/run_pretrain.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import argparse
+import collections
+import itertools
 import os
 import random
 import time
@@ -21,6 +23,7 @@ from functools import partial
 from concurrent.futures import ThreadPoolExecutor
 
 import numpy as np
+import distutils.util
 
 import paddle
 import paddle.distributed.fleet as fleet
@@ -117,6 +120,22 @@ def parse_args():
         help="Save checkpoint every X updates steps.")
     parser.add_argument(
         "--seed", type=int, default=42, help="Random seed for initialization")
+    parser.add_argument(
+        "--use_amp",
+        type=distutils.util.strtobool,
+        default=False,
+        help="Enable mixed precision training.")
+    parser.add_argument(
+        "--enable_addto",
+        type=distutils.util.strtobool,
+        default=False,
+        help="Whether to enable the addto strategy for gradient accumulation or not. This is only used for AMP training."
+    )
+    parser.add_argument(
+        "--scale_loss",
+        type=float,
+        default=1.0,
+        help="The value of scale_loss for fp16.")
     args = parser.parse_args()
     return args
 
@@ -149,6 +168,26 @@ def reset_program_state_dict(model, state_dict):
     return new_state_dict
 
 
+def build_compiled_program(main_program, loss):
+    exec_strategy = paddle.static.ExecutionStrategy()
+    exec_strategy.num_threads = 1
+    exec_strategy.num_iteration_per_drop_scope = 10000
+    build_strategy = paddle.static.BuildStrategy()
+    build_strategy.enable_addto = args.enable_addto
+    main_program = paddle.static.CompiledProgram(
+        main_program).with_data_parallel(
+            loss_name=loss.name,
+            exec_strategy=exec_strategy,
+            build_strategy=build_strategy)
+    return main_program
+
+
+def set_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    paddle.seed(seed)
+
+
 class WorkerInitObj(object):
     def __init__(self, seed):
         self.seed = seed
@@ -158,12 +197,6 @@ class WorkerInitObj(object):
         random.seed(self.seed + id)
 
 
-def set_seed(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    paddle.seed(seed)
-
-
 def do_train(args):
     # Initialize the paddle and paddle fleet execute enviroment
     paddle.enable_static()
@@ -175,6 +208,8 @@ def do_train(args):
     worker_init = WorkerInitObj(args.seed + fleet.worker_index())
 
     # Define the input data in the static mode
+    main_program = paddle.static.default_main_program()
+    startup_program = paddle.static.default_startup_program()
 
     data_holders = create_data_holder(args)
     [
@@ -186,9 +221,10 @@ def do_train(args):
     args.model_type = args.model_type.lower()
     model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
-    model = BertForPretraining(
-        BertModel(**model_class.pretrained_init_configuration[
-            args.model_name_or_path]))
+    config = model_class.pretrained_init_configuration[args.model_name_or_path]
+    if config["vocab_size"] % 8 != 0:
+        config["vocab_size"] += 8 - (config["vocab_size"] % 8)
+    model = BertForPretraining(BertModel(**config))
     criterion = BertPretrainingCriterion(model.bert.config["vocab_size"])
     prediction_scores, seq_relationship_score = model(
         input_ids=input_ids,
@@ -219,7 +255,14 @@ def do_train(args):
             p.name for n, p in model.named_parameters()
             if not any(nd in n for nd in ["bias", "norm"])
         ])
-
+    if args.use_amp:
+        amp_list = paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
+            custom_white_list=['softmax'])
+        optimizer = paddle.fluid.contrib.mixed_precision.decorate(
+            optimizer,
+            amp_list,
+            init_loss_scaling=args.scale_loss,
+            use_dynamic_loss_scaling=True)
     # Use the fleet api to compile the distributed optimizer
     strategy = fleet.DistributedStrategy()
     optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
@@ -227,13 +270,14 @@ def do_train(args):
 
     # Define the Executor for running the static model
     exe = paddle.static.Executor(place)
-    exe.run(paddle.static.default_startup_program())
+    exe.run(startup_program)
     state_dict = model.state_dict()
 
     # Use the state dict to update the parameter
     reset_state_dict = reset_program_state_dict(model, state_dict)
-    paddle.static.set_program_state(paddle.static.default_main_program(),
-                                    reset_state_dict)
+    paddle.static.set_program_state(main_program, reset_state_dict)
+    # Construct the compiled program
+    main_program = build_compiled_program(main_program, loss)
 
     pool = ThreadPoolExecutor(1)
     global_step = 0
@@ -269,9 +313,9 @@ def do_train(args):
 
         for step, batch in enumerate(train_data_loader):
             global_step += 1
-            loss_return = exe.run(paddle.static.default_main_program(),\
-                          feed=batch,
-                          fetch_list=[loss])
+            loss_return = exe.run(main_program,
+                                  feed=batch,
+                                  fetch_list=[loss])
             # In the new 2.0 api, must call this function to change the learning_rate
             lr_scheduler.step()
             if global_step % args.logging_steps == 0:
diff --git a/PaddleNLP/benchmark/bert/run_pretrain_single.py b/PaddleNLP/benchmark/bert/run_pretrain_single.py
index 81a3524c..74a57a91 100644
--- a/PaddleNLP/benchmark/bert/run_pretrain_single.py
+++ b/PaddleNLP/benchmark/bert/run_pretrain_single.py
@@ -132,16 +132,11 @@ def parse_args():
         type=float,
         default=1.0,
         help="The value of scale_loss for fp16.")
-    parser.add_argument(
-        "--use_dynamic_loss_scaling",
-        type=distutils.util.strtobool,
-        default=True,
-        help="Whether to use dynamic loss scaling.")
     args = parser.parse_args()
     return args
 
 
-def construct_compiled_program(main_program, loss):
+def build_compiled_program(main_program, loss):
     exec_strategy = paddle.static.ExecutionStrategy()
     exec_strategy.num_threads = 1
     exec_strategy.num_iteration_per_drop_scope = 10000
@@ -238,7 +233,7 @@ def do_train(args):
             optimizer,
             amp_list,
             init_loss_scaling=args.scale_loss,
-            use_dynamic_loss_scaling=args.use_dynamic_loss_scaling)
+            use_dynamic_loss_scaling=True)
         optimizer.minimize(loss)
 
     # Define the Executor for running the static model
@@ -250,7 +245,7 @@ def do_train(args):
     reset_state_dict = reset_program_state_dict(model, state_dict)
     paddle.static.set_program_state(main_program, reset_state_dict)
     # Construct the compiled program
-    main_program = construct_compiled_program(main_program, loss)
+    main_program = build_compiled_program(main_program, loss)
     global_step = 0
     tic_train = time.time()
     epoch = 0
-- 
GitLab
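
Note: the snippet below is a minimal, self-contained sketch of the static-graph AMP plus fleet pattern that this patch wires into run_pretrain.py; it is not part of the patch itself. The toy network, optimizer, hyperparameters, and device selection are illustrative assumptions, and it would normally be started with python -m paddle.distributed.launch so that fleet sets up the collective environment.

# Sketch (assumptions noted above): decorate an optimizer with static-graph AMP,
# then wrap it with the fleet distributed optimizer, mirroring the use_amp branch
# the patch adds to run_pretrain.py.
import os
import numpy as np
import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)  # collective (GPU) training, as in run_pretrain.py

main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()

# Toy network standing in for BertForPretraining (illustrative only).
x = paddle.static.data(name="x", shape=[None, 16], dtype="float32")
y = paddle.static.data(name="y", shape=[None, 1], dtype="float32")
pred = paddle.static.nn.fc(x, size=1)
loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
# AMP decoration: the same API the patch calls; --scale_loss feeds init_loss_scaling there.
amp_list = paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
    custom_white_list=['softmax'])
optimizer = paddle.fluid.contrib.mixed_precision.decorate(
    optimizer,
    amp_list,
    init_loss_scaling=1.0,
    use_dynamic_loss_scaling=True)

# Hand the decorated optimizer to fleet, then minimize the loss on the main program.
strategy = fleet.DistributedStrategy()
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(loss)

# FLAGS_selected_gpus is set per worker by paddle.distributed.launch; default to GPU 0.
place = paddle.CUDAPlace(int(os.getenv("FLAGS_selected_gpus", "0")))
exe = paddle.static.Executor(place)
exe.run(startup_program)

out = exe.run(main_program,
              feed={"x": np.random.rand(4, 16).astype("float32"),
                    "y": np.random.rand(4, 1).astype("float32")},
              fetch_list=[loss])
print("loss:", out[0])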