diff --git a/BERT/dist_utils.py b/BERT/dist_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..94cf415d0da5bba9ab68c7a7503b55fe68c095c4 --- /dev/null +++ b/BERT/dist_utils.py @@ -0,0 +1,45 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import paddle.fluid as fluid + +def nccl2_prepare(trainer_id, startup_prog, main_prog): + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile(trainer_id, + trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'), + current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'), + startup_program=startup_prog, + program=main_prog) + +def prepare_for_multi_process(exe, build_strategy, train_prog, startup_prog): + # prepare for multi-process + trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0)) + num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) + print("PADDLE_TRAINERS_NUM", num_trainers) + print("PADDLE_TRAINER_ID", trainer_id) + build_strategy.num_trainers = num_trainers + build_strategy.trainer_id = trainer_id + # NOTE(zcd): use multi processes to train the model, + # and each process use one GPU card. + if num_trainers > 1: + nccl2_prepare(trainer_id, + startup_prog, train_prog) + # the startup_prog are run two times, but it doesn't matter. + exe.run(startup_prog) diff --git a/BERT/run_classifier.py b/BERT/run_classifier.py index 948ec3bc9eaf60f5c9fbcaad5b2cb856f8261133..f2ca68c4728b9c297c1ba25024dba51ab6f7acc5 100644 --- a/BERT/run_classifier.py +++ b/BERT/run_classifier.py @@ -32,6 +32,8 @@ from model.classifier import create_model from optimization import optimization from utils.args import ArgumentGroup, print_arguments from utils.init import init_pretraining_params, init_checkpoint +import dist_utils + # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -107,6 +109,21 @@ def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase): (eval_phase, np.sum(total_cost) / np.sum(total_num_seqs), np.sum(total_acc) / np.sum(total_num_seqs), time_end - time_begin)) +def get_device_num(): + visible_device = os.getenv('CUDA_VISIBLE_DEVICES') + # NOTE(zcd): use multi processes to train the model, + # and each process use one GPU card. + num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) + if num_trainers > 1 : return 1 + if visible_device: + device_num = len(visible_device.split(',')) + else: + device_num = subprocess.check_output(['nvidia-smi','-L']).decode().count('\n') + return device_num + +def update_lr(args): + num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) + args.learning_rate = args.learning_rate / num_trainers def main(args): bert_config = BertConfig(args.bert_config_path) @@ -114,12 +131,14 @@ def main(args): if args.use_cuda: place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) - dev_count = fluid.core.get_cuda_device_count() + dev_count = get_device_num() # fluid.core.get_cuda_device_count() else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) + update_lr(args) + task_name = args.task_name.lower() processors = { 'xnli': reader.XnliProcessor, @@ -250,7 +269,9 @@ def main(args): exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope build_strategy = fluid.BuildStrategy() - + + dist_utils.prepare_for_multi_process(exe, build_strategy, train_program, startup_prog) + train_exe = fluid.ParallelExecutor( use_cuda=args.use_cuda, loss_name=loss.name,