From d9db94d75280680630f683b3efd45d5c226a9792 Mon Sep 17 00:00:00 2001 From: Jie Fang Date: Thu, 19 Sep 2019 09:26:35 +0800 Subject: [PATCH] Optimize amp for multi-gpu to enable FP16 gradients transfer across gpus. (#19714) Optimize amp for multi-gpu to enable FP16 gradients transfer across gpus --- paddle/fluid/API.spec | 2 +- .../contrib/mixed_precision/decorator.py | 17 ++- .../contrib/mixed_precision/fp16_lists.py | 9 ++ .../contrib/mixed_precision/fp16_utils.py | 72 +++++++++- .../tests/test_image_classification_fp16.py | 127 ++++++++++++++++++ 5 files changed, 212 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index f84a87ab825..9cac2908ba4 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -508,7 +508,7 @@ paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'loca paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a')) paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a')) paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4')) -paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, False)), ('document', '5f118631fc8632afb981b3a26daae731')) +paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, True)), ('document', '5f118631fc8632afb981b3a26daae731')) paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists ('paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists', ('document', 'c116ec6bb5d30998792daea8db21ee40')) paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists.__init__ (ArgSpec(args=['self', 'custom_white_list', 'custom_black_list'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.fused_elemwise_activation (ArgSpec(args=['x', 'y', 'functor_list', 'axis', 'scale', 'save_intermediate_out'], varargs=None, keywords=None, defaults=(-1, 0.0, True)), ('document', '1c4b247a2858cea8d9d8750693688270')) diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index 83a75699836..4ca4a8972c7 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -18,6 +18,7 @@ from ... import layers from ... import unique_name from . 
import fp16_utils from .fp16_utils import update_loss_scaling, rewrite_program +from .fp16_utils import update_role_var_grad from .fp16_lists import AutoMixedPrecisionLists __all__ = ["decorate"] @@ -124,15 +125,17 @@ class OptimizerWithMixedPrecison(object): """ rewrite_program(self._train_program, self._amp_lists) scaled_loss = loss * self._loss_scaling - self._param_grads = self._optimizer.backward( + self._params_grads = self._optimizer.backward( scaled_loss, startup_program, parameter_list, no_grad_set, callbacks) - scaled_params_grad = [] - for p, g in self._param_grads: - scaled_g = g / self._loss_scaling - scaled_params_grad.append([p, scaled_g]) + update_role_var_grad(self._train_program, self._params_grads) + scaled_params_grads = [] + for p, g in self._params_grads: + with self._train_program._optimized_guard([p, g]): + scaled_g = g / self._loss_scaling + scaled_params_grads.append([p, scaled_g]) - return scaled_params_grad, scaled_loss + return scaled_params_grads, scaled_loss def apply_gradients(self, scaled_params_grads): """ @@ -209,7 +212,7 @@ def decorate(optimizer, decr_every_n_nan_or_inf=2, incr_ratio=2.0, decr_ratio=0.8, - use_dynamic_loss_scaling=False): + use_dynamic_loss_scaling=True): """ Decorate the given optimizer to adapt to the mixed-precision training. diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 44a2497045d..75f90cabfff 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -41,15 +41,24 @@ class AutoMixedPrecisionLists(object): """ Update black and white list according to users' custom list. """ + if self._custom_white_list and self._custom_black_list: + for op_name in self._custom_white_list: + if op_name in self._custom_black_list: + raise ValueError("Custom white list overlap " + "custom black list") if self._custom_white_list: for op_name in self._custom_white_list: if op_name in self.black_list: self.black_list.remove(op_name) + elif op_name in self.gray_list: + self.gray_list.remove(op_name) self.white_list.add(op_name) if self._custom_black_list: for op_name in self._custom_black_list: if op_name in self.white_list: self.white_list.remove(op_name) + elif op_name in self.gray_list: + self.gray_list.remove(op_name) self.black_list.add(op_name) diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 52fd2ba9ca1..05dfe273035 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -129,12 +129,30 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): return num_cast_ops -def find_true_prev_op(ops, var_name): +def find_true_prev_op(ops, cur_op, var_name): + """ + Find the true prev op that outputs var_name variable. + + Args: + ops (list): A list of ops. + cur_op (Operator): Current operator which has var_name variable. + var_name (string): Variable name. 
+ """ + prev_op = [] for op in ops: + if op == cur_op: + break for out_name in op.output_names: for out_var_name in op.output(out_name): if out_var_name == var_name: - return op + prev_op.append(op) + if prev_op: + if not len(prev_op) == 1: + raise ValueError("There must be only one previous op " + "that outputs {0} variable".format(var_name)) + else: + return prev_op[0] + return None def rewrite_program(main_prog, amp_lists): @@ -161,8 +179,7 @@ def rewrite_program(main_prog, amp_lists): ops = block.ops white_op_set = set() black_op_set = set() - for i in range(len(ops)): - op = ops[i] + for op in ops: if op.type in amp_lists.black_list: black_op_set.add(op) elif op.type in amp_lists.white_list: @@ -178,15 +195,17 @@ def rewrite_program(main_prog, amp_lists): # this in_var isn't the output of other op if in_var.op is None: continue - if in_var.op is op: - prev_op = find_true_prev_op(ops, in_var_name) + elif in_var.op is op: + prev_op = find_true_prev_op(ops, op, in_var_name) + if prev_op is None: + continue else: prev_op = in_var.op # if it's one of inputs if prev_op in black_op_set or \ prev_op.type in amp_lists.black_list: is_black_op = True - if prev_op in white_op_set or \ + elif prev_op in white_op_set or \ prev_op.type in amp_lists.white_list: is_white_op = True if is_black_op: @@ -218,6 +237,45 @@ def rewrite_program(main_prog, amp_lists): idx += num_cast_ops + 1 +def update_role_var_grad(main_prog, params_grads): + """ + Update op_role_var attr for some ops to make sure the gradients + transfered across gpus is FP16. + 1. Check whether the op that outputs gradient is cast or not. + 2. If op is cast and gradient is FP32, remove the op_role_var + and find the prev op which outputs FP16 gradient + 3. Update the op_role_var of the prev op. + + Args: + main_prog (Program): The main program for training. + params_grads (list): A list of params and grads. 
+ """ + block = main_prog.global_block() + BACKWARD = core.op_proto_and_checker_maker.OpRole.Backward + OPTIMIZE = core.op_proto_and_checker_maker.OpRole.Optimize + for p, g in params_grads: + op = g.op + if g.dtype == core.VarDesc.VarType.FP32 and op.type == 'cast': + role = op.attr('op_role') + if role & int(BACKWARD) and op.has_attr('op_role_var'): + op.desc.remove_attr("op_role_var") + else: + raise ValueError("The cast op {0} must be in BACKWARD role " + "and have op_role_var attr.".format(op)) + + fp16_grad_name = op.input(op.input_names[0])[0] + op_for_fp16_grad = find_true_prev_op(block.ops, op, fp16_grad_name) + op_role_var_attr_name = \ + core.op_proto_and_checker_maker.kOpRoleVarAttrName() + attr_val = [p.name, fp16_grad_name] + if op_for_fp16_grad.has_attr(op_role_var_attr_name): + attr_val.extend(op_for_fp16_grad.attr(op_role_var_attr_name)) + op_for_fp16_grad._set_attr(op_role_var_attr_name, attr_val) + + # maximize the allreduce overlap + op._set_attr('op_role', OPTIMIZE) + + def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps, num_bad_steps, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio): diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py index 982e380c7e7..e556f4d07d1 100644 --- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py +++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py @@ -22,6 +22,7 @@ import sys import numpy import unittest import os +import copy import numpy as np @@ -266,6 +267,132 @@ def main(net_type, use_cuda, is_local=True): class TestImageClassification(unittest.TestCase): + def test_amp_lists(self): + white_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.white_list) + black_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.black_list) + gray_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.gray_list) + + amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists() + self.assertEqual(amp_lists.white_list, white_list) + self.assertEqual(amp_lists.black_list, black_list) + self.assertEqual(amp_lists.gray_list, gray_list) + + def test_amp_lists_1(self): + white_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.white_list) + black_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.black_list) + gray_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.gray_list) + + # 1. w={'exp}, b=None + white_list.add('exp') + black_list.remove('exp') + + amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists( + {'exp'}) + self.assertEqual(amp_lists.white_list, white_list) + self.assertEqual(amp_lists.black_list, black_list) + self.assertEqual(amp_lists.gray_list, gray_list) + + def test_amp_lists_2(self): + white_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.white_list) + black_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.black_list) + gray_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.gray_list) + + # 2. 
w={'tanh'}, b=None + white_list.add('tanh') + gray_list.remove('tanh') + + amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists( + {'tanh'}) + self.assertEqual(amp_lists.white_list, white_list) + self.assertEqual(amp_lists.black_list, black_list) + self.assertEqual(amp_lists.gray_list, gray_list) + + def test_amp_lists_3(self): + white_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.white_list) + black_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.black_list) + gray_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.gray_list) + + # 3. w={'lstm'}, b=None + white_list.add('lstm') + + amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists( + {'lstm'}) + self.assertEqual(amp_lists.white_list, white_list) + self.assertEqual(amp_lists.black_list, black_list) + self.assertEqual(amp_lists.gray_list, gray_list) + + def test_amp_lists_4(self): + white_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.white_list) + black_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.black_list) + gray_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.gray_list) + + # 4. w=None, b={'conv2d'} + white_list.remove('conv2d') + black_list.add('conv2d') + + amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists( + custom_black_list={'conv2d'}) + self.assertEqual(amp_lists.white_list, white_list) + self.assertEqual(amp_lists.black_list, black_list) + self.assertEqual(amp_lists.gray_list, gray_list) + + def test_amp_lists_5(self): + white_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.white_list) + black_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.black_list) + gray_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.gray_list) + + # 5. w=None, b={'tanh'} + black_list.add('tanh') + gray_list.remove('tanh') + + amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists( + custom_black_list={'tanh'}) + self.assertEqual(amp_lists.white_list, white_list) + self.assertEqual(amp_lists.black_list, black_list) + self.assertEqual(amp_lists.gray_list, gray_list) + + def test_amp_lists_6(self): + white_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.white_list) + black_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.black_list) + gray_list = copy.copy( + fluid.contrib.mixed_precision.fp16_lists.gray_list) + + # 6. w=None, b={'lstm'} + black_list.add('lstm') + + amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists( + custom_black_list={'lstm'}) + self.assertEqual(amp_lists.white_list, white_list) + self.assertEqual(amp_lists.black_list, black_list) + self.assertEqual(amp_lists.gray_list, gray_list) + + def test_amp_lists_7(self): + # 7. w={'lstm'} b={'lstm'} + # raise ValueError + self.assertRaises(ValueError, + fluid.contrib.mixed_precision.AutoMixedPrecisionLists, + {'lstm'}, {'lstm'}) + def test_vgg_cuda(self): with self.scope_prog_guard(): main('vgg', use_cuda=True) -- GitLab
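
The decorator.py hunk above keeps the loss-scale division inside _optimized_guard and, via the new update_role_var_grad pass, retags the cast that produces the FP32 gradient as an optimize-role op, so the gradients allreduced across GPUs are the FP16 ones. Below is a hedged usage sketch of the decorate()/backward()/apply_gradients() flow that exercises this path; the network, program and executor setup are elided, the helper name is made up for illustration, and the optional backward() arguments are assumed to keep their usual None defaults as in the base Optimizer.

import paddle.fluid as fluid

def build_fp16_training(loss):
    # Illustrative helper only; assumes it runs inside a fluid program guard
    # with `loss` already defined by the network.
    optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
    # use_dynamic_loss_scaling now defaults to True (see the API.spec change).
    mp_optimizer = fluid.contrib.mixed_precision.decorate(
        optimizer=optimizer, init_loss_scaling=8.0)
    # backward() rewrites the program for FP16, scales the loss, and returns
    # the unscaled (param, grad) pairs plus the scaled loss, per the
    # decorator.py hunk above.
    scaled_params_grads, scaled_loss = mp_optimizer.backward(loss)
    mp_optimizer.apply_gradients(scaled_params_grads)
    return scaled_loss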
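
The fp16_lists.py change makes a custom white or black entry also drop the op from the gray list, and rejects an op that appears in both custom lists. A minimal sketch of the resulting behaviour, mirroring the new test_amp_lists_* cases (the 'tanh' and 'lstm' choices follow those tests):

import paddle.fluid as fluid

# 'tanh' normally lives in the gray list; naming it in custom_white_list now
# moves it to the white list and removes it from the gray list
# (cf. test_amp_lists_2).
amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
    custom_white_list={'tanh'})
assert 'tanh' in amp_lists.white_list
assert 'tanh' not in amp_lists.gray_list

# Naming the same op in both custom lists is now an error
# (cf. test_amp_lists_7).
try:
    fluid.contrib.mixed_precision.AutoMixedPrecisionLists({'lstm'}, {'lstm'})
except ValueError as e:
    print(e)  # "Custom white list overlap custom black list"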
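
Since dynamic loss scaling is now enabled by default, the policy behind decorate()'s incr/decr parameters is worth spelling out. The sketch below is a simplified pure-Python restatement of the rule implemented by update_loss_scaling in fp16_utils.py, not the framework code itself (the real version is built from fluid ops and updates the scaling and step counters in place).

def next_loss_scaling(scaling, num_good_steps, num_bad_steps, is_finite,
                      incr_every_n_steps=1000, decr_every_n_nan_or_inf=2,
                      incr_ratio=2.0, decr_ratio=0.8):
    # Grow the scale after incr_every_n_steps consecutive finite steps;
    # shrink it after decr_every_n_nan_or_inf steps that produced nan/inf.
    if is_finite:
        num_good_steps, num_bad_steps = num_good_steps + 1, 0
        if num_good_steps == incr_every_n_steps:
            return scaling * incr_ratio, 0, 0
    else:
        num_good_steps, num_bad_steps = 0, num_bad_steps + 1
        if num_bad_steps == decr_every_n_nan_or_inf:
            return scaling * decr_ratio, 0, 0
    return scaling, num_good_steps, num_bad_steps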