提交 d9db94d7 编写于 作者: J Jie Fang 提交者: gongweibao

Optimize amp for multi-gpu to enable FP16 gradients transfer across gpus. (#19714)

Optimize amp for multi-gpu to enable FP16 gradients transfer across gpus
上级 47af618f
...@@ -508,7 +508,7 @@ paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'loca ...@@ -508,7 +508,7 @@ paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'loca
paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a')) paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a'))
paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a')) paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a'))
paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4')) paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4'))
paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, False)), ('document', '5f118631fc8632afb981b3a26daae731')) paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, True)), ('document', '5f118631fc8632afb981b3a26daae731'))
paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists ('paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists', ('document', 'c116ec6bb5d30998792daea8db21ee40')) paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists ('paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists', ('document', 'c116ec6bb5d30998792daea8db21ee40'))
paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists.__init__ (ArgSpec(args=['self', 'custom_white_list', 'custom_black_list'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists.__init__ (ArgSpec(args=['self', 'custom_white_list', 'custom_black_list'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.contrib.fused_elemwise_activation (ArgSpec(args=['x', 'y', 'functor_list', 'axis', 'scale', 'save_intermediate_out'], varargs=None, keywords=None, defaults=(-1, 0.0, True)), ('document', '1c4b247a2858cea8d9d8750693688270')) paddle.fluid.contrib.fused_elemwise_activation (ArgSpec(args=['x', 'y', 'functor_list', 'axis', 'scale', 'save_intermediate_out'], varargs=None, keywords=None, defaults=(-1, 0.0, True)), ('document', '1c4b247a2858cea8d9d8750693688270'))
......
...@@ -18,6 +18,7 @@ from ... import layers ...@@ -18,6 +18,7 @@ from ... import layers
from ... import unique_name from ... import unique_name
from . import fp16_utils from . import fp16_utils
from .fp16_utils import update_loss_scaling, rewrite_program from .fp16_utils import update_loss_scaling, rewrite_program
from .fp16_utils import update_role_var_grad
from .fp16_lists import AutoMixedPrecisionLists from .fp16_lists import AutoMixedPrecisionLists
__all__ = ["decorate"] __all__ = ["decorate"]
...@@ -124,15 +125,17 @@ class OptimizerWithMixedPrecison(object): ...@@ -124,15 +125,17 @@ class OptimizerWithMixedPrecison(object):
""" """
rewrite_program(self._train_program, self._amp_lists) rewrite_program(self._train_program, self._amp_lists)
scaled_loss = loss * self._loss_scaling scaled_loss = loss * self._loss_scaling
self._param_grads = self._optimizer.backward( self._params_grads = self._optimizer.backward(
scaled_loss, startup_program, parameter_list, no_grad_set, scaled_loss, startup_program, parameter_list, no_grad_set,
callbacks) callbacks)
scaled_params_grad = [] update_role_var_grad(self._train_program, self._params_grads)
for p, g in self._param_grads: scaled_params_grads = []
scaled_g = g / self._loss_scaling for p, g in self._params_grads:
scaled_params_grad.append([p, scaled_g]) with self._train_program._optimized_guard([p, g]):
scaled_g = g / self._loss_scaling
scaled_params_grads.append([p, scaled_g])
return scaled_params_grad, scaled_loss return scaled_params_grads, scaled_loss
def apply_gradients(self, scaled_params_grads): def apply_gradients(self, scaled_params_grads):
""" """
...@@ -209,7 +212,7 @@ def decorate(optimizer, ...@@ -209,7 +212,7 @@ def decorate(optimizer,
decr_every_n_nan_or_inf=2, decr_every_n_nan_or_inf=2,
incr_ratio=2.0, incr_ratio=2.0,
decr_ratio=0.8, decr_ratio=0.8,
use_dynamic_loss_scaling=False): use_dynamic_loss_scaling=True):
""" """
Decorate the given optimizer to adapt to the mixed-precision training. Decorate the given optimizer to adapt to the mixed-precision training.
......
...@@ -41,15 +41,24 @@ class AutoMixedPrecisionLists(object): ...@@ -41,15 +41,24 @@ class AutoMixedPrecisionLists(object):
""" """
Update black and white list according to users' custom list. Update black and white list according to users' custom list.
""" """
if self._custom_white_list and self._custom_black_list:
for op_name in self._custom_white_list:
if op_name in self._custom_black_list:
raise ValueError("Custom white list overlap "
"custom black list")
if self._custom_white_list: if self._custom_white_list:
for op_name in self._custom_white_list: for op_name in self._custom_white_list:
if op_name in self.black_list: if op_name in self.black_list:
self.black_list.remove(op_name) self.black_list.remove(op_name)
elif op_name in self.gray_list:
self.gray_list.remove(op_name)
self.white_list.add(op_name) self.white_list.add(op_name)
if self._custom_black_list: if self._custom_black_list:
for op_name in self._custom_black_list: for op_name in self._custom_black_list:
if op_name in self.white_list: if op_name in self.white_list:
self.white_list.remove(op_name) self.white_list.remove(op_name)
elif op_name in self.gray_list:
self.gray_list.remove(op_name)
self.black_list.add(op_name) self.black_list.add(op_name)
......
...@@ -129,12 +129,30 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): ...@@ -129,12 +129,30 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
return num_cast_ops return num_cast_ops
def find_true_prev_op(ops, var_name): def find_true_prev_op(ops, cur_op, var_name):
"""
Find the true prev op that outputs var_name variable.
Args:
ops (list): A list of ops.
cur_op (Operator): Current operator which has var_name variable.
var_name (string): Variable name.
"""
prev_op = []
for op in ops: for op in ops:
if op == cur_op:
break
for out_name in op.output_names: for out_name in op.output_names:
for out_var_name in op.output(out_name): for out_var_name in op.output(out_name):
if out_var_name == var_name: if out_var_name == var_name:
return op prev_op.append(op)
if prev_op:
if not len(prev_op) == 1:
raise ValueError("There must be only one previous op "
"that outputs {0} variable".format(var_name))
else:
return prev_op[0]
return None
def rewrite_program(main_prog, amp_lists): def rewrite_program(main_prog, amp_lists):
...@@ -161,8 +179,7 @@ def rewrite_program(main_prog, amp_lists): ...@@ -161,8 +179,7 @@ def rewrite_program(main_prog, amp_lists):
ops = block.ops ops = block.ops
white_op_set = set() white_op_set = set()
black_op_set = set() black_op_set = set()
for i in range(len(ops)): for op in ops:
op = ops[i]
if op.type in amp_lists.black_list: if op.type in amp_lists.black_list:
black_op_set.add(op) black_op_set.add(op)
elif op.type in amp_lists.white_list: elif op.type in amp_lists.white_list:
...@@ -178,15 +195,17 @@ def rewrite_program(main_prog, amp_lists): ...@@ -178,15 +195,17 @@ def rewrite_program(main_prog, amp_lists):
# this in_var isn't the output of other op # this in_var isn't the output of other op
if in_var.op is None: if in_var.op is None:
continue continue
if in_var.op is op: elif in_var.op is op:
prev_op = find_true_prev_op(ops, in_var_name) prev_op = find_true_prev_op(ops, op, in_var_name)
if prev_op is None:
continue
else: else:
prev_op = in_var.op prev_op = in_var.op
# if it's one of inputs # if it's one of inputs
if prev_op in black_op_set or \ if prev_op in black_op_set or \
prev_op.type in amp_lists.black_list: prev_op.type in amp_lists.black_list:
is_black_op = True is_black_op = True
if prev_op in white_op_set or \ elif prev_op in white_op_set or \
prev_op.type in amp_lists.white_list: prev_op.type in amp_lists.white_list:
is_white_op = True is_white_op = True
if is_black_op: if is_black_op:
...@@ -218,6 +237,45 @@ def rewrite_program(main_prog, amp_lists): ...@@ -218,6 +237,45 @@ def rewrite_program(main_prog, amp_lists):
idx += num_cast_ops + 1 idx += num_cast_ops + 1
def update_role_var_grad(main_prog, params_grads):
    """
    Update op_role_var attr for some ops to make sure the gradients
    transferred across gpus are FP16.

    1. Check whether the op that outputs gradient is cast or not.
    2. If op is cast and gradient is FP32, remove the op_role_var
       and find the prev op which outputs FP16 gradient
    3. Update the op_role_var of the prev op.

    Args:
        main_prog (Program): The main program for training.
        params_grads (list): A list of params and grads.

    Raises:
        ValueError: If a cast op that outputs an FP32 gradient is not in
            the BACKWARD role or has no op_role_var attr, or if no op
            placed before that cast op outputs its FP16 input.
    """
    block = main_prog.global_block()
    BACKWARD = core.op_proto_and_checker_maker.OpRole.Backward
    OPTIMIZE = core.op_proto_and_checker_maker.OpRole.Optimize
    for p, g in params_grads:
        op = g.op
        # Only FP32 gradients produced by a cast op need to be rewired.
        if g.dtype != core.VarDesc.VarType.FP32 or op.type != 'cast':
            continue

        role = op.attr('op_role')
        if not (role & int(BACKWARD) and op.has_attr('op_role_var')):
            raise ValueError("The cast op {0} must be in BACKWARD role "
                             "and have op_role_var attr.".format(op))

        # Resolve the producer of the FP16 gradient *before* mutating any
        # op, so that a failure does not leave the program half-rewritten.
        fp16_grad_name = op.input(op.input_names[0])[0]
        op_for_fp16_grad = find_true_prev_op(block.ops, op, fp16_grad_name)
        if op_for_fp16_grad is None:
            # find_true_prev_op returns None when nothing before `op`
            # outputs the variable; fail with a clear message instead of
            # an AttributeError on None.
            raise ValueError("Cannot find the op that outputs the FP16 "
                             "gradient {0} before the cast op.".format(
                                 fp16_grad_name))

        op.desc.remove_attr("op_role_var")

        op_role_var_attr_name = \
            core.op_proto_and_checker_maker.kOpRoleVarAttrName()
        # Put this (param, fp16 grad) pair first and keep whatever pairs
        # the producing op already carried.
        attr_val = [p.name, fp16_grad_name]
        if op_for_fp16_grad.has_attr(op_role_var_attr_name):
            attr_val.extend(op_for_fp16_grad.attr(op_role_var_attr_name))
        op_for_fp16_grad._set_attr(op_role_var_attr_name, attr_val)

        # maximize the allreduce overlap
        op._set_attr('op_role', OPTIMIZE)
def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps, def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
num_bad_steps, incr_every_n_steps, num_bad_steps, incr_every_n_steps,
decr_every_n_nan_or_inf, incr_ratio, decr_ratio): decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
......
...@@ -22,6 +22,7 @@ import sys ...@@ -22,6 +22,7 @@ import sys
import numpy import numpy
import unittest import unittest
import os import os
import copy
import numpy as np import numpy as np
...@@ -266,6 +267,132 @@ def main(net_type, use_cuda, is_local=True): ...@@ -266,6 +267,132 @@ def main(net_type, use_cuda, is_local=True):
class TestImageClassification(unittest.TestCase): class TestImageClassification(unittest.TestCase):
def test_amp_lists(self):
    # With no custom lists, the three op lists must equal copies of the
    # module-level lists in fp16_lists.
    amp = fluid.contrib.mixed_precision
    expected_white = copy.copy(amp.fp16_lists.white_list)
    expected_black = copy.copy(amp.fp16_lists.black_list)
    expected_gray = copy.copy(amp.fp16_lists.gray_list)

    actual = amp.AutoMixedPrecisionLists()

    self.assertEqual(actual.white_list, expected_white)
    self.assertEqual(actual.black_list, expected_black)
    self.assertEqual(actual.gray_list, expected_gray)
def test_amp_lists_1(self):
    # 1. w={'exp'}, b=None
    # 'exp' is expected to move from the black list to the white list.
    amp = fluid.contrib.mixed_precision
    expected_white = copy.copy(amp.fp16_lists.white_list)
    expected_black = copy.copy(amp.fp16_lists.black_list)
    expected_gray = copy.copy(amp.fp16_lists.gray_list)
    expected_white.add('exp')
    expected_black.remove('exp')

    actual = amp.AutoMixedPrecisionLists({'exp'})

    self.assertEqual(actual.white_list, expected_white)
    self.assertEqual(actual.black_list, expected_black)
    self.assertEqual(actual.gray_list, expected_gray)
def test_amp_lists_2(self):
    # 2. w={'tanh'}, b=None
    # 'tanh' is expected to move from the gray list to the white list.
    amp = fluid.contrib.mixed_precision
    expected_white = copy.copy(amp.fp16_lists.white_list)
    expected_black = copy.copy(amp.fp16_lists.black_list)
    expected_gray = copy.copy(amp.fp16_lists.gray_list)
    expected_white.add('tanh')
    expected_gray.remove('tanh')

    actual = amp.AutoMixedPrecisionLists({'tanh'})

    self.assertEqual(actual.white_list, expected_white)
    self.assertEqual(actual.black_list, expected_black)
    self.assertEqual(actual.gray_list, expected_gray)
def test_amp_lists_3(self):
    # 3. w={'lstm'}, b=None
    # 'lstm' is only added to the white list; the black and gray lists
    # are expected to stay as they are.
    amp = fluid.contrib.mixed_precision
    expected_white = copy.copy(amp.fp16_lists.white_list)
    expected_black = copy.copy(amp.fp16_lists.black_list)
    expected_gray = copy.copy(amp.fp16_lists.gray_list)
    expected_white.add('lstm')

    actual = amp.AutoMixedPrecisionLists({'lstm'})

    self.assertEqual(actual.white_list, expected_white)
    self.assertEqual(actual.black_list, expected_black)
    self.assertEqual(actual.gray_list, expected_gray)
def test_amp_lists_4(self):
    # 4. w=None, b={'conv2d'}
    # 'conv2d' is expected to move from the white list to the black list.
    amp = fluid.contrib.mixed_precision
    expected_white = copy.copy(amp.fp16_lists.white_list)
    expected_black = copy.copy(amp.fp16_lists.black_list)
    expected_gray = copy.copy(amp.fp16_lists.gray_list)
    expected_white.remove('conv2d')
    expected_black.add('conv2d')

    actual = amp.AutoMixedPrecisionLists(custom_black_list={'conv2d'})

    self.assertEqual(actual.white_list, expected_white)
    self.assertEqual(actual.black_list, expected_black)
    self.assertEqual(actual.gray_list, expected_gray)
def test_amp_lists_5(self):
    # 5. w=None, b={'tanh'}
    # 'tanh' is expected to move from the gray list to the black list.
    amp = fluid.contrib.mixed_precision
    expected_white = copy.copy(amp.fp16_lists.white_list)
    expected_black = copy.copy(amp.fp16_lists.black_list)
    expected_gray = copy.copy(amp.fp16_lists.gray_list)
    expected_black.add('tanh')
    expected_gray.remove('tanh')

    actual = amp.AutoMixedPrecisionLists(custom_black_list={'tanh'})

    self.assertEqual(actual.white_list, expected_white)
    self.assertEqual(actual.black_list, expected_black)
    self.assertEqual(actual.gray_list, expected_gray)
def test_amp_lists_6(self):
    # 6. w=None, b={'lstm'}
    # 'lstm' is only added to the black list; the white and gray lists
    # are expected to stay as they are.
    amp = fluid.contrib.mixed_precision
    expected_white = copy.copy(amp.fp16_lists.white_list)
    expected_black = copy.copy(amp.fp16_lists.black_list)
    expected_gray = copy.copy(amp.fp16_lists.gray_list)
    expected_black.add('lstm')

    actual = amp.AutoMixedPrecisionLists(custom_black_list={'lstm'})

    self.assertEqual(actual.white_list, expected_white)
    self.assertEqual(actual.black_list, expected_black)
    self.assertEqual(actual.gray_list, expected_gray)
def test_amp_lists_7(self):
    # 7. w={'lstm'} b={'lstm'}
    # The same op in both custom lists must be rejected with ValueError.
    with self.assertRaises(ValueError):
        fluid.contrib.mixed_precision.AutoMixedPrecisionLists({'lstm'},
                                                              {'lstm'})
def test_vgg_cuda(self): def test_vgg_cuda(self):
with self.scope_prog_guard(): with self.scope_prog_guard():
main('vgg', use_cuda=True) main('vgg', use_cuda=True)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册