diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index c7e550f7aa11752c4f544cc0f4403a20872135bc..3af9457142cf655f2aecbef3df945e1b60204244 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -283,6 +283,8 @@ class AdamW(Optimizer):
         self._auxiliary_vars = {}
         self._already_create_accumulater = set()
 
+        self._create_master_grad_states()
+
     def _set_auxiliary_var(self, key, val):
         self._auxiliary_vars[key] = val
 
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index cc014e7cf1a8fd228be6ca220331f3c36b9adf7d..df20134e923e2f6e7fd994336f6e62fdec8f2895 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -275,6 +275,14 @@ class Optimizer:
         self._auxiliary_vars = {}
         self._already_create_accumulater = set()
 
+        # create master gradients' states
+        self._create_master_grad_states()
+
+    def _create_master_grad_states(self):
+        # master gradients states
+        self._master_grads = {}
+        self._master_grad = False
+
     def _set_auxiliary_var(self, key, val):
         self._auxiliary_vars[key] = val
 
@@ -669,6 +677,25 @@ class Optimizer:
             self._master_weights[param.name] = var
         return var
 
+    def _create_master_grad(self, grad):
+        assert self._is_dtype_fp16_or_bf16(grad.dtype)
+        if grad.name in self._master_grads:
+            var = self._master_grads[grad.name]
+        else:
+            var_name = grad.name + "_fp32_master"
+            var_name = unique_name.generate(var_name)
+            var = grad.block.create_var(
+                name=var_name,
+                shape=grad.shape,
+                value=0,
+                dtype='float32',
+                lod_level=grad.lod_level,
+                persistable=grad.persistable,
+                is_data=grad.is_data,
+            )
+            self._master_grads[grad.name] = var
+        return var
+
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
 
@@ -1168,7 +1195,6 @@ class Optimizer:
         if self._grad_clip is not None:
             params_grads = self._grad_clip(params_grads)
         else:
-
             params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
diff --git a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py
index dca2a4e024c35e7eeadd5f6da93fba87e5ad74e3..06169c668b8e8a0f6a9d7dd7bac7c0de28ea3e05 100644
--- a/python/paddle/static/amp/decorator.py
+++ b/python/paddle/static/amp/decorator.py
@@ -80,6 +80,7 @@ class OptimizerWithMixedPrecision:
             the loss scaling.
         use_amp_guard(bool): Whether to use `fp16_guard` when constructing the program.
            Default None, which means that its value is equal to `use_pure_fp16`.
+        use_master_grad(bool): Whether to use fp32 master gradients during optimization. Default is False.
         use_promote(bool): Whether to promotes to fp32 when op has any float32
            inputs. Default is False.
""" @@ -96,6 +97,7 @@ class OptimizerWithMixedPrecision: incr_ratio, decr_ratio, use_amp_guard=None, + use_master_grad=False, use_promote=False, ): self._optimizer = optimizer @@ -104,6 +106,7 @@ class OptimizerWithMixedPrecision: self._train_program = None self._is_distributed = False + self._use_master_grad = False self._scaled_loss = None self._loss_scaling = None self._init_loss_scaling = init_loss_scaling @@ -122,6 +125,9 @@ class OptimizerWithMixedPrecision: self._learning_rate = optimizer._learning_rate self._learning_rate_map = optimizer._learning_rate_map self._use_pure_fp16 = level == "O2" + if self._use_pure_fp16 and (dtype == "bfloat16" or dtype == "float16"): + self._use_master_grad = use_master_grad + self._optimizer._master_grad = use_master_grad self._amp_level = level self._use_fp16_guard = use_amp_guard self._to_fp16_var_names = None @@ -384,6 +390,51 @@ class OptimizerWithMixedPrecision: use_promote=self.use_promote, ) + def _append_cast_to_master_grad_op(self, param_grads): + """ + Create master gradient vars and add cast gradient to master gradient op in main program + + Args: + param_grads(list(tuple(Tensor, Tensor))): A list of (parameter, gradient) pair to update. + + Returns: + list: A list of (parameter, master_gradient) pair. In the following grad clip step and optimizer step, params can be updated by master gradient. main_prog will also append cast ops before grad clip ops. + + """ + + if not self._use_master_grad: + return param_grads + + global_block = self._train_program.global_block() + target_block = global_block + current_block = self._train_program.current_block() + if current_block.idx != global_block.idx: + target_block = self._train_program.blocks[ + current_block.backward_block_idx + ] + params_master_grads = [] + + assert isinstance(target_block, paddle.fluid.framework.Block) + # create + for p, g in param_grads: + if g.name not in self._optimizer._master_grads.keys(): + if self._optimizer._is_dtype_fp16_or_bf16(g.dtype): + master_g = self._optimizer._create_master_grad(g) + params_master_grads.append((p, master_g)) + target_block.append_op( + type="cast", + inputs={"X": [g]}, + outputs={"Out": [master_g]}, + attrs={ + "in_dtype": g.dtype, + "out_dtype": master_g.dtype, + }, + ) + else: + params_master_grads.append((p, g)) + + return params_master_grads + def apply_gradients(self, params_grads): """ Check scaled gradients to determine whether to update loss scaling and update @@ -400,6 +451,9 @@ class OptimizerWithMixedPrecision: # transferred across GPUs can be FP16. update_role_var_grad(self._train_program, params_grads) + # Create master grad and add cast op into program + params_grads = self._append_cast_to_master_grad_op(params_grads) + # When not using dynamic loss scaling and the init loss scaling value is equal to 1.0, # the model can be optimized. if ( @@ -756,6 +810,7 @@ def decorate( level='O1', dtype='float16', master_weight=None, + master_grad=False, init_loss_scaling=2**15, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, @@ -782,6 +837,9 @@ def decorate( master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. + master_grad(bool, optinal): For level='O2', whether to use master_grad + during weight updating. If master_grad is False, in O2 level optimizer + will not use master grad. Default is False. init_loss_scaling(float, optional): The initial loss scaling factor. Default is 32768. 
        incr_every_n_steps(int, optional): Increases loss scaling every n
@@ -883,6 +941,7 @@ def decorate(
         decr_ratio=decr_ratio,
         use_amp_guard=use_amp_guard,
         use_promote=use_promote,
+        use_master_grad=master_grad,
     )
 
     return mp_optimizer
diff --git a/python/paddle/static/amp/fp16_utils.py b/python/paddle/static/amp/fp16_utils.py
index 342a2618258dfcb3a5328b4afc65c003a386b3bd..1e85770251dc3435ee482598f0b4df3683e878cf 100644
--- a/python/paddle/static/amp/fp16_utils.py
+++ b/python/paddle/static/amp/fp16_utils.py
@@ -478,9 +478,9 @@ def op_need_keep_fp32(op, amp_lists, use_fp16_guard, params_list):
             need_keep_fp32 = True
     for in_name in op.input_names:
         for params in params_list:
-            if op.input(in_name)[0] == params.name:
+            if params.name in op.input(in_name):
                 fp16_varname_list_in_fp32_op = (
-                    fp16_varname_list_in_fp32_op.union(op.input(in_name))
+                    fp16_varname_list_in_fp32_op.union([params.name])
                 )
 
     return need_keep_fp32, fp16_varname_list_in_fp32_op
diff --git a/test/amp/amp_base_models.py b/test/amp/amp_base_models.py
index aabf9e82f7b8471e7d86ab769cb26e8f5d483589..29e8234e9137a9654bedc60e99aaa885c5cee712 100644
--- a/test/amp/amp_base_models.py
+++ b/test/amp/amp_base_models.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import copy
+import struct
 import unittest
 
 import numpy as np
@@ -22,6 +23,34 @@
 from paddle import nn
 from paddle.fluid import core
 from paddle.fluid.framework import _non_static_mode
+
+def copy_bits_from_float_to_uint16(f):
+    return struct.unpack('<I', struct.pack('<f', f))[0] >> 16
+
+
+def convert_float_to_uint16(in_list):
+    if in_list.dtype == np.float32:
+        new_output = []
+        for x in np.nditer(in_list):
+            new_output.append(np.uint16(copy_bits_from_float_to_uint16(x)))
+        new_output = np.reshape(new_output, in_list.shape).view(np.uint16)
+        return new_output
+    else:
+        return in_list
+
+
+def convert_uint16_to_float(in_list):
+    if in_list.dtype == np.uint16:
+        in_list = np.asarray(in_list)
+        out = np.vectorize(
+            lambda x: struct.unpack('<f', struct.pack('<I', x << 16))[0],
+            otypes=[np.float32],
+        )(in_list.flat)
+        return np.reshape(out, in_list.shape)
+    else:
+        return in_list
-    return struct.unpack('<I', struct.pack('<f', f))[0] >> 16
-
-
-def convert_float_to_uint16(in_list):
-    if in_list.dtype == np.float32:
-        new_output = []
-        for x in np.nditer(in_list):
-            new_output.append(np.uint16(copy_bits_from_float_to_uint16(x)))
-        new_output = np.reshape(new_output, in_list.shape).view(np.uint16)
-        return new_output
-    else:
-        return in_list
-
-
-def convert_uint16_to_float(in_list):
-    if in_list.dtype == np.uint16:
-        in_list = np.asarray(in_list)
-        out = np.vectorize(
-            lambda x: struct.unpack('
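
Usage sketch (illustrative only, not part of the patch): with this change applied, fp32 master gradients for an O2 static-graph program would be enabled through the new master_grad argument of paddle.static.amp.decorate. The tiny network, optimizer, and hyperparameters below are made up for demonstration.

    import paddle
    from paddle.static import amp

    paddle.enable_static()

    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name='x', shape=[None, 16], dtype='float32')
        loss = paddle.mean(paddle.static.nn.fc(x, size=8))

        optimizer = paddle.optimizer.AdamW(learning_rate=1e-3)
        # level='O2' with master_grad=True: fp16/bf16 gradients are cast to
        # fp32 master gradients before gradient clipping and the optimizer update.
        optimizer = amp.decorate(
            optimizer,
            level='O2',
            dtype='float16',
            master_grad=True,
        )
        optimizer.minimize(loss)

As usual for pure fp16 training, the startup program would still be run and optimizer.amp_init called to cast the parameters before executing main_prog.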