diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
index 1a6725b0759b01f33e3f02ee9f36be8eedeba977..bf3d60ae45c8cf0a1e8353603941287fa298b4f9 100644
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -24,10 +24,10 @@ from .fp16_lists import AutoMixedPrecisionLists
 __all__ = ["decorate"]
 
 
-class OptimizerWithMixedPrecison(object):
+class OptimizerWithMixedPrecision(object):
     """
     Optimizer with mixed-precision (MP) training. This is a wrapper of a common
-    optimizer, plus the support of mixed-precision pretraining. The object
+    optimizer, plus the support of mixed-precision pre-training. The object
     of this class almost has the same behavior as the common optimizer, with the
     methods `minimize()`, `backward()`, `apply_gradients()` implemented.
     Additionally, it enables the MP training automatically, i.e, the creation
@@ -116,7 +116,7 @@ class OptimizerWithMixedPrecison(object):
                  no_grad_set=None,
                  callbacks=None):
         """
-        Backward propogation or auto differentiation for gradients' computation.
+        Backward propagation or auto differentiation for gradients' computation.
 
         Args:
             loss (Variable): The loss Variable to minimize.
@@ -124,7 +124,7 @@ class OptimizerWithMixedPrecison(object):
                                            parameters in `parameter_list`.
             parameter_list (list|None): A list of Variables to update.
             no_grad_set (set|None): A set of Variables should be ignored.
-            callbacks (list|None): A list of callables to run when appending
+            callbacks (list|None): A list of callable objects to run when appending
                                    backward operator for one parameter.
 
         Returns:
@@ -136,6 +136,8 @@ class OptimizerWithMixedPrecison(object):
         self._params_grads = self._optimizer.backward(
             self._scaled_loss, startup_program, parameter_list, no_grad_set,
             callbacks)
+        # Change the op_role_var attr for some ops, so that gradients
+        # transferred across GPUs can be FP16.
         update_role_var_grad(self._train_program, self._params_grads)
         scaled_params_grads = []
         for p, g in self._params_grads:
@@ -257,7 +259,7 @@ def decorate(optimizer,
     """
     if amp_lists is None:
         amp_lists = AutoMixedPrecisionLists()
-    mp_optimizer = OptimizerWithMixedPrecison(
+    mp_optimizer = OptimizerWithMixedPrecision(
         optimizer, amp_lists, init_loss_scaling, use_dynamic_loss_scaling,
         incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
 
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
index 0c7e623d469f3a6d2382029e962fcedce9bd1550..1f301b7148d005d4e3d5d272fd78f78af6dc1e6a 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
@@ -22,7 +22,7 @@ class AutoMixedPrecisionLists(object):
     AutoMixedPrecisionLists is a class for black/white list. It can update
     pre-defined black list and white list according to users' custom black
     white lists. The lists are used for an algorithm which determines op's
-    exectuion mode (fp32 or fp16).
+    execution mode (fp32 or fp16).
 
     Args:
         custom_white_list (set): Users' custom white list.
@@ -95,7 +95,7 @@ black_list = {
 
 # This set contains two types of ops. All ops supported fp16 calculation. One
 # of two types is considered numerically-safe, but may be made unsafe by an
-# updtream blacklist op. Another type do not have numerically-significant
+# upstream blacklist op. Another type do not have numerically-significant
 # effects, like stack, flatten2.
 gray_list = {
     'elementwise_add',
@@ -139,7 +139,7 @@ gray_list = {
 '''
 # The set of ops that don't support fp16 calculation
 unsupported_fp16_list = {
-    # from python/paddle/fluid/layers/io.py 
+    # from python/paddle/fluid/layers/io.py
     'send',
     'send_barrier',
     'recv',
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 1a4eae3e6100ef7084f3c5715da8706f9d156dcb..78f16c39db9daba15a5c66260bf621914a707071 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -16,24 +16,6 @@ from __future__ import print_function
 
 from ... import core
 from ... import layers
-from ... import framework
-
-
-def append_cast_op(i, o, prog):
-    """
-    Append a cast op in a given Program to cast input `i` to data type `o.dtype`.
-
-    Args:
-        i (Variable): The input Variable.
-        o (Variable): The output Variable.
-        prog (Program): The Program to append cast op.
-    """
-    prog.global_block().append_op(
-        type="cast",
-        inputs={"X": i},
-        outputs={"Out": o},
-        attrs={"in_dtype": i.dtype,
-               "out_dtype": o.dtype})
 
 
 def _rename_arg(op, old_name, new_name):
@@ -75,7 +57,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
         op (Operator): The operator to insert cast op.
         idx (int): The index of current operator.
         src_dtype (VarType): The input variable dtype of cast op.
-        desr_dtype (VarType): The output variable dtype of cast op.
+        dest_dtype (VarType): The output variable dtype of cast op.
 
     Returns:
         num_cast_op (int): The number of cast ops that have been inserted.
@@ -261,7 +243,7 @@ def rewrite_program(main_prog, amp_lists):
 def update_role_var_grad(main_prog, params_grads):
     """
     Update op_role_var attr for some ops to make sure the gradients
-    transfered across gpus is FP16.
+    transferred across GPUs is FP16.
     1. Check whether the op that outputs gradient is cast or not.
     2. If op is cast and gradient is FP32, remove the op_role_var
        and find the prev op which outputs FP16 gradient
@@ -293,7 +275,8 @@ def update_role_var_grad(main_prog, params_grads):
                 attr_val.extend(op_for_fp16_grad.attr(op_role_var_attr_name))
                 op_for_fp16_grad._set_attr(op_role_var_attr_name, attr_val)
 
-            # maximize the allreduce overlap
+            # Maximize the all_reduce overlap, and perform the cast
+            # operation after gradients transfer.
             op._set_attr('op_role', OPTIMIZE)
 
 
@@ -303,7 +286,7 @@ def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
     """
     Update loss scaling according to overall gradients. If all gradients is
     finite after incr_every_n_steps, loss scaling will increase by incr_ratio.
-    Otherwisw, loss scaling will decrease by decr_ratio after
+    Otherwise, loss scaling will decrease by decr_ratio after
     decr_every_n_nan_or_inf steps and each step some gradients are infinite.
 
     Args:
diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
index 8e7cee1fb692810dab37ed9375b6cadae41f97e5..93d30d3d74d72fbeccf75c25121abae655934a16 100644
--- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py
+++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py
@@ -23,7 +23,7 @@ from paddle.fluid.optimizer import SGD
 from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
 from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
 from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker
-from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecison
+from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecision
 
 
 class Mode:
@@ -259,7 +259,7 @@ class DistributedOptimizer(object):
 
     def __init__(self, optimizer, strategy=None):
         if not isinstance(optimizer, SGD.__bases__) \
-                and not isinstance(optimizer, OptimizerWithMixedPrecison):
+                and not isinstance(optimizer, OptimizerWithMixedPrecision):
             raise TypeError("optimizer must be an instance of Optimizer")
 
         self._optimizer = optimizer
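
For reviewers who want to exercise the renamed class, here is a minimal usage sketch. It is not part of this patch: the toy network, the custom white-list entry, and the keyword values passed to decorate() are illustrative assumptions; only the module paths, class names, and parameter names visible in the diff above are taken from the source.

# Minimal sketch (not part of this patch); values below are illustrative.
import paddle.fluid as fluid
from paddle.fluid.contrib.mixed_precision.decorator import decorate
from paddle.fluid.contrib.mixed_precision.fp16_lists import AutoMixedPrecisionLists

# Toy network: one FC layer and a mean loss.
data = fluid.layers.data(name='x', shape=[32], dtype='float32')
fc = fluid.layers.fc(input=data, size=10)
loss = fluid.layers.mean(fc)

optimizer = fluid.optimizer.SGD(learning_rate=0.001)

# Promote 'elementwise_add' (a gray-list op in fp16_lists.py) to the white list.
amp_lists = AutoMixedPrecisionLists(custom_white_list={'elementwise_add'})

# decorate() wraps the optimizer in OptimizerWithMixedPrecision; the loss-scaling
# keywords are assumed to have defaults and are set here only for illustration.
mp_optimizer = decorate(
    optimizer=optimizer,
    amp_lists=amp_lists,
    init_loss_scaling=128.0,
    use_dynamic_loss_scaling=True)

# minimize() behaves like the wrapped optimizer's minimize(), with loss
# scaling and fp16 casts inserted automatically.
mp_optimizer.minimize(loss)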
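
The dynamic loss-scaling policy described in the update_loss_scaling docstring can also be summarized in plain Python. The sketch below is a paraphrase of that rule using the parameter names from the docstring; it is not the operator-level implementation in fp16_utils.py, which works on Variables inside the Program.

def _loss_scaling_rule(is_overall_finite, loss_scaling, num_good_steps,
                       num_bad_steps, incr_every_n_steps,
                       decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
    # Paraphrased sketch of the rule, not the in-graph implementation.
    if is_overall_finite:
        num_good_steps += 1
        num_bad_steps = 0
        # After incr_every_n_steps consecutive finite steps, enlarge the scale.
        if num_good_steps == incr_every_n_steps:
            loss_scaling *= incr_ratio
            num_good_steps = 0
    else:
        num_good_steps = 0
        num_bad_steps += 1
        # After decr_every_n_nan_or_inf steps with inf/nan gradients, shrink it.
        if num_bad_steps == decr_every_n_nan_or_inf:
            loss_scaling *= decr_ratio
            num_bad_steps = 0
    return loss_scaling, num_good_steps, num_bad_steps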