Unverified commit be2e3e67, authored by Zhen Wang, committed by GitHub

Fix some typos in AMP. (#21354)

* fix some typos in AMP. test=develop

* delete useless codes. test=develop
Parent: afb13484
@@ -24,10 +24,10 @@ from .fp16_lists import AutoMixedPrecisionLists
 __all__ = ["decorate"]


-class OptimizerWithMixedPrecison(object):
+class OptimizerWithMixedPrecision(object):
     """
     Optimizer with mixed-precision (MP) training. This is a wrapper of a common
-    optimizer, plus the support of mixed-precision pretraining. The object
+    optimizer, plus the support of mixed-precision pre-training. The object
     of this class almost has the same behavior as the common optimizer, with the
     methods `minimize()`, `backward()`, `apply_gradients()` implemented.
     Additionally, it enables the MP training automatically, i.e, the creation
@@ -116,7 +116,7 @@ class OptimizerWithMixedPrecison(object):
                  no_grad_set=None,
                  callbacks=None):
         """
-        Backward propogation or auto differentiation for gradients' computation.
+        Backward propagation or auto differentiation for gradients' computation.

         Args:
             loss (Variable): The loss Variable to minimize.
@@ -124,7 +124,7 @@ class OptimizerWithMixedPrecison(object):
                 parameters in `parameter_list`.
             parameter_list (list|None): A list of Variables to update.
             no_grad_set (set|None): A set of Variables should be ignored.
-            callbacks (list|None): A list of callables to run when appending
+            callbacks (list|None): A list of callable objects to run when appending
                 backward operator for one parameter.

         Returns:
@@ -136,6 +136,8 @@ class OptimizerWithMixedPrecison(object):
         self._params_grads = self._optimizer.backward(
             self._scaled_loss, startup_program, parameter_list, no_grad_set,
             callbacks)
+        # Change the op_role_var attr for some ops, so that gradients
+        # transferred across GPUs can be FP16.
         update_role_var_grad(self._train_program, self._params_grads)
         scaled_params_grads = []
         for p, g in self._params_grads:
@@ -257,7 +259,7 @@ def decorate(optimizer,
     """
     if amp_lists is None:
         amp_lists = AutoMixedPrecisionLists()
-    mp_optimizer = OptimizerWithMixedPrecison(
+    mp_optimizer = OptimizerWithMixedPrecision(
        optimizer, amp_lists, init_loss_scaling, use_dynamic_loss_scaling,
        incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
...
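Note: the `decorate()` entry point at the end of this file wraps an ordinary fluid optimizer into an `OptimizerWithMixedPrecision`. A minimal usage sketch follows, assuming the Paddle Fluid API of the same era as this commit; the toy network, variable names, and hyper-parameter values are illustrative only, while `decorate`, `init_loss_scaling`, and `use_dynamic_loss_scaling` come from the signature shown above.

import paddle.fluid as fluid
from paddle.fluid.contrib.mixed_precision.decorator import decorate

# A toy fp32 network; AMP rewrites eligible ops to fp16 and scales the loss.
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
logits = fluid.layers.fc(input=image, size=10)
loss = fluid.layers.mean(
    fluid.layers.softmax_with_cross_entropy(logits, label))

sgd = fluid.optimizer.SGD(learning_rate=1e-3)
# decorate() returns an OptimizerWithMixedPrecision that behaves like `sgd`
# but builds the fp16 program and loss-scaling logic inside minimize().
mp_sgd = decorate(sgd, init_loss_scaling=128.0, use_dynamic_loss_scaling=True)
mp_sgd.minimize(loss)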
@@ -22,7 +22,7 @@ class AutoMixedPrecisionLists(object):
     AutoMixedPrecisionLists is a class for black/white list. It can update
     pre-defined black list and white list according to users' custom black
     white lists. The lists are used for an algorithm which determines op's
-    exectuion mode (fp32 or fp16).
+    execution mode (fp32 or fp16).

     Args:
         custom_white_list (set): Users' custom white list.
@@ -95,7 +95,7 @@ black_list = {
 # This set contains two types of ops. All ops supported fp16 calculation. One
 # of two types is considered numerically-safe, but may be made unsafe by an
-# updtream blacklist op. Another type do not have numerically-significant
+# upstream blacklist op. Another type do not have numerically-significant
 # effects, like stack, flatten2.
 gray_list = {
     'elementwise_add',
...
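The black/white/gray lists above decide which ops stay in fp32 and which are cast to fp16. A short sketch of overriding them through `AutoMixedPrecisionLists` is given below; `custom_white_list` is documented in the diff, while `custom_black_list` and the concrete op names are assumptions used only for illustration.

import paddle.fluid as fluid
from paddle.fluid.contrib.mixed_precision.fp16_lists import AutoMixedPrecisionLists
from paddle.fluid.contrib.mixed_precision.decorator import decorate

# Add 'pool2d' to the white (fp16) list and push 'elementwise_add' into the
# black (fp32) list; custom_black_list is assumed to mirror custom_white_list.
amp_lists = AutoMixedPrecisionLists(
    custom_white_list={'pool2d'},
    custom_black_list={'elementwise_add'})

sgd = fluid.optimizer.SGD(learning_rate=1e-3)
mp_sgd = decorate(sgd, amp_lists=amp_lists)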
@@ -16,24 +16,6 @@ from __future__ import print_function
 from ... import core
 from ... import layers
-from ... import framework
-
-
-def append_cast_op(i, o, prog):
-    """
-    Append a cast op in a given Program to cast input `i` to data type `o.dtype`.
-
-    Args:
-        i (Variable): The input Variable.
-        o (Variable): The output Variable.
-        prog (Program): The Program to append cast op.
-    """
-    prog.global_block().append_op(
-        type="cast",
-        inputs={"X": i},
-        outputs={"Out": o},
-        attrs={"in_dtype": i.dtype,
-               "out_dtype": o.dtype})


 def _rename_arg(op, old_name, new_name):
@@ -75,7 +57,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
         op (Operator): The operator to insert cast op.
         idx (int): The index of current operator.
         src_dtype (VarType): The input variable dtype of cast op.
-        desr_dtype (VarType): The output variable dtype of cast op.
+        dest_dtype (VarType): The output variable dtype of cast op.

     Returns:
         num_cast_op (int): The number of cast ops that have been inserted.
@@ -261,7 +243,7 @@ def rewrite_program(main_prog, amp_lists):
 def update_role_var_grad(main_prog, params_grads):
     """
     Update op_role_var attr for some ops to make sure the gradients
-    transfered across gpus is FP16.
+    transferred across GPUs is FP16.
     1. Check whether the op that outputs gradient is cast or not.
     2. If op is cast and gradient is FP32, remove the op_role_var
        and find the prev op which outputs FP16 gradient
@@ -293,7 +275,8 @@ def update_role_var_grad(main_prog, params_grads):
             attr_val.extend(op_for_fp16_grad.attr(op_role_var_attr_name))
             op_for_fp16_grad._set_attr(op_role_var_attr_name, attr_val)

-            # maximize the allreduce overlap
+            # Maximize the all_reduce overlap, and perform the cast
+            # operation after gradients transfer.
             op._set_attr('op_role', OPTIMIZE)
@@ -303,7 +286,7 @@ def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
     """
     Update loss scaling according to overall gradients. If all gradients is
     finite after incr_every_n_steps, loss scaling will increase by incr_ratio.
-    Otherwisw, loss scaling will decrease by decr_ratio after
+    Otherwise, loss scaling will decrease by decr_ratio after
     decr_every_n_nan_or_inf steps and each step some gradients are infinite.

     Args:
...
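The docstring of `update_loss_scaling` describes the dynamic loss-scaling policy in words. The sketch below restates that policy as plain eager-mode Python for clarity only; it is not the library implementation (which builds equivalent logic out of fluid ops), and the function name and counter-reset details are assumptions.

def dynamic_loss_scaling_step(is_overall_finite, loss_scaling,
                              num_good_steps, num_bad_steps,
                              incr_every_n_steps, decr_every_n_nan_or_inf,
                              incr_ratio, decr_ratio):
    """Illustrative restatement of the dynamic loss-scaling update rule."""
    if is_overall_finite:
        # All gradients finite: count a good step; after incr_every_n_steps
        # consecutive good steps, grow the scaling factor by incr_ratio.
        num_good_steps += 1
        num_bad_steps = 0
        if num_good_steps == incr_every_n_steps:
            loss_scaling *= incr_ratio
            num_good_steps = 0
    else:
        # Some gradients are inf/nan: count a bad step; after
        # decr_every_n_nan_or_inf such steps, shrink the scaling by decr_ratio.
        num_good_steps = 0
        num_bad_steps += 1
        if num_bad_steps == decr_every_n_nan_or_inf:
            loss_scaling *= decr_ratio
            num_bad_steps = 0
    return loss_scaling, num_good_steps, num_bad_steps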
@@ -23,7 +23,7 @@ from paddle.fluid.optimizer import SGD
 from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
 from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
 from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker
-from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecison
+from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecision


 class Mode:
@@ -259,7 +259,7 @@ class DistributedOptimizer(object):
     def __init__(self, optimizer, strategy=None):
         if not isinstance(optimizer, SGD.__bases__) \
-                and not isinstance(optimizer, OptimizerWithMixedPrecison):
+                and not isinstance(optimizer, OptimizerWithMixedPrecision):
             raise TypeError("optimizer must be an instance of Optimizer")

         self._optimizer = optimizer
...
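`OptimizerWithMixedPrecision` wraps an optimizer but derives from `object`, not from the base `Optimizer` class (see the first hunk of this commit), which is why `DistributedOptimizer.__init__` needs the extra `isinstance` branch above. A small illustrative check, using only names confirmed by this diff:

from paddle.fluid.optimizer import SGD
from paddle.fluid.contrib.mixed_precision.decorator import (
    decorate, OptimizerWithMixedPrecision)

sgd = SGD(learning_rate=1e-3)
mp_sgd = decorate(sgd)

# The AMP wrapper is not an Optimizer subclass, hence the explicit whitelist
# in DistributedOptimizer.__init__.
print(isinstance(mp_sgd, SGD.__bases__))                # False
print(isinstance(mp_sgd, OptimizerWithMixedPrecision))  # True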