Commit 8d92b36d authored by W WangXi, committed by gongweibao

Refine document of DGCMomentumOptimizer (#19960)

Refine document of DGCMomentumOptimizer
Parent 5cef7a2f
@@ -995,7 +995,7 @@ paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'lo
 paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.DGCMomentumOptimizer ('paddle.fluid.optimizer.DGCMomentumOptimizer', ('document', 'c0384e036f5c78c569f0e2b266812c0f'))
+paddle.fluid.optimizer.DGCMomentumOptimizer ('paddle.fluid.optimizer.DGCMomentumOptimizer', ('document', 'facdbef1b4871d0cf74c736ff2e94720'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
...
@@ -811,8 +811,7 @@ class MomentumOptimizer(Optimizer):
 class DGCMomentumOptimizer(MomentumOptimizer):
     """
-    DGC (Deep Gradient Compression) Momentum Optimizer.
-    Original paper is https://arxiv.org/abs/1712.01887
+    DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887

     DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\
         only gradients larger than a threshold are transmitted.
@@ -821,7 +820,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
     Eventually, these gradients become large enough to be transmitted.
-    Thus, DGC sends the large gradients immediately but eventually send all of the gradients over time.
+    Thus, DGC sends the large gradients immediately but eventually sends all of the gradients over time.

     To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance.
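As a reading aid for the behaviour described in the hunk above — transmit only the largest gradients, accumulate the rest locally — a rough sketch of the sparse update follows. This code is not part of the commit or of Paddle's DGC implementation; the helper name and the plain NumPy tensors are illustrative assumptions only.

```python
# Illustrative sketch (not Paddle code): DGC-style top-k gradient sparsification
# with local accumulation of the gradients that are not transmitted this step.
import numpy as np

def dgc_sparsify(grad, local_acc, sparsity=0.999):
    """Send the top (1 - sparsity) fraction of |grad + local_acc|; keep the rest locally."""
    acc = local_acc + grad                            # accumulate this step's gradient locally
    k = max(1, int(acc.size * (1.0 - sparsity)))      # number of elements to transmit
    threshold = np.partition(np.abs(acc).ravel(), -k)[-k]
    mask = np.abs(acc) >= threshold                   # "important" gradients above the threshold
    sent = np.where(mask, acc, 0.0)                   # sparse update that would be allreduced
    remaining = np.where(mask, 0.0, acc)              # small gradients deferred to later steps
    return sent, remaining

grad = np.random.randn(16).astype("float32")
sent, local_acc = dgc_sparsify(grad, np.zeros_like(grad), sparsity=0.75)
```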
@@ -832,23 +831,27 @@ class DGCMomentumOptimizer(MomentumOptimizer):
     1. Compress the gradient by get TopK import value from tensor \
         and use it for allreduce to reduce network bandwidth.
-    2. Call momentum to optimize on the cost.
+    2. Call momentum to optimize the cost.

     Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-            Can be a float value or a Variable with one float value as data element.
+        learning_rate (float|Variable): The learning rate used to update parameters. \
+            It can be a float value or a Variable with one float value as a data element.
         momentum (float): Momentum factor.
         rampup_begin_step (int): The beginning step from which gradient compression is implemented.
-        rampup_step (int): How long it use the sparsity periods. Default is 1.
-            for example: If the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 5, \
-            it will use 0.75 at 0 step, and 0.9375 at 1 step, and so on. And when reach sparsity array ends, \
-            it will use 0.999 then and after.
-        sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity).
-        use_nesterov (bool): Enables Nesterov momentum. True means use nesterov.
-        local_grad_clip_norm (float): Clip norm value if needed.
-        num_trainers: The number of training nodes.
-        regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
-        name: An optional name prefix.
+        rampup_step (int): Time steps used in sparsity warm-up periods. Default is 1.
+            For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \
+            it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. \
+            And when reach sparsity array ends, it will use 0.999 then and after.
+        sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). \
+            Default is [0.999]. For example, if the sparsity is [0.99, 0.999], \
+            the top [1%, 0.1%] important element will be transmitted.
+        use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False.
+        local_grad_clip_norm (float, optional): Local gradient clip norm value. Optional, default is None, represent no need clip.
+        num_trainers (int, optional): The number of training nodes. Optional, default is None.
+        regularization (WeightDecayRegularizer, optional): A Regularizer, such as \
+            :ref:`api_fluid_regularizer_L2DecayRegularizer`. Optional, default is None.
+        name (str, optional): This parameter is used by developers to print debugging information. \
+            For details, please refer to :ref:`api_guide_Name`. Default is None.

     Examples:
         .. code-block:: python
...
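The refined `rampup_step`/`sparsity` text above describes a step-indexed warm-up schedule. The sketch below is only one interpretation of that documented wording; the helper name and the behaviour before `rampup_begin_step` are assumptions, not Paddle's actual implementation.

```python
# Sketch of the warm-up schedule as described in the refined docstring
# (illustration of the documented behaviour, not Paddle code).
def sparsity_at_step(step, rampup_begin_step, rampup_step, sparsity):
    if step < rampup_begin_step:
        return 0.0                                   # assumed: no compression before warm-up begins
    progress = step - rampup_begin_step
    steps_per_level = max(1, rampup_step // len(sparsity))   # e.g. 100 // 5 = 20 steps per value
    idx = min(progress // steps_per_level, len(sparsity) - 1)
    return sparsity[idx]

# With sparsity=[0.75, 0.9375, 0.984375, 0.996, 0.999] and rampup_step=100,
# steps 0~19 use 0.75, steps 20~39 use 0.9375, and 0.999 is used thereafter.
print(sparsity_at_step(25, 0, 100, [0.75, 0.9375, 0.984375, 0.996, 0.999]))  # 0.9375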
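The Examples block of the docstring is collapsed in this view. A hypothetical usage sketch, built only from the constructor arguments documented above (the network definition and the argument values are illustrative, not the commit's own example), might look like this:

```python
# Hypothetical usage sketch of DGCMomentumOptimizer (values are illustrative).
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_pred = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.mean(fluid.layers.square_error_cost(input=y_pred, label=y))

optimizer = fluid.optimizer.DGCMomentumOptimizer(
    learning_rate=0.0001,
    momentum=0.9,
    rampup_begin_step=1252,          # start compressing after this many steps
    rampup_step=100,                 # warm-up length for the sparsity schedule
    sparsity=[0.75, 0.9375, 0.984375, 0.996, 0.999])
optimizer.minimize(cost)
```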