diff --git a/mindspore/nn/optim/momentum.py b/mindspore/nn/optim/momentum.py
index 6b501232c892cab0040526c9a42e86a03b94de6f..c22cce2c001d0e8541ed6fa9b384a8b6b866178e 100755
--- a/mindspore/nn/optim/momentum.py
+++ b/mindspore/nn/optim/momentum.py
@@ -56,12 +56,12 @@ class Momentum(Optimizer):
     .. math::
             v_{t} = v_{t-1} \ast u + gradients
 
-        If use_nesterov is True:
-        .. math::
+    If use_nesterov is True:
+    .. math::
             p_{t} = p_{t-1} - (grad \ast lr + v_{t} \ast u \ast lr)
 
-        If use_nesterov is Flase:
-        .. math::
+    If use_nesterov is False:
+    .. math::
             p_{t} = p_{t-1} - lr \ast v_{t}
 
     Here: where grad, lr, p, v and u denote the gradients, learning_rate, params, moments, and momentum respectively.
diff --git a/mindspore/nn/optim/sgd.py b/mindspore/nn/optim/sgd.py
index 216f2112f36979e4622bdfbbfab3b5b9b223ac44..bf9b8f559ea98870ec231a741b15e48e4383beaa 100755
--- a/mindspore/nn/optim/sgd.py
+++ b/mindspore/nn/optim/sgd.py
@@ -49,12 +49,12 @@ class SGD(Optimizer):
     .. math::
             v_{t+1} = u \ast v_{t} + gradient \ast (1-dampening)
 
-        If nesterov is True:
-        .. math::
+    If nesterov is True:
+    .. math::
             p_{t+1} = p_{t} - lr \ast (gradient + u \ast v_{t+1})
 
-        If nesterov is Flase:
-        .. math::
+    If nesterov is False:
+    .. math::
             p_{t+1} = p_{t} - lr \ast v_{t+1}
 
     To be noticed, for the first step, v_{t+1} = gradient
diff --git a/mindspore/nn/wrap/cell_wrapper.py b/mindspore/nn/wrap/cell_wrapper.py
index 4e989a56b29c9d7343a949850a542c8b09833cb1..d0073a4929ed258669b52a55d5b5935ec5a5e12e 100644
--- a/mindspore/nn/wrap/cell_wrapper.py
+++ b/mindspore/nn/wrap/cell_wrapper.py
@@ -82,7 +82,7 @@ class WithGradCell(Cell):
 
     Wraps the network with backward cell to compute gradients. A network with a loss function is necessary
     as argument. If loss function in None, the network must be a wrapper of network and loss function. This
-    Cell accepts *inputs as inputs and returns gradients for each trainable parameter.
+    Cell accepts '*inputs' as inputs and returns gradients for each trainable parameter.
 
     Note:
         Run in PyNative mode.
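
For reference, the update rules spelled out in the Momentum and SGD docstrings above can be checked with a small NumPy sketch. The helper names momentum_update and sgd_update are illustrative only and are not part of the MindSpore API; this mirrors the docstring formulas, not the MindSpore implementation.

import numpy as np

def momentum_update(param, grad, moment, lr, u, use_nesterov=False):
    """Illustrative Momentum update, following the docstring formulas.

    v_t = v_{t-1} * u + grad
    nesterov:      p_t = p_{t-1} - (grad * lr + v_t * u * lr)
    non-nesterov:  p_t = p_{t-1} - lr * v_t
    """
    moment = moment * u + grad
    if use_nesterov:
        param = param - (grad * lr + moment * u * lr)
    else:
        param = param - lr * moment
    return param, moment

def sgd_update(param, grad, accum, lr, u, dampening=0.0, nesterov=False, first_step=False):
    """Illustrative SGD update, following the docstring formulas.

    v_{t+1} = u * v_t + grad * (1 - dampening); for the first step, v_{t+1} = grad
    nesterov:      p_{t+1} = p_t - lr * (grad + u * v_{t+1})
    non-nesterov:  p_{t+1} = p_t - lr * v_{t+1}
    """
    accum = grad if first_step else u * accum + grad * (1.0 - dampening)
    if nesterov:
        param = param - lr * (grad + u * accum)
    else:
        param = param - lr * accum
    return param, accum

# Tiny usage example with made-up numbers.
p = np.array([1.0, 2.0])
g = np.array([0.1, 0.2])
v = np.zeros_like(p)
p, v = momentum_update(p, g, v, lr=0.01, u=0.9, use_nesterov=True)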