Unverified · Commit 5ddc31f2 authored by Candy2Tang, committed by GitHub

[xdoctest][task 106] Reformat example code with google style in python/paddle/optimizer/adamw.py (#56222)

* [xdoctest][task 106] reformat example code with google style in python/paddle/optimizer/adamw.py

* fix word typo test=docs_preview

* fix word typo test=docs_preview

* fix whitespace test=docs_preview
Parent 66597bac
@@ -41,7 +41,7 @@ class AdamW(Optimizer):
        moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad
-       moemnt\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
+       moment\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
        learning\_rate & = learning\_rate *
                        \frac{\sqrt{1 - {\beta}_2^t}}{1 - {beta}_1^t}
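For orientation only (not part of this diff): a minimal NumPy sketch of the update the docstring formulas in this hunk describe. The hunk shows the moment and bias-corrected learning-rate terms; the final parameter update below follows the standard decoupled-weight-decay (AdamW) formulation and is an assumption, not a copy of Paddle's kernel.

```python
import numpy as np

def adamw_step(param, grad, m1, m2, t,
               lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.01):
    # moment_1_out and moment_2_out, as in the docstring formulas above.
    m1 = beta1 * m1 + (1 - beta1) * grad
    m2 = beta2 * m2 + (1 - beta2) * grad * grad
    # Bias-corrected learning rate, as in the docstring formula.
    lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
    # Decoupled weight decay plus the Adam step (standard AdamW; assumed here).
    param = param - lr * weight_decay * param - lr_t * m1 / (np.sqrt(m2) + eps)
    return param, m1, m2
```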
@@ -55,7 +55,7 @@ class AdamW(Optimizer):
        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``.
            This parameter is required in dygraph mode. And you can specify different options for
            different parameter groups such as the learning rate, weight decay, etc,
-           then the parameters are list of dict. Note that the learning_rate in paramter groups
+           then the parameters are list of dict. Note that the learning_rate in parameter groups
            represents the scale of base learning_rate.
            The default value is None in static graph mode, at this time all parameters will be updated.
        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
@@ -68,15 +68,15 @@ class AdamW(Optimizer):
            The default value is 1e-08.
        weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
        lr_ratio (function|None, optional): If it is not None,
-           the learning rate will be updated with layerwise learning rate ratio.
+           the learning rate will be updated with layer-wise learning rate ratio.
            Otherwise, the learning rate is the original.
            Default: None.
        apply_decay_param_fun (function|None, optional): If it is not None,
            only tensors that makes apply_decay_param_fun(Tensor.name)==True
            will be updated with weight decay. It only works when we want to specify tensors.
            Default: None.
-       grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
-           some derived class of ``GradientClipBase`` . There are three cliping strategies
+       grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+           some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
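The apply_decay_param_fun contract described in this hunk (a callable that receives a parameter name and returns True for tensors that should be decayed) lends itself to a short sketch. Only the AdamW arguments below come from the docstring; the no_bias_decay policy is a hypothetical example.

```python
import paddle

linear = paddle.nn.Linear(10, 10)

def no_bias_decay(param_name):
    # Hypothetical policy: apply weight decay to weights only, skip bias tensors.
    return "bias" not in param_name

opt = paddle.optimizer.AdamW(
    learning_rate=0.01,
    parameters=linear.parameters(),
    weight_decay=0.01,
    apply_decay_param_fun=no_bias_decay,  # decay only where this returns True
)
```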
@@ -96,48 +96,50 @@ class AdamW(Optimizer):
    Examples:
        .. code-block:: python

-            import paddle
-            linear = paddle.nn.Linear(10, 10)
-            inp = paddle.rand([10,10], dtype="float32")
-            out = linear(inp)
-            loss = paddle.mean(out)
-            beta1 = paddle.to_tensor([0.9], dtype="float32")
-            beta2 = paddle.to_tensor([0.99], dtype="float32")
-            opt = paddle.optimizer.AdamW(learning_rate=0.1,
-                    parameters=linear.parameters(),
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=0.01)
-            loss.backward()
-            opt.step()
-            opt.clear_grad()
-
-            #Note that the learning_rate of linear_2 is 0.01.
-            linear_1 = paddle.nn.Linear(10, 10)
-            linear_2 = paddle.nn.Linear(10, 10)
-            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
-            out = linear_1(inp)
-            out = linear_2(out)
-            loss = paddle.mean(out)
-            opt = paddle.optimizer.AdamW(
-                learning_rate=0.1,
-                parameters=[{
-                    'params': linear_1.parameters()
-                }, {
-                    'params': linear_2.parameters(),
-                    'weight_decay': 0.001,
-                    'learning_rate': 0.1,
-                    'beta1': 0.8
-                }],
-                weight_decay=0.01,
-                beta1=0.9)
-            loss.backward()
-            opt.step()
-            opt.clear_grad()
+            >>> import paddle
+            >>> linear = paddle.nn.Linear(10, 10)
+            >>> inp = paddle.rand([10,10], dtype="float32")
+            >>> out = linear(inp)
+            >>> loss = paddle.mean(out)
+            >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
+            >>> beta2 = paddle.to_tensor([0.99], dtype="float32")
+            >>> opt = paddle.optimizer.AdamW(learning_rate=0.1,
+            ...     parameters=linear.parameters(),
+            ...     beta1=beta1,
+            ...     beta2=beta2,
+            ...     weight_decay=0.01
+            ... )
+            >>> loss.backward()
+            >>> opt.step()
+            >>> opt.clear_grad()
+
+            >>> # Note that the learning_rate of linear_2 is 0.01.
+            >>> linear_1 = paddle.nn.Linear(10, 10)
+            >>> linear_2 = paddle.nn.Linear(10, 10)
+            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            >>> out = linear_1(inp)
+            >>> out = linear_2(out)
+            >>> loss = paddle.mean(out)
+            >>> opt = paddle.optimizer.AdamW(
+            ...     learning_rate=0.1,
+            ...     parameters=[{
+            ...         'params': linear_1.parameters()
+            ...     }, {
+            ...         'params': linear_2.parameters(),
+            ...         'weight_decay': 0.001,
+            ...         'learning_rate': 0.1,
+            ...         'beta1': 0.8
+            ...     }],
+            ...     weight_decay=0.01,
+            ...     beta1=0.9
+            ... )
+            >>> loss.backward()
+            >>> opt.step()
+            >>> opt.clear_grad()
    """
@@ -541,17 +543,17 @@ class AdamW(Optimizer):
        Examples:
            .. code-block:: python

-                import paddle
-                a = paddle.rand([2,13], dtype="float32")
-                linear = paddle.nn.Linear(13, 5)
-                # This can be any optimizer supported by dygraph.
-                opt = paddle.optimizer.AdamW(learning_rate = 0.01,
-                                            parameters = linear.parameters())
-                out = linear(a)
-                out.backward()
-                opt.step()
-                opt.clear_grad()
+                >>> import paddle
+                >>> a = paddle.rand([2,13], dtype="float32")
+                >>> linear = paddle.nn.Linear(13, 5)
+                >>> # This can be any optimizer supported by dygraph.
+                >>> opt = paddle.optimizer.AdamW(learning_rate = 0.01,
+                ...     parameters = linear.parameters())
+                >>> out = linear(a)
+                >>> out.backward()
+                >>> opt.step()
+                >>> opt.clear_grad()
        """
        if paddle.fluid.dygraph.base.in_declarative_mode():
            self._declarative_step()
...
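The `>>> `/`... ` prompts this commit adds turn the docstring examples into executable sessions. As a rough illustration only (Paddle's own checker is xdoctest, per the commit title, and its exact invocation is not shown here), the same prompt format can be exercised with the standard-library doctest module:

```python
import doctest

def demo():
    """
    >>> total = sum([1, 2, 3])
    >>> total
    6
    """

# Run the >>> examples embedded in demo's docstring; silent when they all pass.
doctest.run_docstring_examples(demo, globs={}, verbose=False)
```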