Unverified commit 5ddc31f2, authored by Candy2Tang, committed by GitHub

[xdoctest][task 106] Reformat example code with google style in python/paddle/optimizer/adamw.py (#56222)

* [xdoctest][task 106]reformat example code with google style in python/paddle/optimizer/adamw.py

* fix word typo test=docs_preview

* fix word typo test=docs_preview

* fix whitespace test=docs_preview
Parent 66597bac
......@@ -41,7 +41,7 @@ class AdamW(Optimizer):
moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad
moemnt\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
moment\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
learning\_rate & = learning\_rate *
\frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t}
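For readers following the formulas above, a minimal NumPy sketch of one AdamW step; the function name adamw_step, the default hyperparameters, and the decoupled weight-decay term (lr * coeff * param) are illustrative assumptions, not Paddle's internal kernel:

    import numpy as np

    def adamw_step(param, grad, m, v, t, lr=0.1, beta1=0.9, beta2=0.99,
                   eps=1e-8, coeff=0.01):
        # moving averages of the gradient and its square (moment_1, moment_2)
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad * grad
        # bias-corrected learning rate, as in the formula above
        lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        # Adam update plus weight decay applied directly to the parameter
        param = param - lr_t * m / (np.sqrt(v) + eps) - lr * coeff * param
        return param, m, v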
......@@ -55,7 +55,7 @@ class AdamW(Optimizer):
parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``.
This parameter is required in dygraph mode. And you can specify different options for
different parameter groups such as the learning rate, weight decay, etc,
then the parameters are list of dict. Note that the learning_rate in paramter groups
then the parameters are list of dict. Note that the learning_rate in parameter groups
represents the scale of base learning_rate.
The default value is None in static graph mode, at this time all parameters will be updated.
beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
......@@ -68,15 +68,15 @@ class AdamW(Optimizer):
The default value is 1e-08.
weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
lr_ratio (function|None, optional): If it is not None,
the learning rate will be updated with layerwise learning rate ratio.
the learning rate will be updated with layer-wise learning rate ratio.
Otherwise, the learning rate is the original.
Default: None.
apply_decay_param_fun (function|None, optional): If it is not None,
only tensors that makes apply_decay_param_fun(Tensor.name)==True
will be updated with weight decay. It only works when we want to specify tensors.
Default: None.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three cliping strategies
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
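A hedged sketch of how the callable and clipping arguments above could be passed together; the two lambdas, the 0.1 ratio, and the clip norm of 1.0 are illustrative choices rather than values from this diff, and lr_ratio may be unsupported on CPU-only builds in some Paddle releases:

    import paddle

    linear = paddle.nn.Linear(10, 10)
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    opt = paddle.optimizer.AdamW(
        learning_rate=0.1,
        parameters=linear.parameters(),
        weight_decay=0.01,
        # decay only this layer's weight tensor, skip its bias
        apply_decay_param_fun=lambda name: name == linear.weight.name,
        # give the weight a 0.1x learning-rate ratio, leave everything else at 1.0
        lr_ratio=lambda param: 0.1 if param.name == linear.weight.name else 1.0,
        grad_clip=clip,
    )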
......@@ -96,48 +96,50 @@ class AdamW(Optimizer):
Examples:
.. code-block:: python
import paddle
linear = paddle.nn.Linear(10, 10)
inp = paddle.rand([10,10], dtype="float32")
out = linear(inp)
loss = paddle.mean(out)
beta1 = paddle.to_tensor([0.9], dtype="float32")
beta2 = paddle.to_tensor([0.99], dtype="float32")
opt = paddle.optimizer.AdamW(learning_rate=0.1,
parameters=linear.parameters(),
beta1=beta1,
beta2=beta2,
weight_decay=0.01)
loss.backward()
opt.step()
opt.clear_grad()
#Note that the learning_rate of linear_2 is 0.01.
linear_1 = paddle.nn.Linear(10, 10)
linear_2 = paddle.nn.Linear(10, 10)
inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
out = linear_1(inp)
out = linear_2(out)
loss = paddle.mean(out)
opt = paddle.optimizer.AdamW(
learning_rate=0.1,
parameters=[{
'params': linear_1.parameters()
}, {
'params': linear_2.parameters(),
'weight_decay': 0.001,
'learning_rate': 0.1,
'beta1': 0.8
}],
weight_decay=0.01,
beta1=0.9)
loss.backward()
opt.step()
opt.clear_grad()
>>> import paddle
>>> linear = paddle.nn.Linear(10, 10)
>>> inp = paddle.rand([10,10], dtype="float32")
>>> out = linear(inp)
>>> loss = paddle.mean(out)
>>> beta1 = paddle.to_tensor([0.9], dtype="float32")
>>> beta2 = paddle.to_tensor([0.99], dtype="float32")
>>> opt = paddle.optimizer.AdamW(learning_rate=0.1,
... parameters=linear.parameters(),
... beta1=beta1,
... beta2=beta2,
... weight_decay=0.01
... )
>>> loss.backward()
>>> opt.step()
>>> opt.clear_grad()
>>> # Note that the learning_rate of linear_2 is 0.01.
>>> linear_1 = paddle.nn.Linear(10, 10)
>>> linear_2 = paddle.nn.Linear(10, 10)
>>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
>>> out = linear_1(inp)
>>> out = linear_2(out)
>>> loss = paddle.mean(out)
>>> opt = paddle.optimizer.AdamW(
... learning_rate=0.1,
... parameters=[{
... 'params': linear_1.parameters()
... }, {
... 'params': linear_2.parameters(),
... 'weight_decay': 0.001,
... 'learning_rate': 0.1,
... 'beta1': 0.8
... }],
... weight_decay=0.01,
... beta1=0.9
... )
>>> loss.backward()
>>> opt.step()
>>> opt.clear_grad()
"""
......@@ -541,17 +543,17 @@ class AdamW(Optimizer):
Examples:
.. code-block:: python
import paddle
a = paddle.rand([2,13], dtype="float32")
linear = paddle.nn.Linear(13, 5)
# This can be any optimizer supported by dygraph.
opt = paddle.optimizer.AdamW(learning_rate = 0.01,
parameters = linear.parameters())
out = linear(a)
out.backward()
opt.step()
opt.clear_grad()
>>> import paddle
>>> a = paddle.rand([2,13], dtype="float32")
>>> linear = paddle.nn.Linear(13, 5)
>>> # This can be any optimizer supported by dygraph.
>>> opt = paddle.optimizer.AdamW(learning_rate = 0.01,
... parameters = linear.parameters())
>>> out = linear(a)
>>> out.backward()
>>> opt.step()
>>> opt.clear_grad()
"""
if paddle.fluid.dygraph.base.in_declarative_mode():
self._declarative_step()
......