Unverified · Commit 5ddc31f2 authored by Candy2Tang, committed by GitHub

[xdoctest][task 106] Reformat example code with google style in python/paddle/optimizer/adamw.py (#56222)

* [xdoctest][task 106] reformat example code with google style in python/paddle/optimizer/adamw.py

* fix word typo test=docs_preview

* fix word typo test=docs_preview

* fix whitespace test=docs_preview
Parent 66597bac
@@ -41,7 +41,7 @@ class AdamW(Optimizer):
        moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad
-       moemnt\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
+       moment\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad
        learning\_rate & = learning\_rate *
                        \frac{\sqrt{1 - {\beta}_2^t}}{1 - {beta}_1^t}
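For orientation only (not part of this diff): a minimal NumPy sketch of the update the docstring formulas in this hunk describe. The hunk shows the moment and bias-corrected learning-rate terms; the final parameter update below follows the standard decoupled-weight-decay (AdamW) formulation and is an assumption, not a copy of Paddle's kernel.

```python
import numpy as np

def adamw_step(param, grad, m1, m2, t,
               lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.01):
    # moment_1_out and moment_2_out, as in the docstring formulas above.
    m1 = beta1 * m1 + (1 - beta1) * grad
    m2 = beta2 * m2 + (1 - beta2) * grad * grad
    # Bias-corrected learning rate, as in the docstring formula.
    lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
    # Decoupled weight decay plus the Adam step (standard AdamW; assumed here).
    param = param - lr * weight_decay * param - lr_t * m1 / (np.sqrt(m2) + eps)
    return param, m1, m2
```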
@@ -55,7 +55,7 @@ class AdamW(Optimizer):
        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``.
            This parameter is required in dygraph mode. And you can specify different options for
            different parameter groups such as the learning rate, weight decay, etc,
-           then the parameters are list of dict. Note that the learning_rate in paramter groups
+           then the parameters are list of dict. Note that the learning_rate in parameter groups
            represents the scale of base learning_rate.
            The default value is None in static graph mode, at this time all parameters will be updated.
        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
@@ -68,15 +68,15 @@ class AdamW(Optimizer):
            The default value is 1e-08.
        weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
        lr_ratio (function|None, optional): If it is not None,
-           the learning rate will be updated with layerwise learning rate ratio.
+           the learning rate will be updated with layer-wise learning rate ratio.
            Otherwise, the learning rate is the original.
            Default: None.
        apply_decay_param_fun (function|None, optional): If it is not None,
            only tensors that makes apply_decay_param_fun(Tensor.name)==True
            will be updated with weight decay. It only works when we want to specify tensors.
            Default: None.
-       grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
-           some derived class of ``GradientClipBase`` . There are three cliping strategies
+       grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+           some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
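The apply_decay_param_fun contract described in this hunk (a callable that receives a parameter name and returns True for tensors that should be decayed) lends itself to a short sketch. Only the AdamW arguments below come from the docstring; the no_bias_decay policy is a hypothetical example.

```python
import paddle

linear = paddle.nn.Linear(10, 10)

def no_bias_decay(param_name):
    # Hypothetical policy: apply weight decay to weights only, skip bias tensors.
    return "bias" not in param_name

opt = paddle.optimizer.AdamW(
    learning_rate=0.01,
    parameters=linear.parameters(),
    weight_decay=0.01,
    apply_decay_param_fun=no_bias_decay,  # decay only where this returns True
)
```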
@@ -96,48 +96,50 @@ class AdamW(Optimizer):
    Examples:
        .. code-block:: python

-            import paddle
-            linear = paddle.nn.Linear(10, 10)
-            inp = paddle.rand([10,10], dtype="float32")
-            out = linear(inp)
-            loss = paddle.mean(out)
-            beta1 = paddle.to_tensor([0.9], dtype="float32")
-            beta2 = paddle.to_tensor([0.99], dtype="float32")
-            opt = paddle.optimizer.AdamW(learning_rate=0.1,
-                    parameters=linear.parameters(),
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=0.01)
-            loss.backward()
-            opt.step()
-            opt.clear_grad()
-
-            #Note that the learning_rate of linear_2 is 0.01.
-            linear_1 = paddle.nn.Linear(10, 10)
-            linear_2 = paddle.nn.Linear(10, 10)
-            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
-            out = linear_1(inp)
-            out = linear_2(out)
-            loss = paddle.mean(out)
-            opt = paddle.optimizer.AdamW(
-                learning_rate=0.1,
-                parameters=[{
-                    'params': linear_1.parameters()
-                }, {
-                    'params': linear_2.parameters(),
-                    'weight_decay': 0.001,
-                    'learning_rate': 0.1,
-                    'beta1': 0.8
-                }],
-                weight_decay=0.01,
-                beta1=0.9)
-            loss.backward()
-            opt.step()
-            opt.clear_grad()
+            >>> import paddle
+            >>> linear = paddle.nn.Linear(10, 10)
+            >>> inp = paddle.rand([10,10], dtype="float32")
+            >>> out = linear(inp)
+            >>> loss = paddle.mean(out)
+            >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
+            >>> beta2 = paddle.to_tensor([0.99], dtype="float32")
+            >>> opt = paddle.optimizer.AdamW(learning_rate=0.1,
+            ...     parameters=linear.parameters(),
+            ...     beta1=beta1,
+            ...     beta2=beta2,
+            ...     weight_decay=0.01
+            ... )
+            >>> loss.backward()
+            >>> opt.step()
+            >>> opt.clear_grad()
+
+            >>> # Note that the learning_rate of linear_2 is 0.01.
+            >>> linear_1 = paddle.nn.Linear(10, 10)
+            >>> linear_2 = paddle.nn.Linear(10, 10)
+            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            >>> out = linear_1(inp)
+            >>> out = linear_2(out)
+            >>> loss = paddle.mean(out)
+            >>> opt = paddle.optimizer.AdamW(
+            ...     learning_rate=0.1,
+            ...     parameters=[{
+            ...         'params': linear_1.parameters()
+            ...     }, {
+            ...         'params': linear_2.parameters(),
+            ...         'weight_decay': 0.001,
+            ...         'learning_rate': 0.1,
+            ...         'beta1': 0.8
+            ...     }],
+            ...     weight_decay=0.01,
+            ...     beta1=0.9
+            ... )
+            >>> loss.backward()
+            >>> opt.step()
+            >>> opt.clear_grad()
    """
@@ -541,17 +543,17 @@ class AdamW(Optimizer):
        Examples:
            .. code-block:: python

-                import paddle
-                a = paddle.rand([2,13], dtype="float32")
-                linear = paddle.nn.Linear(13, 5)
-                # This can be any optimizer supported by dygraph.
-                opt = paddle.optimizer.AdamW(learning_rate = 0.01,
-                                            parameters = linear.parameters())
-                out = linear(a)
-                out.backward()
-                opt.step()
-                opt.clear_grad()
+                >>> import paddle
+                >>> a = paddle.rand([2,13], dtype="float32")
+                >>> linear = paddle.nn.Linear(13, 5)
+                >>> # This can be any optimizer supported by dygraph.
+                >>> opt = paddle.optimizer.AdamW(learning_rate = 0.01,
+                ...     parameters = linear.parameters())
+                >>> out = linear(a)
+                >>> out.backward()
+                >>> opt.step()
+                >>> opt.clear_grad()
        """
        if paddle.fluid.dygraph.base.in_declarative_mode():
            self._declarative_step()
...
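The `>>> `/`... ` prompts this commit adds turn the docstring examples into executable sessions. As a rough illustration only (Paddle's own checker is xdoctest, per the commit title, and its exact invocation is not shown here), the same prompt format can be exercised with the standard-library doctest module:

```python
import doctest

def demo():
    """
    >>> total = sum([1, 2, 3])
    >>> total
    6
    """

# Run the >>> examples embedded in demo's docstring; silent when they all pass.
doctest.run_docstring_examples(demo, globs={}, verbose=False)
```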