Unverified commit 8f83d5d8, authored by pangyoki and committed by GitHub

fix AMP auto_cast and grad_scaler En doc (#28177)

* fix AMP auto_cast and grad_scaler En doc

* fix indentation problem

* change Conv2d to Conv2D
Parent b63e0ccb
@@ -23,13 +23,17 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
If enabled, the input data type (float32 or float16) of each operator is decided
by the autocast algorithm for better performance.
- Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in
+ Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
- custom_white_list(set|list, optional): The custom white_list.
- custom_black_list(set|list, optional): The custom black_list.
+ custom_white_list(set|list, optional): The custom white_list. It's the set of ops that support
+ fp16 calculation and are considered numerically safe and performance-critical. These ops
+ will be converted to fp16.
+ custom_black_list(set|list, optional): The custom black_list. It's the set of ops that support fp16
+ calculation but are considered numerically dangerous, and whose effects may also be
+ observed in downstream ops. These ops will not be converted to fp16.
Examples:
@@ -48,5 +52,15 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
conv = conv2d(data)
print(conv.dtype) # FP32
+ with paddle.amp.auto_cast(custom_black_list={'conv2d'}):
+     conv = conv2d(data)
+     print(conv.dtype) # FP32
+ a = paddle.rand([2,3])
+ b = paddle.rand([2,3])
+ with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}):
+     c = a + b
+     print(c.dtype) # FP16
"""
return amp_guard(enable, custom_white_list, custom_black_list)
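For illustration, here is a short sketch (not part of this commit) that combines both custom lists in a single auto_cast region. It assumes a CUDA device with float16 support, and the expected dtypes follow the white_list/black_list rules documented above.

.. code-block:: python

    import paddle

    conv2d = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
    data = paddle.rand([10, 3, 32, 32])
    a = paddle.rand([2, 3])
    b = paddle.rand([2, 3])

    # 'conv2d' stays in FP32 because it is black-listed here;
    # 'elementwise_add' runs in FP16 because it is white-listed.
    with paddle.amp.auto_cast(custom_white_list={'elementwise_add'},
                              custom_black_list={'conv2d'}):
        conv = conv2d(data)
        c = a + b

    print(conv.dtype)  # expected: FP32
    print(c.dtype)     # expected: FP16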
@@ -19,12 +19,12 @@ __all__ = ['GradScaler']
class GradScaler(AmpScaler):
"""
- GradScaler is used for Auto-Mixed-Precision training/inferring in dynamic graph
- mode. It controls the scaling of loss, helps avoiding numerical overflow.
+ GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode.
+ It controls the scaling of the loss and helps avoid numerical overflow.
The object of this class has two methods: `scale()` and `minimize()`.
`scale()` is used to multiply the loss by a scale ratio.
- `minimize()` is similar as `Optimizer.minimize()`, performs parameters updating.
+ `minimize()` is similar to `optimizer.minimize()` and performs the parameter update.
Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
dynamic graph mode.
@@ -42,24 +42,24 @@ class GradScaler(AmpScaler):
accumulated steps with nan or inf gradients. Default is 2.
use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
Returns:
- An AmpScaler object.
+ A GradScaler object.
Examples:
.. code-block:: python
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast():
    conv = model(data)
    loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.minimize(optimizer, scaled) # update parameters
"""
def __init__(self,
@@ -68,7 +68,7 @@ class GradScaler(AmpScaler):
incr_ratio=2.0,
decr_ratio=0.5,
incr_every_n_steps=1000,
- decr_every_n_nan_or_inf=1,
+ decr_every_n_nan_or_inf=2,
use_dynamic_loss_scaling=True):
super(GradScaler, self).__init__(enable, init_loss_scaling, incr_ratio,
decr_ratio, incr_every_n_steps,
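The constructor arguments above describe a dynamic loss-scaling policy: the scale grows by incr_ratio after incr_every_n_steps consecutive steps with finite gradients, and shrinks by decr_ratio once decr_every_n_nan_or_inf steps with NaN or Inf gradients have accumulated. The plain-Python sketch below illustrates that policy only; it is not GradScaler's actual implementation.

.. code-block:: python

    class LossScalePolicy:
        """Illustration of the dynamic loss-scaling policy, not the real internals."""

        def __init__(self, init_loss_scaling=1024, incr_ratio=2.0, decr_ratio=0.5,
                     incr_every_n_steps=1000, decr_every_n_nan_or_inf=2):
            self.scale = float(init_loss_scaling)
            self.incr_ratio = incr_ratio
            self.decr_ratio = decr_ratio
            self.incr_every_n_steps = incr_every_n_steps
            self.decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
            self.good_steps = 0   # consecutive steps with finite gradients
            self.bad_steps = 0    # accumulated steps with NaN/Inf gradients

        def update(self, found_nan_or_inf):
            if found_nan_or_inf:
                self.good_steps = 0
                self.bad_steps += 1
                if self.bad_steps >= self.decr_every_n_nan_or_inf:
                    self.scale *= self.decr_ratio   # shrink the loss scale
                    self.bad_steps = 0
            else:
                self.good_steps += 1
                if self.good_steps >= self.incr_every_n_steps:
                    self.scale *= self.incr_ratio   # grow the loss scale
                    self.good_steps = 0
            return self.scale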
@@ -88,24 +88,24 @@ class GradScaler(AmpScaler):
Examples:
.. code-block:: python
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast():
    conv = model(data)
    loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.minimize(optimizer, scaled) # update parameters
"""
return super(GradScaler, self).scale(var)
def minimize(self, optimizer, *args, **kwargs):
"""
- This function is similar as `Optimizer.minimize()`, which performs parameters updating.
+ This function is similar to `optimizer.minimize()`, which performs the parameter update.
If the scaled gradients of the parameters contain NaN or Inf, the parameter update is skipped.
Otherwise, it first unscales the scaled gradients of the parameters, then updates the parameters.
@@ -115,22 +115,22 @@ class GradScaler(AmpScaler):
Args:
optimizer(Optimizer): The optimizer used to update parameters.
args: Arguments, which will be forwarded to `optimizer.minimize()`.
- kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`.
+ kwargs: Keyword arguments, which will be forwarded to `optimizer.minimize()`.
Examples:
.. code-block:: python
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast():
    conv = model(data)
    loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.minimize(optimizer, scaled) # update parameters
"""
return super(GradScaler, self).minimize(optimizer, *args, **kwargs)
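As documented above, minimize() skips the update when the scaled gradients contain NaN or Inf, and otherwise unscales the gradients before updating the parameters. The sketch below restates that control flow with plain Python floats and a hypothetical SGD-style update; it is illustrative only and not the library's implementation.

.. code-block:: python

    import math

    def minimize_like(params, scaled_grads, loss_scale, lr=0.01):
        # Skip the whole step if any scaled gradient is NaN or Inf.
        if any(math.isnan(g) or math.isinf(g) for g in scaled_grads):
            return params
        # Otherwise unscale first, then apply an SGD-style update.
        grads = [g / loss_scale for g in scaled_grads]
        return [p - lr * g for p, g in zip(params, grads)]

    print(minimize_like([1.0, 2.0], [float('inf'), 512.0], loss_scale=1024.0))  # [1.0, 2.0] (step skipped)
    print(minimize_like([1.0, 2.0], [1024.0, 512.0], loss_scale=1024.0))        # [0.99, 1.995]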