diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py
index 63c7d999fde777f9275f3f8f17f05a55d459c4b0..441bc31b93684f94fd1dc36183679f493c03ada0 100644
--- a/python/paddle/amp/auto_cast.py
+++ b/python/paddle/amp/auto_cast.py
@@ -23,13 +23,17 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
     If enabled, the input data type (float32 or float16) of each operator is decided
     by autocast algorithm for better performance.
 
-    Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in
+    Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
     imperative mode.
 
     Args:
         enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
-        custom_white_list(set|list, optional): The custom white_list.
-        custom_black_list(set|list, optional): The custom black_list.
+        custom_white_list(set|list, optional): The custom white_list. It's the set of ops that support
+            fp16 calculation and are considered numerically-safe and performance-critical. These ops
+            will be converted to fp16.
+        custom_black_list(set|list, optional): The custom black_list. The set of ops that support fp16
+            calculation and are considered numerically-dangerous and whose effects may also be
+            observed in downstream ops. These ops will not be converted to fp16.
 
     Examples:
 
@@ -48,5 +52,15 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
             conv = conv2d(data)
             print(conv.dtype) # FP32
 
+        with paddle.amp.auto_cast(custom_black_list={'conv2d'}):
+            conv = conv2d(data)
+            print(conv.dtype) # FP32
+
+        a = paddle.rand([2,3])
+        b = paddle.rand([2,3])
+        with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}):
+            c = a + b
+            print(c.dtype) # FP16
+
     """
     return amp_guard(enable, custom_white_list, custom_black_list)
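For reference, the two custom-list examples added to the auto_cast docstring above, pulled together into one self-contained script. This is a sketch only: it assumes a CUDA-enabled Paddle build, since auto-mixed-precision has no effect on CPU-only installs and every dtype then stays float32.

    import paddle

    conv2d = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
    data = paddle.rand([10, 3, 32, 32])

    # conv2d would normally run in float16 under auto_cast (see the FP16 example
    # in the docstring); black-listing it keeps the op in float32 even inside
    # the guarded region.
    with paddle.amp.auto_cast(custom_black_list={'conv2d'}):
        conv = conv2d(data)
        print(conv.dtype)  # FP32

    # White-listing elementwise_add makes the addition below run in float16,
    # mirroring the second example added to the docstring.
    a = paddle.rand([2, 3])
    b = paddle.rand([2, 3])
    with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}):
        c = a + b
        print(c.dtype)  # FP16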
diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py
index e3cd05dcb30a8db6cd8c9c94e36c3f8776447be1..5ae04042c87cef561142a23196fac3c3e3709871 100644
--- a/python/paddle/amp/grad_scaler.py
+++ b/python/paddle/amp/grad_scaler.py
@@ -19,12 +19,12 @@ __all__ = ['GradScaler']
 
 class GradScaler(AmpScaler):
     """
-    GradScaler is used for Auto-Mixed-Precision training/inferring in dynamic graph
-    mode. It controls the scaling of loss, helps avoiding numerical overflow.
+    GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode.
+    It controls the scaling of the loss and helps avoid numerical overflow.
     The object of this class has two methods `scale()`, `minimize()`.
 
     `scale()` is used to multiply the loss by a scale ratio.
-    `minimize()` is similar as `Optimizer.minimize()`, performs parameters updating.
+    `minimize()` is similar to `optimizer.minimize()` and performs the parameter update.
 
     Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
     dynamic graph mode.
@@ -42,24 +42,24 @@
                                     accumulated steps with nan or inf gradients. Default is 2.
         use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True.
     Returns:
-        An AmpScaler object.
+        A GradScaler object.
 
     Examples:
-
         .. code-block:: python
 
-        import paddle
+            import paddle
 
-        model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
-        optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
-        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
-        data = paddle.rand([10, 3, 32, 32])
-        with paddle.amp.auto_cast():
-            conv = model(data)
-            loss = paddle.mean(conv)
-        scaled = scaler.scale(loss)  # scale the loss
-        scaled.backward()  # do backward
-        scaler.minimize(optimizer, scaled)  # update parameters
+            model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
+            optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
+            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+            data = paddle.rand([10, 3, 32, 32])
+            with paddle.amp.auto_cast():
+                conv = model(data)
+                loss = paddle.mean(conv)
+            scaled = scaler.scale(loss)  # scale the loss
+            scaled.backward()  # do backward
+            scaler.minimize(optimizer, scaled)  # update parameters
     """
 
     def __init__(self,
@@ -68,7 +68,7 @@ class GradScaler(AmpScaler):
                  incr_ratio=2.0,
                  decr_ratio=0.5,
                  incr_every_n_steps=1000,
-                 decr_every_n_nan_or_inf=1,
+                 decr_every_n_nan_or_inf=2,
                  use_dynamic_loss_scaling=True):
         super(GradScaler, self).__init__(enable, init_loss_scaling, incr_ratio,
                                          decr_ratio, incr_every_n_steps,
@@ -88,24 +88,24 @@ class GradScaler(AmpScaler):
 
         Examples:
             .. code-block:: python
 
-            import paddle
-
-            model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
-            optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
-            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
-            data = paddle.rand([10, 3, 32, 32])
-            with paddle.amp.auto_cast():
-                conv = model(data)
-                loss = paddle.mean(conv)
-            scaled = scaler.scale(loss)  # scale the loss
-            scaled.backward()  # do backward
-            scaler.minimize(optimizer, scaled)  # update parameters
+                import paddle
+
+                model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
+                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
+                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+                data = paddle.rand([10, 3, 32, 32])
+                with paddle.amp.auto_cast():
+                    conv = model(data)
+                    loss = paddle.mean(conv)
+                scaled = scaler.scale(loss)  # scale the loss
+                scaled.backward()  # do backward
+                scaler.minimize(optimizer, scaled)  # update parameters
         """
         return super(GradScaler, self).scale(var)
 
     def minimize(self, optimizer, *args, **kwargs):
         """
-        This function is similar as `Optimizer.minimize()`, which performs parameters updating.
+        This function is similar to `optimizer.minimize()`, which performs the parameter update.
 
         If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
         Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters.
@@ -115,22 +115,22 @@ class GradScaler(AmpScaler):
         Args:
             optimizer(Optimizer): The optimizer used to update parameters.
             args: Arguments, which will be forward to `optimizer.minimize()`.
-            kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`.
+            kwargs: Keyword arguments, which will be forward to `optimizer.minimize()`.
 
         Examples:
             .. code-block:: python
 
-            import paddle
-
-            model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
-            optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
-            scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
-            data = paddle.rand([10, 3, 32, 32])
-            with paddle.amp.auto_cast():
-                conv = model(data)
-                loss = paddle.mean(conv)
-            scaled = scaler.scale(loss)  # scale the loss
-            scaled.backward()  # do backward
-            scaler.minimize(optimizer, scaled)  # update parameters
+                import paddle
+
+                model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
+                optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
+                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+                data = paddle.rand([10, 3, 32, 32])
+                with paddle.amp.auto_cast():
+                    conv = model(data)
+                    loss = paddle.mean(conv)
+                scaled = scaler.scale(loss)  # scale the loss
+                scaled.backward()  # do backward
+                scaler.minimize(optimizer, scaled)  # update parameters
         """
         return super(GradScaler, self).minimize(optimizer, *args, **kwargs)
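For context, a sketch of how the GradScaler arguments documented above fit into a small training loop. This is illustrative only and assumes a CUDA-enabled Paddle build (the scaler and auto_cast only take effect on GPU); the constructor arguments simply spell out the defaults from the signature in this patch so the dynamic loss scaling knobs are visible.

    import paddle

    model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
    optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())

    # With use_dynamic_loss_scaling=True the loss scale starts at init_loss_scaling,
    # is multiplied by incr_ratio after incr_every_n_steps consecutive steps with
    # finite gradients, and is multiplied by decr_ratio after decr_every_n_nan_or_inf
    # accumulated steps whose gradients contain NaN/Inf (those steps skip the
    # parameter update).
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024,
                                   incr_ratio=2.0,
                                   decr_ratio=0.5,
                                   incr_every_n_steps=1000,
                                   decr_every_n_nan_or_inf=2,
                                   use_dynamic_loss_scaling=True)

    for step in range(10):
        data = paddle.rand([10, 3, 32, 32])
        with paddle.amp.auto_cast():
            conv = model(data)
            loss = paddle.mean(conv)
        scaled = scaler.scale(loss)          # multiply the loss by the current loss scale
        scaled.backward()                    # backward pass on the scaled loss
        scaler.minimize(optimizer, scaled)   # unscale, check for NaN/Inf, then update
        optimizer.clear_grad()               # gradients accumulate in dygraph, so clear them each step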