diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
index a3d2f61623dccd01f4ec760cc07236dcc20a105f..d37e90b4695d03b5c9caa71c65c8624e558d1065 100644
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -44,7 +44,7 @@ class OptimizerWithMixedPrecision(object):
 
     Args:
         optimizer (Optimizer): A common Optimizer object.
-        amp_lists (AutoMixedPrecisionLists): An AutoMixedPrecisionLists object.
+        amp_lists (CustomOpLists): A CustomOpLists object.
         init_loss_scaling (float): The initial loss scaling factor.
         use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
         incr_every_n_steps(int): Increases loss scaling every n consecutive
@@ -196,12 +196,56 @@ class OptimizerWithMixedPrecision(object):
         Init the amp training, such as cast fp32 parameters to fp16 type.
 
         Args:
-            place(CPUPlace|CUDAPlace): place is used to initialize
+            place(CUDAPlace): place is used to initialize
                 fp16 parameters with fp32 values.
             scope(Scope): The scope is used to find fp32 parameters.
             test_program(Program): The program is used for testing.
             use_fp16_test(bool): Whether to use fp16 testing.
 
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle
+                import paddle.nn.functional as F
+                paddle.enable_static()
+
+                def run_example_code():
+                    place = paddle.CUDAPlace(0)
+                    exe = paddle.static.Executor(place)
+                    data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
+                    conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
+                    # 1) Use fp16_guard to control the range of fp16 kernels used.
+                    with paddle.static.amp.fp16_guard():
+                        bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
+                        pool = F.max_pool2d(bn, kernel_size=2, stride=2)
+                        hidden = paddle.static.nn.fc(pool, size=10)
+                        loss = paddle.mean(hidden)
+                    # 2) Create the optimizer and set `multi_precision` to True.
+                    # Setting `multi_precision` to True can help avoid poor accuracy
+                    # or slow convergence.
+                    optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
+                    # 3) The ops in `custom_black_list` will be kept in the float32 computation type.
+                    amp_list = paddle.static.amp.CustomOpLists(
+                        custom_black_list=['pool2d'])
+                    # 4) The entry point of Paddle AMP.
+                    # Enable pure fp16 training by setting `use_pure_fp16` to True.
+                    optimizer = paddle.static.amp.decorate(
+                        optimizer,
+                        amp_list,
+                        init_loss_scaling=128.0,
+                        use_dynamic_loss_scaling=True,
+                        use_pure_fp16=True)
+                    # If you don't use the default_startup_program(), you should pass
+                    # your defined `startup_program` into `minimize`.
+                    optimizer.minimize(loss)
+                    exe.run(paddle.static.default_startup_program())
+                    # 5) Call `amp_init` after the FP32 parameters have been initialized (such as by `exe.run(startup_program)`).
+                    # If you want to perform the testing process, you should also pass `test_program` into `amp_init`.
+                    optimizer.amp_init(place, scope=paddle.static.global_scope())
+
+                if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
+                    run_example_code()
         """
         assert self._train_program is not None, \
             "Please call the minimize method first."
@@ -383,7 +427,7 @@ def decorate(optimizer,
 
     Args:
         optimizer(Optimizer): A common Optimizer.
-        amp_lists (AutoMixedPrecisionLists): An AutoMixedPrecisionLists object.
+        amp_lists (CustomOpLists): A CustomOpLists object.
         init_loss_scaling(float): The initial loss scaling factor.
         incr_every_n_steps(int): Increases loss scaling every n consecutive
                                  steps with finite gradients.
@@ -403,17 +447,70 @@ def decorate(optimizer,
        An optimizer acting like a normal one but with mixed-precision training
        enabled.
 
-    Examples:
-        .. code-block:: python
+    Examples 1:
+        .. code-block:: python
+
+            # black&white list based strategy example
+            import paddle
+            import paddle.static as static
+
+            paddle.enable_static()
+
+            data = static.data(name='X', shape=[None, 1], dtype='float32')
+            hidden = static.nn.fc(x=data, size=10)
+            loss = paddle.mean(hidden)
+            optimizer = paddle.optimizer.Adam(learning_rate=0.001)
+
+            mp_optimizer = static.amp.decorate(
+                optimizer=optimizer, init_loss_scaling=8.0)
 
-            loss = network()
-            optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-
-            mp_optimizer = fluid.contrib.mixed_precision.decorate(
-                optimizer=optimizer, init_loss_scaling=8.0)
-
             ops, param_grads = mp_optimizer.minimize(loss)
             scaled_loss = mp_optimizer.get_scaled_loss()
+
+    Examples 2:
+        .. code-block:: python
+
+            # pure fp16 training example
+            import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+
+            def run_example_code():
+                place = paddle.CUDAPlace(0)
+                exe = paddle.static.Executor(place)
+                data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
+                conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
+                # 1) Use fp16_guard to control the range of fp16 kernels used.
+                with paddle.static.amp.fp16_guard():
+                    bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
+                    pool = F.max_pool2d(bn, kernel_size=2, stride=2)
+                    hidden = paddle.static.nn.fc(pool, size=10)
+                    loss = paddle.mean(hidden)
+                # 2) Create the optimizer and set `multi_precision` to True.
+                # Setting `multi_precision` to True can help avoid poor accuracy
+                # or slow convergence.
+                optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
+                # 3) The ops in `custom_black_list` will be kept in the float32 computation type.
+                amp_list = paddle.static.amp.CustomOpLists(
+                    custom_black_list=['pool2d'])
+                # 4) The entry point of Paddle AMP.
+                # Enable pure fp16 training by setting `use_pure_fp16` to True.
+                optimizer = paddle.static.amp.decorate(
+                    optimizer,
+                    amp_list,
+                    init_loss_scaling=128.0,
+                    use_dynamic_loss_scaling=True,
+                    use_pure_fp16=True)
+                # If you don't use the default_startup_program(), you should pass
+                # your defined `startup_program` into `minimize`.
+                optimizer.minimize(loss)
+                exe.run(paddle.static.default_startup_program())
+                # 5) Call `amp_init` after the FP32 parameters have been initialized (such as by `exe.run(startup_program)`).
+                # If you want to perform the testing process, you should also pass `test_program` into `amp_init`.
+                optimizer.amp_init(place, scope=paddle.static.global_scope())
+
+            if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
+                run_example_code()
     """
     if amp_lists is None:
         amp_lists = AutoMixedPrecisionLists()
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
index a409595d3ed10b895f4b6dc3ee5a466815a71ad0..1e4286248538636f16f1242afa4a96b1d381dfbd 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py
@@ -14,7 +14,7 @@
 
 import copy
 
-__all__ = ["AutoMixedPrecisionLists"]
+__all__ = ["CustomOpLists", "AutoMixedPrecisionLists"]
 
 
 class AutoMixedPrecisionLists(object):
@@ -27,6 +27,7 @@ class AutoMixedPrecisionLists(object):
     Args:
         custom_white_list (set): Users' custom white list.
         custom_black_list (set): Users' custom black list.
+        custom_black_varnames (set): Users' custom black variables' names.
     """
 
     def __init__(self,
@@ -284,3 +285,5 @@ unsupported_fp16_list = {
     'generate_proposal_labels',
     'generate_mask_labels',
 }
+
+CustomOpLists = AutoMixedPrecisionLists
diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index e02671e219fc936a14bdc56915b628ae93424e4d..f9c3a613c4053a79cb467d752b20f6f4ed3ea4ec 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -282,6 +282,22 @@ def fp16_guard():
     As for the pure fp16 training, if users set `use_fp16_guard` to True,
     only those ops created in the context manager `fp16_guard` will be
     transformed as float16 type.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+            paddle.enable_static()
+            data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
+            conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
+
+            with paddle.static.amp.fp16_guard():
+                bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
+                pool = F.max_pool2d(bn, kernel_size=2, stride=2)
+                hidden = paddle.static.nn.fc(pool, size=10)
+                loss = paddle.mean(hidden)
     """
     with framework.name_scope(prefix=_fp16_guard_pattern):
         yield
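The docstrings above mention passing `test_program` into `amp_init`, but none of the examples show it. Below is a minimal sketch of that flow under stated assumptions: the network code mirrors the docstring examples, the evaluation program is obtained with `Program.clone(for_test=True)` before `minimize`, and the function name `run_amp_init_with_test_program` is purely illustrative.

import paddle
import paddle.nn.functional as F

paddle.enable_static()

def run_amp_init_with_test_program():
    main_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
        conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
        with paddle.static.amp.fp16_guard():
            bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
            pool = F.max_pool2d(bn, kernel_size=2, stride=2)
            hidden = paddle.static.nn.fc(pool, size=10)
            loss = paddle.mean(hidden)

        # Clone the evaluation program before minimize(), as usual in static mode.
        test_program = main_program.clone(for_test=True)

        optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
        optimizer = paddle.static.amp.decorate(
            optimizer,
            init_loss_scaling=128.0,
            use_dynamic_loss_scaling=True,
            use_pure_fp16=True)
        optimizer.minimize(loss, startup_program)

    place = paddle.CUDAPlace(0)
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    # Pass `test_program` so amp_init also prepares the evaluation program;
    # `use_fp16_test=True` requests fp16 evaluation as documented above.
    optimizer.amp_init(
        place,
        scope=paddle.static.global_scope(),
        test_program=test_program,
        use_fp16_test=True)

if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
    run_amp_init_with_test_program()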