# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import types
import warnings

import paddle
from paddle.fluid import (
    core,
    default_main_program,
    default_startup_program,
    layers,
    program_guard,
    unique_name,
)

from .amp_nn import check_finite_and_unscale, update_loss_scaling
from .fp16_lists import AutoMixedPrecisionLists
from .fp16_utils import (
    cast_model_to_fp16,
    cast_parameters_to_fp16,
    rewrite_program,
    update_role_var_grad,
)

__all__ = ["decorate"]


class OptimizerWithMixedPrecision:
    """
    Optimizer with mixed-precision (MP) training. This is a wrapper of a common
    optimizer, plus the support of mixed-precision training. An object
    of this class behaves almost the same as the wrapped optimizer, with the
    methods `minimize()`, `backward()`, `apply_gradients()` implemented.
    Additionally, it enables MP training automatically, i.e., the creation
    and maintenance of master parameters, scaling of the loss, etc.

    Args:
        optimizer (Optimizer): A common Optimizer object.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling (float): The initial loss scaling factor.
        use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
        incr_every_n_steps(int): Increases loss scaling every n consecutive
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n
                                      accumulated steps with nan or
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one multiplier to use when decreasing
                           the loss scaling (see the sketch below).
        use_pure_fp16(bool): Whether to use pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value is equal to `use_pure_fp16`.

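    When dynamic loss scaling is enabled, the scaling factor evolves roughly as
    in the following sketch (illustrative only; the real update is performed
    in-graph by the `update_loss_scaling` op, and the function below is not
    part of this class):

        .. code-block:: python

            # Plain-Python approximation of the dynamic loss-scaling policy
            # described by the arguments above.
            def update_scaling(found_inf, scaling, good, bad,
                               incr_every_n_steps, decr_every_n_nan_or_inf,
                               incr_ratio, decr_ratio):
                if found_inf:            # nan/inf gradients in this step
                    good, bad = 0, bad + 1
                    if bad == decr_every_n_nan_or_inf:
                        scaling, bad = scaling * decr_ratio, 0
                else:                    # all gradients are finite
                    good, bad = good + 1, 0
                    if good == incr_every_n_steps:
                        scaling, good = scaling * incr_ratio, 0
                return scaling, good, bad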
    """

    def __init__(
        self,
        optimizer,
        amp_lists,
        init_loss_scaling,
        use_dynamic_loss_scaling,
        incr_every_n_steps,
        decr_every_n_nan_or_inf,
        incr_ratio,
        decr_ratio,
        use_pure_fp16,
        use_fp16_guard,
    ):
        self._optimizer = optimizer
        self._amp_lists = amp_lists
        self._param_grads = None
        self._train_program = None

        self._is_distributed = False
        self._scaled_loss = None
        self._loss_scaling = None
        self._init_loss_scaling = init_loss_scaling
        self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
        self._learning_rate = optimizer._learning_rate
        self._learning_rate_map = optimizer._learning_rate_map
        self._use_pure_fp16 = use_pure_fp16
        self._use_fp16_guard = use_fp16_guard
        self._to_fp16_var_names = None
        if self._use_dynamic_loss_scaling:
            self._incr_every_n_steps = incr_every_n_steps
            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
            self._incr_ratio = incr_ratio
            self._decr_ratio = decr_ratio
            self._num_good_steps = None
            self._num_bad_steps = None

    def _set_distributed(self, flag):
        # If distributed, all cards communicate with each other; overlap
        # communication and computation by splitting the
        # check_finite_and_unscale op.
        self._is_distributed = flag

    def get_loss_scaling(self):
        """Return the real-time loss scaling factor."""
        assert (
            self._loss_scaling is not None
        ), 'Please call minimize() before calling get_loss_scaling().'
        return self._loss_scaling

    def get_scaled_loss(self):
        """Return the scaled loss.
        It is useful when you feed a custom loss into the executor.
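
        A minimal usage sketch (assuming `mp_optimizer` was created by
        `paddle.static.amp.decorate` and `loss` is the loss Variable, as in
        the examples of `decorate` below):

            .. code-block:: python

                # illustrative only; see `decorate` for a complete program
                ops, param_grads = mp_optimizer.minimize(loss)
                scaled_loss = mp_optimizer.get_scaled_loss()      # scaled loss Variable
                loss_scaling = mp_optimizer.get_loss_scaling()    # loss scaling Variable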
        """
        return self._scaled_loss

    def _supports_check_nan_inf(self):
        return getattr(self._optimizer, "_supports_check_nan_inf", False)

    def _init_amp_var(self):
        self._loss_scaling = paddle.static.create_global_var(
            name=unique_name.generate("loss_scaling"),
            shape=[1],
            value=self._init_loss_scaling,
            dtype='float32',
            persistable=True,
        )

        if self._use_dynamic_loss_scaling:
            self._num_good_steps = paddle.static.create_global_var(
                name=unique_name.generate("num_good_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True,
            )
            self._num_bad_steps = paddle.static.create_global_var(
                name=unique_name.generate("num_bad_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True,
            )

        # Ensure the data type of learning rate vars is float32 (same as the
        # master parameter dtype)
        if isinstance(self._optimizer._learning_rate, float):
            self._optimizer._learning_rate_map[
                default_main_program()
            ] = paddle.static.create_global_var(
                name=unique_name.generate("learning_rate"),
                shape=[1],
                value=float(self._optimizer._learning_rate),
                dtype='float32',
                persistable=True,
            )

    def backward(
        self,
        loss,
        startup_program=None,
        parameter_list=None,
        no_grad_set=None,
        callbacks=None,
    ):
        """
        Backward propagation or auto differentiation for gradients' computation.

        Args:
            loss (Variable): The loss Variable to minimize.
            startup_program (Program|None): The startup Program for initializing
                                       parameters in `parameter_list`.
            parameter_list (list|None): A list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.
            callbacks (list|None): A list of callable objects to run when appending
                                   backward operator for one parameter.

        Returns:
            A list of (param, grad) tuples, where each gradient is computed
            from the scaled loss. The scaled loss itself can be fetched via
            `get_scaled_loss()`.
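
        A minimal usage sketch (normally `minimize()` calls this for you; the
        names here are illustrative):

            .. code-block:: python

                scaled_params_grads = mp_optimizer.backward(
                    loss, startup_program=startup_program)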
        """
        train_program = loss.block.program
        self._train_program = train_program

        # NOTE(zhiqiu): _float_status is only used for NPU.
        if core.is_compiled_with_npu():
            float_status = paddle.static.data(
                name="float_status", shape=[8], dtype='float32'
            )
            self._train_program.global_block().append_op(
                type="alloc_float_status",
                outputs={"FloatStatus": float_status},
            )
            self._train_program.global_block().append_op(
                type="clear_float_status",
                inputs={"FloatStatus": float_status},
                outputs={"FloatStatusOut": float_status},
            )
            self._float_status = float_status
        else:
            self._float_status = None

        with program_guard(self._train_program, startup_program):
            self._init_amp_var()

            if self._use_pure_fp16:
                self._to_fp16_var_names = cast_model_to_fp16(
                    self._train_program, self._amp_lists, self._use_fp16_guard
                )
            else:
                rewrite_program(self._train_program, self._amp_lists)

            if loss.dtype != core.VarDesc.VarType.FP32:
                loss = loss.astype('float32')
            # When dynamic loss scaling is disabled and the initial loss scaling
            # is 1.0, scaling the loss is a no-op and can be skipped.
            if self._use_dynamic_loss_scaling or self._init_loss_scaling != 1.0:
                self._scaled_loss = loss * self._loss_scaling
            else:
                self._scaled_loss = loss

            params_grads = self._optimizer.backward(
                self._scaled_loss,
                startup_program,
                parameter_list,
                no_grad_set,
                callbacks,
            )
            if self._supports_check_nan_inf():
                self._add_cast_ops_to_startup_program(startup_program)
        return params_grads

    def _add_cast_ops_to_startup_program(self, startup_program):
        names = list(self._to_fp16_var_names) if self._to_fp16_var_names else []
        names.sort()
        startup_program = (
            default_startup_program()
            if startup_program is None
            else startup_program
        )
        block = startup_program.global_block()
        param_names = [p.name for p in block.all_parameters()]
        for name in names:
            if name not in param_names:
                continue

            tmp = block.create_var(dtype=core.VarDesc.VarType.FP32)
            block.append_op(
                type='assign', inputs={'X': [name]}, outputs={'Out': [tmp]}
            )
            block.append_op(
                type='cast',
                inputs={'X': [tmp]},
                outputs={'Out': [name]},
                attrs={
                    'in_dtype': core.VarDesc.VarType.FP32,
                    'out_dtype': core.VarDesc.VarType.FP16,
                },
            )
        self._to_fp16_var_names = None

    def amp_init(
        self, place, scope=None, test_program=None, use_fp16_test=False
    ):
        """
        Initialize AMP training, e.g., cast fp32 parameters to fp16 type.

        Args:
            place(CUDAPlace): place is used to initialize
                fp16 parameters with fp32 values.
            scope(Scope): The scope is used to find fp32 parameters.
            test_program(Program): The program is used for testing.
            use_fp16_test(bool): Whether to use fp16 testing.

        Examples:
            .. code-block:: python

                import numpy as np
                import paddle
                import paddle.nn.functional as F
                paddle.enable_static()

                def run_example_code():
                    place = paddle.CUDAPlace(0)
                    exe = paddle.static.Executor(place)
                    data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                    conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                    # 1) Use fp16_guard to control the range of fp16 kernels used.
                    with paddle.static.amp.fp16_guard():
                        bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                        pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                        hidden = paddle.static.nn.fc(pool, size=10)
                        loss = paddle.mean(hidden)
                    # 2) Create the optimizer and set `multi_precision` to True.
                    # Setting `multi_precision` to True helps avoid poor accuracy
                    # or slow convergence.
                    optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                    # 3) These ops in `custom_black_list` will be kept in the float32 computation type.
                    amp_list = paddle.static.amp.CustomOpLists(
                        custom_black_list=['pool2d'])
                    # 4) The entry of Paddle AMP.
                    # Enable pure fp16 training by setting `use_pure_fp16` to True.
                    optimizer = paddle.static.amp.decorate(
                        optimizer,
                        amp_list,
                        init_loss_scaling=128.0,
                        use_dynamic_loss_scaling=True,
                        use_pure_fp16=True)
                    # If you don't use the default_startup_program(), you should pass
                    # your defined `startup_program` into `minimize`.
                    optimizer.minimize(loss)
                    exe.run(paddle.static.default_startup_program())
                    # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                    # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                    optimizer.amp_init(place, scope=paddle.static.global_scope())

                if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                    run_example_code()
        """
        assert (
            self._train_program is not None
        ), "Please call the minimize method first."
        if self._use_pure_fp16:
            cast_parameters_to_fp16(
                place, self._train_program, scope, self._to_fp16_var_names
            )
        if test_program is not None:
            if self._use_pure_fp16:
                cast_model_to_fp16(
                    test_program, self._amp_lists, self._use_fp16_guard
                )
            elif use_fp16_test:
                rewrite_program(test_program, self._amp_lists)

    def apply_gradients(self, params_grads):
        """
        Check scaled gradients to determine whether to update loss scaling and update
        parameters by their scaled gradients.

        Args:
            params_grads (list): A list of params and scaled grads.

        Returns:
            A list of optimize operators.
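
        A minimal usage sketch (assuming `mp_optimizer` was created by
        `paddle.static.amp.decorate`; `minimize()` performs these two steps
        internally, with `apply_gradients` wrapped in a `program_guard`):

            .. code-block:: python

                # illustrative only
                scaled_params_grads = mp_optimizer.backward(loss)
                optimize_ops = mp_optimizer.apply_gradients(scaled_params_grads)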
        """

        # Change the op_role_var attr for some ops, so that gradients
        # transferred across GPUs can be FP16.
        update_role_var_grad(self._train_program, params_grads)

        # When dynamic loss scaling is disabled and the initial loss scaling is
        # 1.0, the loss was not scaled, so the gradients can be applied directly.
        if (
            not self._use_dynamic_loss_scaling
            and self._init_loss_scaling == 1.0
        ):
            return self._optimizer.apply_gradients(params_grads)

        if self._supports_check_nan_inf():
            self._optimizer._set_scale(self._loss_scaling)
            optimize_ops = self._optimizer.apply_gradients(params_grads)
            found_inf = self._optimizer._found_inf
            self._add_dynamic_loss_scaling(params_grads, found_inf)
            return optimize_ops

        found_inf = self._check_finite_and_unscale(params_grads)
        if self._use_dynamic_loss_scaling:
            self._add_dynamic_loss_scaling(params_grads, found_inf)

        # Pass found_inf to adam so that it skips the update not only for the
        # parameter but also for the momentum and beta_pow accumulators.
        # With fleet, optimizers are nested and the real optimizer set by the
        # user is the innermost one.
        real_optimizer = self._optimizer
        while hasattr(real_optimizer, "inner_opt"):
            real_optimizer = real_optimizer.inner_opt
        if isinstance(
            real_optimizer,
            (paddle.fluid.optimizer.Adam, paddle.optimizer.AdamW),
        ):
            # NOTE(zhiqiu): Since found_inf needs to be on cpu in adam op, we
            # copy it in advance to avoid multiple time copies.
            with self._train_program._optimized_guard([]):
                found_inf = paddle.tensor.creation._memcpy(
                    found_inf, paddle.CPUPlace()
                )
            real_optimizer._set_auxiliary_var('found_inf', found_inf)
        elif hasattr(real_optimizer, "_set_auxiliary_var"):
            real_optimizer._set_auxiliary_var('found_inf', found_inf)
        optimize_ops = self._optimizer.apply_gradients(params_grads)
        return optimize_ops

    def _split_grads(self, params_grads):
        grads = [g for _, g in params_grads]
        fp32_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP32]
        fp16_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP16]
        assert len(fp32_grads) + len(fp16_grads) == len(
            grads
        ), "Data types of all grads must be either fp16 or fp32."
        return grads, fp32_grads, fp16_grads

    def _check_finite_and_unscale(self, params_grads):
        grads, fp32_grads, fp16_grads = self._split_grads(params_grads)
        found_infs = []

        if self._is_distributed:
            # if distributed, split check_finite_and_unscale to overlap
            # unscale with communication
            if core.is_compiled_with_npu():
                with self._train_program._optimized_guard(grads):
                    _, found_inf = check_finite_and_unscale(
                        grads,
                        self._loss_scaling,
                        name="find_infinite_scale",
                        float_status=self._float_status,
                    )
                    found_infs.append(found_inf)
            else:
                for p, g in params_grads:
                    with self._train_program._optimized_guard([p, g]):
                        _, found_inf = check_finite_and_unscale(
                            [
                                g,
                            ],
                            self._loss_scaling,
                            name="find_infinite_scale",
                            float_status=self._float_status,
                        )
                        found_infs.append(found_inf)
        elif self._use_pure_fp16:
            if fp32_grads:
                with self._train_program._optimized_guard(fp32_grads):
                    _, fp32_found_inf = check_finite_and_unscale(
                        fp32_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp32",
                        float_status=self._float_status,
                    )
                found_infs.append(fp32_found_inf)
            if fp16_grads:
                with self._train_program._optimized_guard(fp16_grads):
                    _, fp16_found_inf = check_finite_and_unscale(
                        fp16_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp16",
                        float_status=self._float_status,
                    )
                found_infs.append(fp16_found_inf)
        else:
            with self._train_program._optimized_guard(grads):
                _, found_inf = check_finite_and_unscale(
                    grads,
                    self._loss_scaling,
                    name="find_infinite_scale",
                    float_status=self._float_status,
                )

        if self._is_distributed or self._use_pure_fp16:
            with self._train_program._optimized_guard([]):
                all_infs = layers.concat(found_infs)
                found_inf = paddle.any(all_infs)

        return found_inf

    def _add_dynamic_loss_scaling(self, params_grads, found_inf):
        if self._supports_check_nan_inf():
            with self._train_program._optimized_guard([]):
                update_loss_scaling(
                    [],
                    found_inf,
                    self._loss_scaling,
                    self._num_good_steps,
                    self._num_bad_steps,
                    self._incr_every_n_steps,
                    self._decr_every_n_nan_or_inf,
                    self._incr_ratio,
                    self._decr_ratio,
                    stop_update=self._optimizer._get_stop_update_var(),
                    name="update_loss_scaling",
                )
            return

        grads, fp32_grads, fp16_grads = self._split_grads(params_grads)
        if self._use_pure_fp16:
            stop_update = False
            with self._train_program._optimized_guard([]):
                if fp32_grads:
                    update_loss_scaling(
                        fp32_grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        stop_update=stop_update,
                        name="update_loss_scaling_fp32",
                    )
                    stop_update = True
                if fp16_grads:
                    update_loss_scaling(
                        fp16_grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        stop_update=stop_update,
                        name="update_loss_scaling_fp16",
                    )
        else:
            with self._train_program._optimized_guard([]):
                update_loss_scaling(
                    grads,
                    found_inf,
                    self._loss_scaling,
                    self._num_good_steps,
                    self._num_bad_steps,
                    self._incr_every_n_steps,
                    self._decr_every_n_nan_or_inf,
                    self._incr_ratio,
                    self._decr_ratio,
                    name="update_loss_scaling",
                )

    def apply_optimize(self, loss, startup_program, params_grads):
        program = loss.block.program
        with program_guard(program, startup_program):
            optimize_ops = self.apply_gradients(params_grads)
        return optimize_ops

    def minimize(
        self, loss, startup_program=None, parameter_list=None, no_grad_set=None
    ):
        """
        Perform optimization by minimizing the given loss.

        Args:
            loss (Variable): The loss Variable.
            startup_program (Program): The startup Program for initializing parameters
                in `parameter_list`.
            parameter_list (list): A list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.

        Returns:
            The list of optimize ops and the list of scaled (param, grad) pairs.
            The scaled loss can be fetched via `get_scaled_loss()`.
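
        Examples:
            A minimal sketch (variable names are those of Example 1 in
            `decorate` below):

            .. code-block:: python

                mp_optimizer = paddle.static.amp.decorate(
                    optimizer=optimizer, init_loss_scaling=8.0)
                ops, scaled_params_grads = mp_optimizer.minimize(loss)
                scaled_loss = mp_optimizer.get_scaled_loss()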
        """

        opt_dict = self._optimizer.__class__.__dict__
        if 'minimize' in opt_dict and isinstance(
            opt_dict['minimize'], types.FunctionType
        ):
            warnings.warn(
                "The decorated optimizer has its own `minimize` method, but it will not be executed."
            )

        scaled_params_grads = self.backward(
            loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set,
        )

        optimize_ops = self.apply_optimize(
            loss, startup_program, scaled_params_grads
        )

        return optimize_ops, scaled_params_grads


def decorate(
    optimizer,
    amp_lists=None,
    init_loss_scaling=2**15,
    incr_every_n_steps=1000,
    decr_every_n_nan_or_inf=2,
    incr_ratio=2.0,
    decr_ratio=0.8,
    use_dynamic_loss_scaling=True,
    use_pure_fp16=False,
    use_fp16_guard=None,
):
    """
    Decorate the given optimizer to adapt it to mixed-precision training.

    Args:
        optimizer(Optimizer): A common Optimizer.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling(float): The initial loss scaling factor.
        incr_every_n_steps(int): Increases loss scaling every n consecutive
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n
                                      accumulated steps with nan or
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one multiplier to use when decreasing
                           the loss scaling.
        use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling.
        use_pure_fp16(bool): Whether to use pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value equals `use_pure_fp16`.

    Returns:
        An optimizer acting like a normal one but with mixed-precision training
        enabled.

    Examples 1:
        .. code-block:: python

            # black&white list based strategy example
            import paddle
            import paddle.static as static

            paddle.enable_static()

            data = static.data(name='X', shape=[None, 1], dtype='float32')
            hidden = static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer = paddle.optimizer.Adam(learning_rate=0.001)

            mp_optimizer = static.amp.decorate(
                    optimizer=optimizer, init_loss_scaling=8.0)

            ops, param_grads = mp_optimizer.minimize(loss)
            scaled_loss = mp_optimizer.get_scaled_loss()

    Examples 2:
        .. code-block:: python

            # pure fp16 training example
            import numpy as np
            import paddle
            import paddle.nn.functional as F

            def run_example_code():
                place = paddle.CUDAPlace(0)
                exe = paddle.static.Executor(place)
                data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                # 1) Use fp16_guard to control the range of fp16 kernels used.
                with paddle.static.amp.fp16_guard():
                    bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                    pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                    hidden = paddle.static.nn.fc(pool, size=10)
                    loss = paddle.mean(hidden)
                # 2) Create the optimizer and set `multi_precision` to True.
                # Setting `multi_precision` to True helps avoid poor accuracy
                # or slow convergence.
                optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                # 3) These ops in `custom_black_list` will be kept in the float32 computation type.
                amp_list = paddle.static.amp.CustomOpLists(
                    custom_black_list=['pool2d'])
                # 4) The entry of Paddle AMP.
                # Enable pure fp16 training by setting `use_pure_fp16` to True.
                optimizer = paddle.static.amp.decorate(
                    optimizer,
                    amp_list,
                    init_loss_scaling=128.0,
                    use_dynamic_loss_scaling=True,
                    use_pure_fp16=True)
                # If you don't use the default_startup_program(), you should pass
                # your defined `startup_program` into `minimize`.
                optimizer.minimize(loss)
                exe.run(paddle.static.default_startup_program())
                # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                optimizer.amp_init(place, scope=paddle.static.global_scope())

            if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                run_example_code()
    """
    if amp_lists is None:
        amp_lists = AutoMixedPrecisionLists()

    if use_fp16_guard is None:
        use_fp16_guard = use_pure_fp16

    mp_optimizer = OptimizerWithMixedPrecision(
        optimizer,
        amp_lists,
        init_loss_scaling,
        use_dynamic_loss_scaling,
        incr_every_n_steps,
        decr_every_n_nan_or_inf,
        incr_ratio,
        decr_ratio,
        use_pure_fp16,
        use_fp16_guard,
    )

    return mp_optimizer