# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import types
import warnings

import paddle
from paddle.fluid import (
    core,
    default_main_program,
    default_startup_program,
    layers,
    program_guard,
    unique_name,
)

from .amp_nn import check_finite_and_unscale, update_loss_scaling
from .fp16_lists import AutoMixedPrecisionLists
from .fp16_utils import (
    cast_model_to_fp16,
    cast_parameters_to_fp16,
    rewrite_program,
    update_role_var_grad,
)


class OptimizerWithMixedPrecision:
    """
    Optimizer with mixed-precision (MP) training. This is a wrapper of a common
    optimizer, plus the support of mixed-precision training. An object of this
    class behaves almost the same as the wrapped optimizer, with the methods
    `minimize()`, `backward()` and `apply_gradients()` implemented.
    Additionally, it enables MP training automatically, i.e., the creation
    and maintenance of master parameters, loss scaling, etc.

    Args:
        optimizer (Optimizer): A common Optimizer object.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling (float): The initial loss scaling factor.
        use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
        incr_every_n_steps(int): Increases loss scaling every n consecutive
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n
                                      accumulated steps with nan or
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one multiplier to use when decreasing
                           the loss scaling.
        use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value is equal to `use_pure_fp16`.
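
    A minimal usage sketch is shown below; it mirrors the example of
    `decorate()`, which is the usual way to obtain an instance of this class.

    Examples:
        .. code-block:: python

            # A minimal sketch: instances of this class are normally obtained
            # from paddle.static.amp.decorate() rather than constructed directly.
            import paddle
            import paddle.static as static

            paddle.enable_static()

            data = static.data(name='X', shape=[None, 1], dtype='float32')
            hidden = static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer = paddle.optimizer.Adam(learning_rate=0.001)

            mp_optimizer = static.amp.decorate(
                optimizer=optimizer, init_loss_scaling=8.0)
            ops, param_grads = mp_optimizer.minimize(loss)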

    """

    def __init__(
        self,
        optimizer,
        amp_lists,
        init_loss_scaling,
        use_dynamic_loss_scaling,
        incr_every_n_steps,
        decr_every_n_nan_or_inf,
        incr_ratio,
        decr_ratio,
        use_pure_fp16,
        use_fp16_guard,
    ):
        self._optimizer = optimizer
        self._amp_lists = amp_lists
        self._param_grads = None
        self._train_program = None

        self._is_distributed = False
        self._scaled_loss = None
        self._loss_scaling = None
        self._init_loss_scaling = init_loss_scaling
        self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
        self._learning_rate = optimizer._learning_rate
        self._learning_rate_map = optimizer._learning_rate_map
        self._use_pure_fp16 = use_pure_fp16
        self._use_fp16_guard = use_fp16_guard
        self._to_fp16_var_names = None
        if self._use_dynamic_loss_scaling:
            self._incr_every_n_steps = incr_every_n_steps
            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
            self._incr_ratio = incr_ratio
            self._decr_ratio = decr_ratio
            self._num_good_steps = None
            self._num_bad_steps = None

    def _set_distributed(self, flag):
        # If distributed, all cards communicate with each other; overlap
        # communication and computation by splitting the
        # check_finite_and_unscale op.
        self._is_distributed = flag

    def get_loss_scaling(self):
        """Return the real-time loss scaling factor."""
        assert (
            self._loss_scaling is not None
        ), 'Please call minimize() before calling get_loss_scaling().'
        return self._loss_scaling

    def get_scaled_loss(self):
        """Return the scaled loss.
        It's useful when you feed a customized loss into the executor.
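
        A minimal fetch sketch is shown below; `exe`, `train_program`,
        `feed_data` and `mp_optimizer` are illustrative placeholders created
        elsewhere (see the example of `decorate()`).

        Examples:
            .. code-block:: python

                scaled_loss = mp_optimizer.get_scaled_loss()
                loss_value, = exe.run(train_program,
                                      feed=feed_data,
                                      fetch_list=[scaled_loss])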
        """
        return self._scaled_loss

    def _supports_check_nan_inf(self):
        return getattr(self._optimizer, "_supports_check_nan_inf", False)

    def _init_amp_var(self):
        self._loss_scaling = paddle.static.create_global_var(
            name=unique_name.generate("loss_scaling"),
            shape=[1],
            value=self._init_loss_scaling,
            dtype='float32',
            persistable=True,
        )

        if self._use_dynamic_loss_scaling:
            self._num_good_steps = paddle.static.create_global_var(
                name=unique_name.generate("num_good_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True,
            )
            self._num_bad_steps = paddle.static.create_global_var(
                name=unique_name.generate("num_bad_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True,
            )

        # Ensure the data type of learning rate vars is float32 (same as the
        # master parameter dtype)
        if isinstance(self._optimizer._learning_rate, float):
            self._optimizer._learning_rate_map[
                default_main_program()
            ] = paddle.static.create_global_var(
                name=unique_name.generate("learning_rate"),
                shape=[1],
                value=float(self._optimizer._learning_rate),
                dtype='float32',
                persistable=True,
            )

    def backward(
        self,
        loss,
        startup_program=None,
        parameter_list=None,
        no_grad_set=None,
        callbacks=None,
    ):
        """
        Backward propagation or auto differentiation for gradients' computation.

        Args:
            loss (Variable): The loss Variable to minimize.
            startup_program (Program|None): The startup Program for initializing
                                       parameters in `parameter_list`.
            parameter_list (list|None): A list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.
            callbacks (list|None): A list of callable objects to run when appending
                                   backward operator for one parameter.

        Returns:
            A list of (param, grad) tuples, where each tuple pairs a parameter
            with its scaled gradient. The scaled loss itself can be fetched via
            `get_scaled_loss()`.
        """
        train_program = loss.block.program
        self._train_program = train_program

        # NOTE(zhiqiu): _float_status is only used for NPU.
        if core.is_compiled_with_npu():
            float_status = paddle.static.data(
                name="float_status", shape=[8], dtype='float32'
            )
            self._train_program.global_block().append_op(
                type="alloc_float_status",
                outputs={"FloatStatus": float_status},
            )
            self._train_program.global_block().append_op(
                type="clear_float_status",
                inputs={"FloatStatus": float_status},
                outputs={"FloatStatusOut": float_status},
            )
            self._float_status = float_status
        else:
            self._float_status = None

        with program_guard(self._train_program, startup_program):
            self._init_amp_var()

            if self._use_pure_fp16:
                self._to_fp16_var_names = cast_model_to_fp16(
                    self._train_program, self._amp_lists, self._use_fp16_guard
                )
            else:
                rewrite_program(self._train_program, self._amp_lists)

            if loss.dtype != core.VarDesc.VarType.FP32:
                loss = loss.astype('float32')
            # When not using dynamic loss scaling and the init loss scaling value is 1.0,
            # scaling the loss is a no-op, so the extra multiply can be skipped.
            if self._use_dynamic_loss_scaling or self._init_loss_scaling != 1.0:
                self._scaled_loss = loss * self._loss_scaling
            else:
                self._scaled_loss = loss

            params_grads = self._optimizer.backward(
                self._scaled_loss,
                startup_program,
                parameter_list,
                no_grad_set,
                callbacks,
            )
            if self._supports_check_nan_inf():
                self._add_cast_ops_to_startup_program(startup_program)
        return params_grads

    def _add_cast_ops_to_startup_program(self, startup_program):
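        # The startup program still initializes these parameters as fp32; append
        # assign + cast ops so that every parameter converted by
        # cast_model_to_fp16 holds fp16 values right after initialization.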
        names = list(self._to_fp16_var_names) if self._to_fp16_var_names else []
        names.sort()
        startup_program = (
            default_startup_program()
            if startup_program is None
            else startup_program
        )
        block = startup_program.global_block()
        param_names = [p.name for p in block.all_parameters()]
        for name in names:
            if name not in param_names:
                continue

            tmp = block.create_var(dtype=core.VarDesc.VarType.FP32)
            block.append_op(
                type='assign', inputs={'X': [name]}, outputs={'Out': [tmp]}
            )
            block.append_op(
                type='cast',
                inputs={'X': [tmp]},
                outputs={'Out': [name]},
                attrs={
                    'in_dtype': core.VarDesc.VarType.FP32,
                    'out_dtype': core.VarDesc.VarType.FP16,
                },
            )
        self._to_fp16_var_names = None

    def amp_init(
        self, place, scope=None, test_program=None, use_fp16_test=False
    ):
        """
        Initialize AMP training, e.g. cast fp32 parameters to fp16.

        Args:
            place(CUDAPlace): place is used to initialize
                fp16 parameters with fp32 values.
            scope(Scope): The scope is used to find fp32 parameters.
            test_program(Program): The program is used for testing.
            use_fp16_test(bool): Whether to use fp16 testing.

        Examples:
            .. code-block:: python

                import numpy as np
                import paddle
                import paddle.nn.functional as F
                paddle.enable_static()

                def run_example_code():
                    place = paddle.CUDAPlace(0)
                    exe = paddle.static.Executor(place)
                    data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                    conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                    # 1) Use fp16_guard to control the range of fp16 kernels used.
                    with paddle.static.amp.fp16_guard():
                        bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                        pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                        hidden = paddle.static.nn.fc(pool, size=10)
                        loss = paddle.mean(hidden)
                    # 2) Create the optimizer and set `multi_precision` to True.
                    # Setting `multi_precision` to True can help avoid poor accuracy
                    # or slow convergence.
                    optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                    # 3) These ops in `custom_black_list` will keep in the float32 computation type.
                    amp_list = paddle.static.amp.CustomOpLists(
                        custom_black_list=['pool2d'])
                    # 4) The entry of Paddle AMP.
                    # Enable pure fp16 training by setting `use_pure_fp16` to True.
                    optimizer = paddle.static.amp.decorate(
                        optimizer,
                        amp_list,
                        init_loss_scaling=128.0,
                        use_dynamic_loss_scaling=True,
                        use_pure_fp16=True)
                    # If you don't use the default_startup_program(), you should pass
                    # your defined `startup_program` into `minimize`.
                    optimizer.minimize(loss)
                    exe.run(paddle.static.default_startup_program())
                    # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                    # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                    optimizer.amp_init(place, scope=paddle.static.global_scope())

                if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                    run_example_code()
        """
        assert (
            self._train_program is not None
        ), "Please call the minimize method first."
        if self._use_pure_fp16:
            cast_parameters_to_fp16(
                place, self._train_program, scope, self._to_fp16_var_names
            )
        if test_program is not None:
            if self._use_pure_fp16:
                cast_model_to_fp16(
                    test_program, self._amp_lists, self._use_fp16_guard
                )
            elif use_fp16_test:
                rewrite_program(test_program, self._amp_lists)

    def apply_gradients(self, params_grads):
        """
        Check scaled gradients to determine whether to update loss scaling and update
        parameters by their scaled gradients.

        Args:
            params_grads (list): A list of params and scaled grads.

        Returns:
            A list of optimize operators.
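
        A minimal sketch of the two-step use (instead of `minimize()`) is shown
        below; `mp_optimizer` and `loss` are illustrative placeholders, with
        `mp_optimizer` obtained from `decorate()` and `loss` built in the
        default main program.

        Examples:
            .. code-block:: python

                scaled_params_grads = mp_optimizer.backward(loss)
                optimize_ops = mp_optimizer.apply_gradients(scaled_params_grads)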
        """

        # Change the op_role_var attr for some ops, so that gradients
        # transferred across GPUs can be FP16.
        update_role_var_grad(self._train_program, params_grads)

        # When not using dynamic loss scaling and the init loss scaling value is 1.0,
        # loss scaling is a no-op, so the gradients can be applied directly.
        if (
            not self._use_dynamic_loss_scaling
            and self._init_loss_scaling == 1.0
        ):
            return self._optimizer.apply_gradients(params_grads)

        if self._supports_check_nan_inf():
            self._optimizer._set_scale(self._loss_scaling)
            optimize_ops = self._optimizer.apply_gradients(params_grads)
            found_inf = self._optimizer._found_inf
            self._add_dynamic_loss_scaling(params_grads, found_inf)
            return optimize_ops

        found_inf = self._check_finite_and_unscale(params_grads)
        if self._use_dynamic_loss_scaling:
            self._add_dynamic_loss_scaling(params_grads, found_inf)

        # Pass found_inf to adam so that it skips the update not only for the params,
        # but also for the momentum and beta_pow accumulators.
        # With fleet, optimizers are nested and the real optimizer set by the user
        # is the innermost one.
        real_optimizer = self._optimizer
        while hasattr(real_optimizer, "inner_opt"):
            real_optimizer = real_optimizer.inner_opt
        if isinstance(
            real_optimizer,
            (paddle.fluid.optimizer.Adam, paddle.optimizer.AdamW),
        ):
            # NOTE(zhiqiu): Since found_inf needs to be on cpu in adam op, we
            # copy it in advance to avoid copying it multiple times.
            with self._train_program._optimized_guard([]):
                found_inf = paddle.tensor.creation._memcpy(
                    found_inf, paddle.CPUPlace()
                )
            real_optimizer._set_auxiliary_var('found_inf', found_inf)
        elif hasattr(real_optimizer, "_set_auxiliary_var"):
            real_optimizer._set_auxiliary_var('found_inf', found_inf)
        optimize_ops = self._optimizer.apply_gradients(params_grads)
        return optimize_ops

    def _split_grads(self, params_grads):
        grads = [g for _, g in params_grads]
        fp32_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP32]
        fp16_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP16]
        assert len(fp32_grads) + len(fp16_grads) == len(
            grads
        ), "Data types of all grads must be either fp16 or fp32."
        return grads, fp32_grads, fp16_grads

    def _check_finite_and_unscale(self, params_grads):
        grads, fp32_grads, fp16_grads = self._split_grads(params_grads)
        found_infs = []

        if self._is_distributed:
            # if distributed, split check_finite_and_unscale to overlap
            # unscaling with communication
            if core.is_compiled_with_npu():
                with self._train_program._optimized_guard(grads):
                    _, found_inf = check_finite_and_unscale(
                        grads,
                        self._loss_scaling,
                        name="find_infinite_scale",
                        float_status=self._float_status,
                    )
                    found_infs.append(found_inf)
            else:
                for p, g in params_grads:
                    with self._train_program._optimized_guard([p, g]):
                        _, found_inf = check_finite_and_unscale(
                            [
                                g,
                            ],
                            self._loss_scaling,
                            name="find_infinite_scale",
                            float_status=self._float_status,
                        )
                        found_infs.append(found_inf)
        elif self._use_pure_fp16:
            if fp32_grads:
                with self._train_program._optimized_guard(fp32_grads):
                    _, fp32_found_inf = check_finite_and_unscale(
                        fp32_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp32",
                        float_status=self._float_status,
                    )
                found_infs.append(fp32_found_inf)
            if fp16_grads:
                with self._train_program._optimized_guard(fp16_grads):
                    _, fp16_found_inf = check_finite_and_unscale(
                        fp16_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp16",
                        float_status=self._float_status,
                    )
                found_infs.append(fp16_found_inf)
        else:
            with self._train_program._optimized_guard(grads):
                _, found_inf = check_finite_and_unscale(
                    grads,
                    self._loss_scaling,
                    name="find_infinite_scale",
                    float_status=self._float_status,
                )

        if self._is_distributed or self._use_pure_fp16:
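            # Several check_finite_and_unscale ops may have been created above (one
            # per gradient or per gradient group); fold their found_inf outputs into
            # a single flag before it is used to update the loss scaling.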
            with self._train_program._optimized_guard([]):
                all_infs = layers.concat(found_infs)
                found_inf = paddle.any(all_infs)

        return found_inf

    def _add_dynamic_loss_scaling(self, params_grads, found_inf):
        if self._supports_check_nan_inf():
            with self._train_program._optimized_guard([]):
                update_loss_scaling(
                    [],
                    found_inf,
                    self._loss_scaling,
                    self._num_good_steps,
                    self._num_bad_steps,
                    self._incr_every_n_steps,
                    self._decr_every_n_nan_or_inf,
                    self._incr_ratio,
                    self._decr_ratio,
                    stop_update=self._optimizer._get_stop_update_var(),
                    name="update_loss_scaling",
                )
            return

        grads, fp32_grads, fp16_grads = self._split_grads(params_grads)
        if self._use_pure_fp16:
            stop_update = False
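            # The fp32 and fp16 gradients share one loss-scaling variable, so only
            # the first update_loss_scaling op emitted below updates it; a second
            # op, if any, runs with stop_update=True and leaves it unchanged.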
            with self._train_program._optimized_guard([]):
                if fp32_grads:
                    update_loss_scaling(
                        fp32_grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        stop_update=stop_update,
                        name="update_loss_scaling_fp32",
                    )
                    stop_update = True
                if fp16_grads:
                    update_loss_scaling(
                        fp16_grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        stop_update=stop_update,
                        name="update_loss_scaling_fp16",
                    )
        else:
            with self._train_program._optimized_guard([]):
                update_loss_scaling(
                    grads,
                    found_inf,
                    self._loss_scaling,
                    self._num_good_steps,
                    self._num_bad_steps,
                    self._incr_every_n_steps,
                    self._decr_every_n_nan_or_inf,
                    self._incr_ratio,
                    self._decr_ratio,
                    name="update_loss_scaling",
                )

    def apply_optimize(self, loss, startup_program, params_grads):
        program = loss.block.program
        with program_guard(program, startup_program):
            optimize_ops = self.apply_gradients(params_grads)
        return optimize_ops

    def minimize(
        self, loss, startup_program=None, parameter_list=None, no_grad_set=None
    ):
        """
        Perform optimization by minimizing the given loss.

        Args:
            loss (Variable): The loss Variable.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): A list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.

        Returns:
            The list of optimize ops and the list of scaled (parameter,
            gradient) pairs.
        """

        opt_dict = self._optimizer.__class__.__dict__
        if 'minimize' in opt_dict and isinstance(
            opt_dict['minimize'], types.FunctionType
        ):
            warnings.warn(
                "The decorated optimizer has its own `minimize` method, but it will not be executed."
            )

        scaled_params_grads = self.backward(
            loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set,
        )

        optimize_ops = self.apply_optimize(
            loss, startup_program, scaled_params_grads
        )

        return optimize_ops, scaled_params_grads


def decorate(
    optimizer,
    amp_lists=None,
    init_loss_scaling=2**15,
    incr_every_n_steps=1000,
    decr_every_n_nan_or_inf=2,
    incr_ratio=2.0,
    decr_ratio=0.8,
    use_dynamic_loss_scaling=True,
    use_pure_fp16=False,
    use_fp16_guard=None,
):
    """
    Decorate the given optimizer to adapt it to mixed-precision training.

    Args:
        optimizer(Optimizer): A common Optimizer.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling(float): The initial loss scaling factor.
        incr_every_n_steps(int): Increases loss scaling every n consecutive
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n
                                      accumulated steps with nan or
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one multiplier to use when decreasing
                           the loss scaling.
        use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling.
        use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value is equal to `use_pure_fp16`.

    Returns:
        An optimizer acting like a normal one but with mixed-precision training
        enabled.

    Examples 1:
        .. code-block:: python

            # black&white list based strategy example
            import paddle
            import paddle.static as static

            paddle.enable_static()

            data = static.data(name='X', shape=[None, 1], dtype='float32')
            hidden = static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer = paddle.optimizer.Adam(learning_rate=0.001)

            mp_optimizer = static.amp.decorate(
                    optimizer=optimizer, init_loss_scaling=8.0)

            ops, param_grads = mp_optimizer.minimize(loss)
            scaled_loss = mp_optimizer.get_scaled_loss()

    Examples 2:
        .. code-block:: python

            # pure fp16 training example
            import numpy as np
            import paddle
            import paddle.nn.functional as F

            def run_example_code():
                place = paddle.CUDAPlace(0)
                exe = paddle.static.Executor(place)
                data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                # 1) Use fp16_guard to control the range of fp16 kernels used.
                with paddle.static.amp.fp16_guard():
                    bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                    pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                    hidden = paddle.static.nn.fc(pool, size=10)
                    loss = paddle.mean(hidden)
                # 2) Create the optimizer and set `multi_precision` to True.
                # Setting `multi_precision` to True can help avoid poor accuracy
                # or slow convergence.
                optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                # 3) These ops in `custom_black_list` will keep in the float32 computation type.
                amp_list = paddle.static.amp.CustomOpLists(
                    custom_black_list=['pool2d'])
                # 4) The entry of Paddle AMP.
                # Enable pure fp16 training by setting `use_pure_fp16` to True.
                optimizer = paddle.static.amp.decorate(
                    optimizer,
                    amp_list,
                    init_loss_scaling=128.0,
                    use_dynamic_loss_scaling=True,
                    use_pure_fp16=True)
                # If you don't use the default_startup_program(), you should pass
                # your defined `startup_program` into `minimize`.
                optimizer.minimize(loss)
                exe.run(paddle.static.default_startup_program())
                # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                optimizer.amp_init(place, scope=paddle.static.global_scope())

            if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                run_example_code()
    """
    if amp_lists is None:
        amp_lists = AutoMixedPrecisionLists()

    if use_fp16_guard is None:
        use_fp16_guard = use_pure_fp16

    mp_optimizer = OptimizerWithMixedPrecision(
        optimizer,
        amp_lists,
        init_loss_scaling,
        use_dynamic_loss_scaling,
        incr_every_n_steps,
        decr_every_n_nan_or_inf,
        incr_ratio,
        decr_ratio,
        use_pure_fp16,
        use_fp16_guard,
    )

    return mp_optimizer