# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import types
import warnings

import paddle
from paddle.fluid import (
    core,
    default_main_program,
    default_startup_program,
    program_guard,
    unique_name,
)

from .amp_nn import check_finite_and_unscale, update_loss_scaling
from .fp16_lists import AutoMixedPrecisionLists
from .fp16_utils import (
    cast_model_to_fp16,
    cast_parameters_to_fp16,
    rewrite_program,
    update_role_var_grad,
)


class OptimizerWithMixedPrecision:
    """
    Optimizer with mixed-precision (MP) training. This is a wrapper of a common
    optimizer, plus the support of mixed-precision training. The object
    of this class almost has the same behavior as the common optimizer, with the
    methods `minimize()`, `backward()`, `apply_gradients()` implemented.
    Additionally, it enables the MP training automatically, i.e., the creation
    and maintenance of master parameters, scaling of loss, etc.

    Args:
        optimizer (Optimizer): A common Optimizer object.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling (float): The initial loss scaling factor.
        use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
        incr_every_n_steps(int): Increases loss scaling every n consecutive
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n
                                      accumulated steps with nan or
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one multiplier to use when decreasing
                           the loss scaling.
        use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value is equal to `use_pure_fp16`.
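
    Examples:
        A minimal sketch. The wrapper is normally obtained through
        `paddle.static.amp.decorate` (defined at the end of this module)
        rather than constructed directly; the tiny network below is only a
        placeholder.

        .. code-block:: python

            import paddle
            import paddle.static as static

            paddle.enable_static()

            data = static.data(name='X', shape=[None, 1], dtype='float32')
            loss = paddle.mean(static.nn.fc(x=data, size=10))
            optimizer = paddle.optimizer.Adam(learning_rate=0.001)

            # decorate() returns an OptimizerWithMixedPrecision instance.
            mp_optimizer = static.amp.decorate(
                optimizer=optimizer, init_loss_scaling=8.0)
            optimize_ops, scaled_params_grads = mp_optimizer.minimize(loss)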

    """

    def __init__(
        self,
        optimizer,
        amp_lists,
        init_loss_scaling,
        use_dynamic_loss_scaling,
        incr_every_n_steps,
        decr_every_n_nan_or_inf,
        incr_ratio,
        decr_ratio,
        use_pure_fp16,
        use_fp16_guard,
    ):
        self._optimizer = optimizer
        self._amp_lists = amp_lists
        self._param_grads = None
        self._train_program = None

        self._is_distributed = False
        self._scaled_loss = None
        self._loss_scaling = None
        self._init_loss_scaling = init_loss_scaling
        self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
        self._learning_rate = optimizer._learning_rate
        self._learning_rate_map = optimizer._learning_rate_map
        self._use_pure_fp16 = use_pure_fp16
        self._use_fp16_guard = use_fp16_guard
        self._to_fp16_var_names = None
        if self._use_dynamic_loss_scaling:
            self._incr_every_n_steps = incr_every_n_steps
            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
            self._incr_ratio = incr_ratio
            self._decr_ratio = decr_ratio
            self._num_good_steps = None
            self._num_bad_steps = None

    def _set_distributed(self, flag):
        # If distributed, all cards will communicate with each other;
        # overlap communication and computation by splitting the
        # check_finite_and_unscale op.
        self._is_distributed = flag

    def get_loss_scaling(self):
        """Return the real-time loss scaling factor."""
        assert (
            self._loss_scaling is not None
        ), 'Please call minimize() before calling get_loss_scaling().'
        return self._loss_scaling

    def get_scaled_loss(self):
        """Return the scaled loss.
        It's useful when you feed a customized loss into the executor.
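
        Examples:
            A minimal sketch; `mp_optimizer`, `exe`, and the feed data are
            assumed to come from a setup like the one in `amp_init` below.

            .. code-block:: python

                # Fetch the scaled loss so the fetched value matches what the
                # backward pass actually used.
                scaled_loss = mp_optimizer.get_scaled_loss()
                out, = exe.run(paddle.static.default_main_program(),
                               feed=feed_dict,
                               fetch_list=[scaled_loss])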
        """
        return self._scaled_loss

    def _supports_check_nan_inf(self):
        return getattr(self._optimizer, "_supports_check_nan_inf", False)

    def _init_amp_var(self):
        self._loss_scaling = paddle.static.create_global_var(
            name=unique_name.generate("loss_scaling"),
            shape=[1],
            value=self._init_loss_scaling,
            dtype='float32',
            persistable=True,
        )

        if self._use_dynamic_loss_scaling:
            self._num_good_steps = paddle.static.create_global_var(
                name=unique_name.generate("num_good_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True,
            )
            self._num_bad_steps = paddle.static.create_global_var(
                name=unique_name.generate("num_bad_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True,
            )

        # Ensure the data type of learning rate vars is float32 (same as the
        # master parameter dtype)
        if isinstance(self._optimizer._learning_rate, float):
            self._optimizer._learning_rate_map[
                default_main_program()
            ] = paddle.static.create_global_var(
                name=unique_name.generate("learning_rate"),
                shape=[1],
                value=float(self._optimizer._learning_rate),
                dtype='float32',
                persistable=True,
            )

    def backward(
        self,
        loss,
        startup_program=None,
        parameter_list=None,
        no_grad_set=None,
        callbacks=None,
    ):
        """
        Backward propagation or auto differentiation for gradients' computation.

        Args:
            loss (Variable): The loss Variable to minimize.
            startup_program (Program|None): The startup Program for initializing
                                       parameters in `parameter_list`.
            parameter_list (list|None): A list of Variables to update.
            no_grad_set (set|None): A set of Variables should be ignored.
            callbacks (list|None): A list of callable objects to run when appending
                                   backward operator for one parameter.

        Returns:
            A list of (param, grad) tuples, where each grad is the scaled
            gradient of the corresponding parameter. The scaled loss itself
            can be obtained via `get_scaled_loss()`.
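
        Examples:
            A minimal sketch of calling `backward()` and applying the gradients
            separately instead of using `minimize()`; the program construction
            is assumed to be the same as in the examples of `decorate` below.

            .. code-block:: python

                scaled_params_grads = mp_optimizer.backward(loss)
                # apply_optimize() wraps apply_gradients() in a program_guard
                # over the program that owns `loss`.
                optimize_ops = mp_optimizer.apply_optimize(
                    loss, startup_program=None, params_grads=scaled_params_grads)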
        """
        train_program = loss.block.program
        self._train_program = train_program

        # NOTE(zhiqiu): _float_status is only used for NPU.
        if core.is_compiled_with_npu():
            float_status = paddle.static.data(
                name="float_status", shape=[8], dtype='float32'
            )
            self._train_program.global_block().append_op(
                type="alloc_float_status",
                outputs={"FloatStatus": float_status},
            )
            self._train_program.global_block().append_op(
                type="clear_float_status",
                inputs={"FloatStatus": float_status},
                outputs={"FloatStatusOut": float_status},
            )
            self._float_status = float_status
        else:
            self._float_status = None

        with program_guard(self._train_program, startup_program):
            self._init_amp_var()

            if self._use_pure_fp16:
                self._to_fp16_var_names = cast_model_to_fp16(
                    self._train_program, self._amp_lists, self._use_fp16_guard
                )
            else:
                rewrite_program(self._train_program, self._amp_lists)

            if loss.dtype != core.VarDesc.VarType.FP32:
                loss = loss.astype('float32')
            # When dynamic loss scaling is not used and the initial loss scaling
            # value equals 1.0, the loss does not need to be scaled, so the
            # multiplication below can be skipped.
            if self._use_dynamic_loss_scaling or self._init_loss_scaling != 1.0:
                self._scaled_loss = loss * self._loss_scaling
            else:
                self._scaled_loss = loss

            params_grads = self._optimizer.backward(
                self._scaled_loss,
                startup_program,
                parameter_list,
                no_grad_set,
                callbacks,
            )
            if self._supports_check_nan_inf():
                self._add_cast_ops_to_startup_program(startup_program)
        return params_grads

    def _add_cast_ops_to_startup_program(self, startup_program):
        names = list(self._to_fp16_var_names) if self._to_fp16_var_names else []
        names.sort()
        startup_program = (
            default_startup_program()
            if startup_program is None
            else startup_program
        )
        block = startup_program.global_block()
        param_names = [p.name for p in block.all_parameters()]
        for name in names:
            if name not in param_names:
                continue

            tmp = block.create_var(dtype=core.VarDesc.VarType.FP32)
            block.append_op(
                type='assign', inputs={'X': [name]}, outputs={'Out': [tmp]}
            )
            block.append_op(
                type='cast',
                inputs={'X': [tmp]},
                outputs={'Out': [name]},
                attrs={
                    'in_dtype': core.VarDesc.VarType.FP32,
                    'out_dtype': core.VarDesc.VarType.FP16,
                },
            )
        self._to_fp16_var_names = None

    def amp_init(
        self, place, scope=None, test_program=None, use_fp16_test=False
    ):
        """
        Init the AMP training, such as casting fp32 parameters to fp16 type.

        Args:
            place(CUDAPlace): place is used to initialize
                fp16 parameters with fp32 values.
            scope(Scope): The scope is used to find fp32 parameters.
            test_program(Program): The program is used for testing.
            use_fp16_test(bool): Whether to use fp16 testing.

        Examples:
            .. code-block:: python

                import numpy as np
                import paddle
                import paddle.nn.functional as F
                paddle.enable_static()

                def run_example_code():
                    place = paddle.CUDAPlace(0)
                    exe = paddle.static.Executor(place)
                    data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                    conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                    # 1) Use fp16_guard to control the range of fp16 kernels used.
                    with paddle.static.amp.fp16_guard():
                        bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                        pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                        hidden = paddle.static.nn.fc(pool, size=10)
                        loss = paddle.mean(hidden)
                    # 2) Create the optimizer and set `multi_precision` to True.
                    # Setting `multi_precision` to True can avoid the poor accuracy
300
                    # or the slow convergence in a way.
                    optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                    # 3) These ops in `custom_black_list` will be kept in the float32 computation type.
                    amp_list = paddle.static.amp.CustomOpLists(
                        custom_black_list=['pool2d'])
                    # 4) The entry of Paddle AMP.
                    # Enable pure fp16 training by setting `use_pure_fp16` to True.
                    optimizer = paddle.static.amp.decorate(
                        optimizer,
                        amp_list,
                        init_loss_scaling=128.0,
                        use_dynamic_loss_scaling=True,
                        use_pure_fp16=True)
                    # If you don't use the default_startup_program(), you should pass
                    # your defined `startup_program` into `minimize`.
                    optimizer.minimize(loss)
                    exe.run(paddle.static.default_startup_program())
                    # 5) Use `amp_init` after FP32 parameters initialization (such as `exe.run(startup_program)`).
                    # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                    optimizer.amp_init(place, scope=paddle.static.global_scope())

                if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                    run_example_code()
        """
        assert (
            self._train_program is not None
        ), "Please call the minimize method first."
        if self._use_pure_fp16:
            cast_parameters_to_fp16(
                place, self._train_program, scope, self._to_fp16_var_names
            )
        if test_program is not None:
            if self._use_pure_fp16:
                cast_model_to_fp16(
                    test_program, self._amp_lists, self._use_fp16_guard
                )
            elif use_fp16_test:
                rewrite_program(test_program, self._amp_lists)

    def apply_gradients(self, params_grads):
        """
        Check scaled gradients to determine whether to update loss scaling and update
        parameters by their scaled gradients.

        Args:
            params_grads (list): A list of params and scaled grads.

        Returns:
            A list of optimize operators.
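
        Examples:
            A minimal sketch, continuing from the `backward()` example above;
            `loss` and `scaled_params_grads` are assumed to come from that
            setup.

            .. code-block:: python

                import paddle.static as static

                # apply_gradients() must run under the training program, which
                # is what apply_optimize() does internally.
                with static.program_guard(loss.block.program):
                    optimize_ops = mp_optimizer.apply_gradients(scaled_params_grads)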
        """

        # Change the op_role_var attr for some ops, so that gradients
        # transferred across GPUs can be FP16.
        update_role_var_grad(self._train_program, params_grads)

        # When dynamic loss scaling is not used and the initial loss scaling
        # value equals 1.0, the gradients do not need to be checked or unscaled,
        # so the wrapped optimizer can apply them directly.
        if (
            not self._use_dynamic_loss_scaling
            and self._init_loss_scaling == 1.0
        ):
            return self._optimizer.apply_gradients(params_grads)

        if self._supports_check_nan_inf():
            self._optimizer._set_scale(self._loss_scaling)
            optimize_ops = self._optimizer.apply_gradients(params_grads)
            found_inf = self._optimizer._found_inf
            self._add_dynamic_loss_scaling(params_grads, found_inf)
            return optimize_ops

        found_inf = self._check_finite_and_unscale(params_grads)
        if self._use_dynamic_loss_scaling:
            self._add_dynamic_loss_scaling(params_grads, found_inf)

        # Pass found_inf to adam, to skip the update not only for the param, but also for momentum and beta_pow.
        # With fleet, optimizers are nested and the real optimizer set by the user is the innermost one.
        real_optimizer = self._optimizer
        while hasattr(real_optimizer, "inner_opt"):
            real_optimizer = real_optimizer.inner_opt
        if isinstance(
            real_optimizer,
            (paddle.fluid.optimizer.Adam, paddle.optimizer.AdamW),
        ):
            # NOTE(zhiqiu): Since found_inf needs to be on cpu in adam op, we
            # copy it in advance to avoid copying it multiple times.
            with self._train_program._optimized_guard([]):
                found_inf = paddle.tensor.creation._memcpy(
                    found_inf, paddle.CPUPlace()
                )
            real_optimizer._set_auxiliary_var('found_inf', found_inf)
        elif hasattr(real_optimizer, "_set_auxiliary_var"):
            real_optimizer._set_auxiliary_var('found_inf', found_inf)
        optimize_ops = self._optimizer.apply_gradients(params_grads)
        return optimize_ops

    def _split_grads(self, params_grads):
        grads = [g for _, g in params_grads]
        fp32_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP32]
        fp16_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP16]
        assert len(fp32_grads) + len(fp16_grads) == len(
            grads
        ), "Data types of all grads must be either fp16 or fp32."
        return grads, fp32_grads, fp16_grads

    def _check_finite_and_unscale(self, params_grads):
        grads, fp32_grads, fp16_grads = self._split_grads(params_grads)
        found_infs = []

        if self._is_distributed:
            # If distributed, split check_finite_and_unscale to overlap
            # unscaling with communication.
            if core.is_compiled_with_npu():
                with self._train_program._optimized_guard(grads):
                    _, found_inf = check_finite_and_unscale(
                        grads,
                        self._loss_scaling,
                        name="find_infinite_scale",
                        float_status=self._float_status,
                    )
                    found_infs.append(found_inf)
            else:
                for p, g in params_grads:
                    with self._train_program._optimized_guard([p, g]):
                        _, found_inf = check_finite_and_unscale(
                            [
                                g,
                            ],
                            self._loss_scaling,
                            name="find_infinite_scale",
                            float_status=self._float_status,
                        )
                        found_infs.append(found_inf)
        elif self._use_pure_fp16:
            if fp32_grads:
                with self._train_program._optimized_guard(fp32_grads):
                    _, fp32_found_inf = check_finite_and_unscale(
                        fp32_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp32",
                        float_status=self._float_status,
                    )
                found_infs.append(fp32_found_inf)
            if fp16_grads:
                with self._train_program._optimized_guard(fp16_grads):
                    _, fp16_found_inf = check_finite_and_unscale(
                        fp16_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp16",
                        float_status=self._float_status,
                    )
                found_infs.append(fp16_found_inf)
        else:
            with self._train_program._optimized_guard(grads):
                _, found_inf = check_finite_and_unscale(
                    grads,
                    self._loss_scaling,
                    name="find_infinite_scale",
                    float_status=self._float_status,
                )

        if self._is_distributed or self._use_pure_fp16:
            with self._train_program._optimized_guard([]):
                all_infs = paddle.concat(found_infs)
                found_inf = paddle.any(all_infs)

        return found_inf

    def _add_dynamic_loss_scaling(self, params_grads, found_inf):
        if self._supports_check_nan_inf():
            with self._train_program._optimized_guard([]):
                update_loss_scaling(
                    [],
                    found_inf,
                    self._loss_scaling,
                    self._num_good_steps,
                    self._num_bad_steps,
                    self._incr_every_n_steps,
                    self._decr_every_n_nan_or_inf,
                    self._incr_ratio,
                    self._decr_ratio,
                    stop_update=self._optimizer._get_stop_update_var(),
                    name="update_loss_scaling",
                )
            return

        grads, fp32_grads, fp16_grads = self._split_grads(params_grads)
        if self._use_pure_fp16:
            stop_update = False
            with self._train_program._optimized_guard([]):
                if fp32_grads:
                    update_loss_scaling(
                        fp32_grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        stop_update=stop_update,
                        name="update_loss_scaling_fp32",
                    )
                    stop_update = True
                if fp16_grads:
                    update_loss_scaling(
                        fp16_grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        stop_update=stop_update,
                        name="update_loss_scaling_fp16",
                    )
        else:
            with self._train_program._optimized_guard([]):
                update_loss_scaling(
                    grads,
                    found_inf,
                    self._loss_scaling,
                    self._num_good_steps,
                    self._num_bad_steps,
                    self._incr_every_n_steps,
                    self._decr_every_n_nan_or_inf,
                    self._incr_ratio,
                    self._decr_ratio,
                    name="update_loss_scaling",
                )

    def apply_optimize(self, loss, startup_program, params_grads):
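        """Apply gradients under the program that owns `loss`.

        A thin wrapper that switches to the training program (and the given
        startup program) before calling `apply_gradients()`.
        """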
        program = loss.block.program
        with program_guard(program, startup_program):
            optimize_ops = self.apply_gradients(params_grads)
        return optimize_ops

    def minimize(
        self, loss, startup_program=None, parameter_list=None, no_grad_set=None
    ):
        """
        Perform optimization by minimizing the given loss.

        Args:
            loss (Variable): The loss Variable.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables should be ignored.

        Returns:
            The list of optimize ops and a list of scaled (param, grad) pairs,
            i.e., the tuple `(optimize_ops, scaled_params_grads)`.
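
        Examples:
            A minimal sketch; the program construction is assumed to be the
            same as in the examples of `decorate` below.

            .. code-block:: python

                optimize_ops, scaled_params_grads = mp_optimizer.minimize(loss)
                # The loss actually used by the backward pass:
                scaled_loss = mp_optimizer.get_scaled_loss()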
        """

        opt_dict = self._optimizer.__class__.__dict__
        if 'minimize' in opt_dict and isinstance(
            opt_dict['minimize'], types.FunctionType
        ):
            warnings.warn(
                "The decorated optimizer has its own `minimize` method, but it will not be executed."
            )

        scaled_params_grads = self.backward(
            loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set,
        )

        optimize_ops = self.apply_optimize(
            loss, startup_program, scaled_params_grads
        )

        return optimize_ops, scaled_params_grads


def decorate(
    optimizer,
    amp_lists=None,
    init_loss_scaling=2**15,
    incr_every_n_steps=1000,
    decr_every_n_nan_or_inf=2,
    incr_ratio=2.0,
    decr_ratio=0.8,
    use_dynamic_loss_scaling=True,
    use_pure_fp16=False,
    use_fp16_guard=None,
):
    """
    Decorate the given optimizer to adapt it to mixed-precision training.

    Args:
        optimizer(Optimizer): A common Optimizer.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling(float): The initial loss scaling factor.
        incr_every_n_steps(int): Increases loss scaling every n consecutive
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n
                                      accumulated steps with nan or
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss
                           scaling.
        decr_ratio(float): The less-than-one multiplier to use when decreasing
                           the loss scaling.
        use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling.
        use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value is equal to `use_pure_fp16`.

    Returns:
        An optimizer acting like a normal one but with mixed-precision training
        enabled.

    Examples 1:
        .. code-block:: python

            # black&white list based strategy example
            import paddle
            import paddle.static as static

            paddle.enable_static()

            data = static.data(name='X', shape=[None, 1], dtype='float32')
            hidden = static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer = paddle.optimizer.Adam(learning_rate=0.001)

            mp_optimizer = static.amp.decorate(
                    optimizer=optimizer, init_loss_scaling=8.0)

            ops, param_grads = mp_optimizer.minimize(loss)
            scaled_loss = mp_optimizer.get_scaled_loss()

    Examples 2:
        .. code-block:: python

            # pure fp16 training example
            import numpy as np
            import paddle
            import paddle.nn.functional as F

            def run_example_code():
                place = paddle.CUDAPlace(0)
                exe = paddle.static.Executor(place)
                data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                # 1) Use fp16_guard to control the range of fp16 kernels used.
                with paddle.static.amp.fp16_guard():
                    bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                    pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                    hidden = paddle.static.nn.fc(pool, size=10)
                    loss = paddle.mean(hidden)
                # 2) Create the optimizer and set `multi_precision` to True.
                # Setting `multi_precision` to True can help avoid poor accuracy
                # or slow convergence in some cases.
                optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                # 3) These ops in `custom_black_list` will be kept in the float32 computation type.
                amp_list = paddle.static.amp.CustomOpLists(
                    custom_black_list=['pool2d'])
                # 4) The entry of Paddle AMP.
                # Enable pure fp16 training by setting `use_pure_fp16` to True.
                optimizer = paddle.static.amp.decorate(
                    optimizer,
                    amp_list,
                    init_loss_scaling=128.0,
                    use_dynamic_loss_scaling=True,
                    use_pure_fp16=True)
                # If you don't use the default_startup_program(), you should pass
                # your defined `startup_program` into `minimize`.
                optimizer.minimize(loss)
                exe.run(paddle.static.default_startup_program())
                # 5) Use `amp_init` after FP32 parameters initialization (such as `exe.run(startup_program)`).
                # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                optimizer.amp_init(place, scope=paddle.static.global_scope())

            if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                run_example_code()
    """
    if amp_lists is None:
        amp_lists = AutoMixedPrecisionLists()

    if use_fp16_guard is None:
        use_fp16_guard = use_pure_fp16

    mp_optimizer = OptimizerWithMixedPrecision(
        optimizer,
        amp_lists,
        init_loss_scaling,
        use_dynamic_loss_scaling,
        incr_every_n_steps,
        decr_every_n_nan_or_inf,
        incr_ratio,
        decr_ratio,
        use_pure_fp16,
        use_fp16_guard,
    )

    return mp_optimizer