# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ... import core
from ... import default_main_program
from ... import default_startup_program
from ... import framework
from ... import layers
from ... import program_guard
from ... import unique_name
from . import fp16_utils
from .fp16_utils import rewrite_program
from .fp16_utils import cast_model_to_fp16
from .fp16_utils import cast_parameters_to_fp16
from .fp16_utils import update_role_var_grad
from .fp16_lists import AutoMixedPrecisionLists
from .amp_nn import check_finite_and_unscale
from .amp_nn import update_loss_scaling
import types
import warnings
import paddle

__all__ = ["decorate"]


class OptimizerWithMixedPrecision(object):
    """
    Optimizer with mixed-precision (MP) training. This is a wrapper of a common 
Z
Zhen Wang 已提交
40
    optimizer, plus the support of mixed-precision pre-training. The object
41 42 43 44 45 46 47
    of this class almost has the same behavior as the common optimizer, with the 
    methods `minimize()`, `backward()`, `apply_gradients()` implemented. 
    Additionally, it enables the MP training automatically, i.e, the creation 
    and maintenance of master parameters, scaling of loss, etc.

    Args:
        optimizer (Optimizer): A common Optimizer object.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling (float): The initial loss scaling factor.
        use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling.
        incr_every_n_steps(int): Increases loss scaling every n consecutive 
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n 
                                      accumulated steps with nan or 
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss 
                           scaling.
        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
                           the loss scaling.
        use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value is equal to `use_pure_fp16`.
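
    Examples:
        A minimal usage sketch (illustrative only; `loss` and `optimizer` are
        assumed to be defined in the user's program, and instances of this class
        are normally created through `decorate()` rather than constructed
        directly):

        .. code-block:: python

            mp_optimizer = paddle.static.amp.decorate(
                optimizer, init_loss_scaling=8.0)
            ops, param_grads = mp_optimizer.minimize(loss)
            scaled_loss = mp_optimizer.get_scaled_loss()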

    """

    def __init__(self, optimizer, amp_lists, init_loss_scaling,
                 use_dynamic_loss_scaling, incr_every_n_steps,
                 decr_every_n_nan_or_inf, incr_ratio, decr_ratio, use_pure_fp16,
                 use_fp16_guard):
        self._optimizer = optimizer
        self._amp_lists = amp_lists
        self._param_grads = None
        self._train_program = None

        self._is_distributed = False
        self._scaled_loss = None
        self._loss_scaling = None
        self._init_loss_scaling = init_loss_scaling
        self._use_dynamic_loss_scaling = use_dynamic_loss_scaling
        self._learning_rate = optimizer._learning_rate
        self._learning_rate_map = optimizer._learning_rate_map
        self._use_pure_fp16 = use_pure_fp16
        self._use_fp16_guard = use_fp16_guard
        self._to_fp16_var_names = None
        if self._use_dynamic_loss_scaling:
            self._incr_every_n_steps = incr_every_n_steps
            self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
            self._incr_ratio = incr_ratio
            self._decr_ratio = decr_ratio
            self._num_good_steps = None
            self._num_bad_steps = None

    def _set_distributed(self, flag):
        # If distributed, all cards will communicate with each other, so
        # overlap communication and computation by splitting the
        # check_finite_and_unscale op.
        self._is_distributed = flag

    def get_loss_scaling(self):
        """Return the real-time loss scaling factor.
        """
        assert self._loss_scaling is not None, 'Please call minimize() before calling get_loss_scaling().'
        return self._loss_scaling

    def get_scaled_loss(self):
        """Return the scaled loss.
        It is useful when you feed a custom loss into the executor.
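
        Example (a sketch; `mp_optimizer`, `exe` and `feed_dict` are assumed to
        be defined by the caller):

        .. code-block:: python

            scaled_loss = mp_optimizer.get_scaled_loss()
            exe.run(feed=feed_dict, fetch_list=[scaled_loss])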
        """
        return self._scaled_loss

    def _supports_check_nan_inf(self):
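        # Whether the wrapped optimizer checks nan/inf in the gradients by
        # itself. If it does, the separate check_finite_and_unscale pass in
        # apply_gradients() is skipped.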
        return getattr(self._optimizer, "_supports_check_nan_inf", False)

    def _init_amp_var(self):
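        # Create the persistable global variables used by AMP: the loss scaling
        # factor and, when dynamic loss scaling is enabled, the counters of
        # consecutive good/bad steps.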
        self._loss_scaling = layers.create_global_var(
            name=unique_name.generate("loss_scaling"),
            shape=[1],
            value=self._init_loss_scaling,
            dtype='float32',
            persistable=True)

        if self._use_dynamic_loss_scaling:
            self._num_good_steps = layers.create_global_var(
                name=unique_name.generate("num_good_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)
            self._num_bad_steps = layers.create_global_var(
                name=unique_name.generate("num_bad_steps"),
                shape=[1],
                value=0,
                dtype='int32',
                persistable=True)

        # Ensure the data type of learning rate vars is float32 (same as the
        # master parameter dtype)
        if isinstance(self._optimizer._learning_rate, float):
            self._optimizer._learning_rate_map[default_main_program()] = \
                    layers.create_global_var(
                    name=unique_name.generate("learning_rate"),
                    shape=[1],
                    value=float(self._optimizer._learning_rate),
                    dtype='float32',
                    persistable=True)

    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        """
        Backward propagation or auto differentiation for gradients' computation.

        Args:
            loss (Variable): The loss Variable to minimize.
            startup_program (Program|None): The startup Program for initializing 
                                       parameters in `parameter_list`.
            parameter_list (list|None): A list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.
            callbacks (list|None): A list of callable objects to run when appending
                                   the backward operator for one parameter.

        Returns:
            A list of (param, grad) tuples, where each grad is the scaled
            gradient of the corresponding parameter.
        """
        train_program = loss.block.program
        self._train_program = train_program

        # NOTE(zhiqiu): _float_status is only used for NPU.
        if core.is_compiled_with_npu():
            float_status = paddle.static.data(
                name="float_status", shape=[8], dtype='float32')
            self._train_program.global_block().append_op(
                type="alloc_float_status",
                outputs={"FloatStatus": float_status}, )
            self._train_program.global_block().append_op(
                type="clear_float_status",
                inputs={"FloatStatus": float_status},
                outputs={"FloatStatusOut": float_status}, )
            self._float_status = float_status
        else:
            self._float_status = None

        with program_guard(self._train_program, startup_program):
            self._init_amp_var()

            if self._use_pure_fp16:
                self._to_fp16_var_names = cast_model_to_fp16(
                    self._train_program, self._amp_lists, self._use_fp16_guard)
            else:
                rewrite_program(self._train_program, self._amp_lists)

            if loss.dtype != core.VarDesc.VarType.FP32:
                loss = loss.astype('float32')
            # When dynamic loss scaling is disabled and the initial loss scaling
            # equals 1.0, scaling the loss is a no-op and can be skipped.
            if self._use_dynamic_loss_scaling or self._init_loss_scaling != 1.0:
                self._scaled_loss = loss * self._loss_scaling
            else:
                self._scaled_loss = loss

            params_grads = self._optimizer.backward(
                self._scaled_loss, startup_program, parameter_list, no_grad_set,
                callbacks)
            if self._supports_check_nan_inf():
                self._add_cast_ops_to_startup_program(startup_program)
        return params_grads

    def _add_cast_ops_to_startup_program(self, startup_program):
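        # Append cast ops to the startup program so that parameters that were
        # cast to fp16 in the main program also get fp16 initial values: each
        # fp32 initial value is copied into a temporary variable and then cast
        # back into the original parameter as fp16.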
        names = list(self._to_fp16_var_names) if self._to_fp16_var_names else []
        names.sort()
        startup_program = default_startup_program(
        ) if startup_program is None else startup_program
        block = startup_program.global_block()
        param_names = [p.name for p in block.all_parameters()]
        for name in names:
            if name not in param_names:
                continue

            tmp = block.create_var(dtype=core.VarDesc.VarType.FP32)
            block.append_op(
                type='assign', inputs={'X': [name]}, outputs={'Out': [tmp]})
            block.append_op(
                type='cast',
                inputs={'X': [tmp]},
                outputs={'Out': [name]},
                attrs={
                    'in_dtype': core.VarDesc.VarType.FP32,
                    'out_dtype': core.VarDesc.VarType.FP16,
                })
        self._to_fp16_var_names = None

    def amp_init(self,
                 place,
                 scope=None,
                 test_program=None,
                 use_fp16_test=False):
        """
        Initialize AMP training, such as casting fp32 parameters to fp16 type.
  
        Args:
            place(CUDAPlace): place is used to initialize 
                fp16 parameters with fp32 values.
            scope(Scope): The scope is used to find fp32 parameters.
            test_program(Program): The program is used for testing.
            use_fp16_test(bool): Whether to use fp16 testing.

        Examples:
            .. code-block:: python

                import numpy as np
                import paddle
                import paddle.nn.functional as F
                paddle.enable_static()

                def run_example_code():
                    place = paddle.CUDAPlace(0)
                    exe = paddle.static.Executor(place)
                    data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                    conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                    # 1) Use fp16_guard to control the range of fp16 kernels used.
                    with paddle.static.amp.fp16_guard():
                        bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                        pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                        hidden = paddle.static.nn.fc(pool, size=10)
                        loss = paddle.mean(hidden)
                    # 2) Create the optimizer and set `multi_precision` to True.
                    # Setting `multi_precision` to True can help avoid poor accuracy
                    # or slow convergence.
                    optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                    # 3) These ops in `custom_black_list` will keep in the float32 computation type.
                    amp_list = paddle.static.amp.CustomOpLists(
                        custom_black_list=['pool2d'])
                    # 4) The entry of Paddle AMP.
                    # Enable pure fp16 training by setting `use_pure_fp16` to True.
                    optimizer = paddle.static.amp.decorate(
                        optimizer,
                        amp_list,
                        init_loss_scaling=128.0,
                        use_dynamic_loss_scaling=True,
                        use_pure_fp16=True)
                    # If you don't use the default_startup_program(), you should pass
                    # your defined `startup_program` into `minimize`.
                    optimizer.minimize(loss)
                    exe.run(paddle.static.default_startup_program())
                    # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                    # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                    optimizer.amp_init(place, scope=paddle.static.global_scope())
                    
                if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                    run_example_code()       
        """
        assert self._train_program is not None, \
            "Please call the minimize method first."
        if self._use_pure_fp16:
            cast_parameters_to_fp16(place, self._train_program, scope,
                                    self._to_fp16_var_names)
        if test_program is not None:
            if self._use_pure_fp16:
                cast_model_to_fp16(test_program, self._amp_lists,
                                   self._use_fp16_guard)
            elif use_fp16_test:
                rewrite_program(test_program, self._amp_lists)

    def apply_gradients(self, params_grads):
        """
        Check scaled gradients to determine whether to update the loss scaling, then
        update the parameters by their unscaled gradients.

        Args:
            params_grads (list): A list of params and scaled grads.
    
        Returns:
            A list of optimize operators.
        """

        # Change the op_role_var attr for some ops, so that gradients
        # transferred across GPUs can be FP16.
        update_role_var_grad(self._train_program, params_grads)

        # When dynamic loss scaling is disabled and the initial loss scaling
        # equals 1.0, gradients are not scaled, so unscaling and loss-scaling
        # updates can be skipped.
        if not self._use_dynamic_loss_scaling and self._init_loss_scaling == 1.0:
            return self._optimizer.apply_gradients(params_grads)

        if self._supports_check_nan_inf():
            self._optimizer._set_scale(self._loss_scaling)
            optimize_ops = self._optimizer.apply_gradients(params_grads)
            found_inf = self._optimizer._found_inf
            self._add_dynamic_loss_scaling(params_grads, found_inf)
            return optimize_ops

        found_inf = self._check_finite_and_unscale(params_grads)
        if self._use_dynamic_loss_scaling:
            self._add_dynamic_loss_scaling(params_grads, found_inf)

        # Pass found_inf to adam to skip the update not only for the param, but also for momentum and beta_pow.
        # With fleet, optimizers are nested and the real optimizer set by the user is the innermost one.
        real_optimizer = self._optimizer
        while hasattr(real_optimizer, "inner_opt"):
            real_optimizer = real_optimizer.inner_opt
        if isinstance(real_optimizer, (paddle.fluid.optimizer.Adam,
                                       paddle.optimizer.AdamW)):
            # NOTE(zhiqiu): Since found_inf needs to be on cpu in adam op, we
            # copy it in advance to avoid multiple time copies.
            with self._train_program._optimized_guard([]):
                found_inf = paddle.tensor.creation._memcpy(found_inf,
                                                           paddle.CPUPlace())
            real_optimizer._set_auxiliary_var('found_inf', found_inf)
        elif hasattr(real_optimizer, "_set_auxiliary_var"):
            real_optimizer._set_auxiliary_var('found_inf', found_inf)
        optimize_ops = self._optimizer.apply_gradients(params_grads)
        return optimize_ops

    def _split_grads(self, params_grads):
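        # Partition gradients by dtype so that fp32 and fp16 gradients can be
        # checked and unscaled separately.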
        grads = [g for _, g in params_grads]
        fp32_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP32]
        fp16_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP16]
        assert len(fp32_grads) + len(fp16_grads) == len(grads), \
            "Data types of all grads must be either fp16 or fp32."
        return grads, fp32_grads, fp16_grads

    def _check_finite_and_unscale(self, params_grads):
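        # Unscale the gradients and check them for nan/inf. Returns a bool
        # variable `found_inf` that is True if any gradient is non-finite.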
        grads, fp32_grads, fp16_grads = self._split_grads(params_grads)
        found_infs = []

        if self._is_distributed:
            # If distributed, split check_finite_and_unscale to overlap
            # unscaling with communication.
            if core.is_compiled_with_npu():
                with self._train_program._optimized_guard(grads):
                    _, found_inf = check_finite_and_unscale(
                        grads,
                        self._loss_scaling,
                        name="find_infinite_scale",
                        float_status=self._float_status)
                    found_infs.append(found_inf)
            else:
                for p, g in params_grads:
                    with self._train_program._optimized_guard([p, g]):
                        _, found_inf = check_finite_and_unscale(
                            [g, ],
                            self._loss_scaling,
                            name="find_infinite_scale",
                            float_status=self._float_status)
                        found_infs.append(found_inf)
        elif self._use_pure_fp16:
            if fp32_grads:
                with self._train_program._optimized_guard(fp32_grads):
                    _, fp32_found_inf = check_finite_and_unscale(
                        fp32_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp32",
                        float_status=self._float_status)
                found_infs.append(fp32_found_inf)
            if fp16_grads:
                with self._train_program._optimized_guard(fp16_grads):
                    _, fp16_found_inf = check_finite_and_unscale(
                        fp16_grads,
                        self._loss_scaling,
                        name="find_infinite_scale_fp16",
                        float_status=self._float_status)
                found_infs.append(fp16_found_inf)
        else:
            with self._train_program._optimized_guard(grads):
                _, found_inf = check_finite_and_unscale(
                    grads,
                    self._loss_scaling,
                    name="find_infinite_scale",
                    float_status=self._float_status)

        if self._is_distributed or self._use_pure_fp16:
            with self._train_program._optimized_guard([]):
                all_infs = layers.concat(found_infs)
                found_inf = layers.reduce_any(all_infs)

        return found_inf

    def _add_dynamic_loss_scaling(self, params_grads, found_inf):
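        # Append update_loss_scaling ops that increase the loss scaling after
        # `incr_every_n_steps` consecutive finite steps and decrease it after
        # `decr_every_n_nan_or_inf` steps with nan/inf gradients.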
        if self._supports_check_nan_inf():
            with self._train_program._optimized_guard([]):
                update_loss_scaling(
                    [],
                    found_inf,
                    self._loss_scaling,
                    self._num_good_steps,
                    self._num_bad_steps,
                    self._incr_every_n_steps,
                    self._decr_every_n_nan_or_inf,
                    self._incr_ratio,
                    self._decr_ratio,
                    stop_update=False,
                    name="update_loss_scaling")
            return

        grads, fp32_grads, fp16_grads = self._split_grads(params_grads)
        if self._use_pure_fp16:
            stop_update = False
            with self._train_program._optimized_guard([]):
                if fp32_grads:
                    update_loss_scaling(
                        fp32_grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        stop_update=stop_update,
                        name="update_loss_scaling_fp32")
                    stop_update = True
                if fp16_grads:
                    update_loss_scaling(
                        fp16_grads,
                        found_inf,
                        self._loss_scaling,
                        self._num_good_steps,
                        self._num_bad_steps,
                        self._incr_every_n_steps,
                        self._decr_every_n_nan_or_inf,
                        self._incr_ratio,
                        self._decr_ratio,
                        stop_update=stop_update,
                        name="update_loss_scaling_fp16")
        else:
            with self._train_program._optimized_guard([]):
                update_loss_scaling(
                    grads,
                    found_inf,
                    self._loss_scaling,
                    self._num_good_steps,
                    self._num_bad_steps,
                    self._incr_every_n_steps,
                    self._decr_every_n_nan_or_inf,
                    self._incr_ratio,
                    self._decr_ratio,
                    name="update_loss_scaling")

    def apply_optimize(self, loss, startup_program, params_grads):
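        # Apply the already-computed gradients inside a program_guard over the
        # program that owns `loss` and the given startup program.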
        program = loss.block.program
        with program_guard(program, startup_program):
            optimize_ops = self.apply_gradients(params_grads)
        return optimize_ops

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        Perform optimization by minimizing the given loss.

        Args:
            loss (Variable): The loss Variable.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): A set of Variables that should be ignored.

        Returns:
            The list of optimize ops and a list of scaled parameters and
            gradients.
        """

        opt_dict = self._optimizer.__class__.__dict__
        if 'minimize' in opt_dict and isinstance(opt_dict['minimize'],
                                                 types.FunctionType):
            warnings.warn(
                "The decorated optimizer has its own `minimize` method, but it will not be executed."
            )

        scaled_params_grads = self.backward(
            loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set)

        optimize_ops = self.apply_optimize(loss, startup_program,
                                           scaled_params_grads)

        return optimize_ops, scaled_params_grads


def decorate(optimizer,
             amp_lists=None,
             init_loss_scaling=2**15,
             incr_every_n_steps=1000,
             decr_every_n_nan_or_inf=2,
             incr_ratio=2.0,
             decr_ratio=0.8,
             use_dynamic_loss_scaling=True,
             use_pure_fp16=False,
             use_fp16_guard=None):
    """ 
    Decorate the given optimizer to adapt to the mixed-precision training.

    Args:
        optimizer(Optimizer): A common Optimizer.
        amp_lists (CustomOpLists): A CustomOpLists object.
        init_loss_scaling(float): The initial loss scaling factor.
        incr_every_n_steps(int): Increases loss scaling every n consecutive 
                                 steps with finite gradients.
        decr_every_n_nan_or_inf(int): Decreases loss scaling every n 
                                      accumulated steps with nan or 
                                      inf gradients.
        incr_ratio(float): The multiplier to use when increasing the loss 
                           scaling.
        decr_ratio(float): The less-than-one-multiplier to use when decreasing 
                           the loss scaling.
        use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling.
        use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
        use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                           Default None, which means that its value equals `use_pure_fp16`.

    Returns:
        An optimizer acting like a normal one but with mixed-precision training 
        enabled.

    Examples 1:
        .. code-block:: python

            # black&white list based strategy example
            import paddle
            import paddle.static as static

            paddle.enable_static()

            data = static.data(name='X', shape=[None, 1], dtype='float32')
            hidden = static.nn.fc(x=data, size=10)
            loss = paddle.mean(hidden)
            optimizer = paddle.optimizer.Adam(learning_rate=0.001)

            mp_optimizer = static.amp.decorate(
                    optimizer=optimizer, init_loss_scaling=8.0)

            ops, param_grads = mp_optimizer.minimize(loss)
            scaled_loss = mp_optimizer.get_scaled_loss()

    Examples 2:
        .. code-block:: python

            # pure fp16 training example
            import numpy as np
            import paddle
            import paddle.nn.functional as F

            def run_example_code():
                place = paddle.CUDAPlace(0)
                exe = paddle.static.Executor(place)
                data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32')
                conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3)
                # 1) Use fp16_guard to control the range of fp16 kernels used.
                with paddle.static.amp.fp16_guard():
                    bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
                    pool = F.max_pool2d(bn, kernel_size=2, stride=2)
                    hidden = paddle.static.nn.fc(pool, size=10)
                    loss = paddle.mean(hidden)
                # 2) Create the optimizer and set `multi_precision` to True.
                # Setting `multi_precision` to True can help avoid poor accuracy
                # or slow convergence.
                optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                # 3) These ops in `custom_black_list` will keep in the float32 computation type.
                amp_list = paddle.static.amp.CustomOpLists(
                    custom_black_list=['pool2d'])
                # 4) The entry of Paddle AMP.
                # Enable pure fp16 training by setting `use_pure_fp16` to True.
                optimizer = paddle.static.amp.decorate(
                    optimizer,
                    amp_list,
                    init_loss_scaling=128.0,
                    use_dynamic_loss_scaling=True,
                    use_pure_fp16=True)
                # If you don't use the default_startup_program(), you should pass
                # your defined `startup_program` into `minimize`.
                optimizer.minimize(loss)
                exe.run(paddle.static.default_startup_program())
                # 5) Use `amp_init` after FP32 parameter initialization (such as `exe.run(startup_program)`).
                # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                optimizer.amp_init(place, scope=paddle.static.global_scope())
                
            if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                run_example_code()
    """
    if amp_lists is None:
        amp_lists = AutoMixedPrecisionLists()

    if use_fp16_guard is None:
        use_fp16_guard = use_pure_fp16

    mp_optimizer = OptimizerWithMixedPrecision(
        optimizer, amp_lists, init_loss_scaling, use_dynamic_loss_scaling,
        incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
        use_pure_fp16, use_fp16_guard)

    return mp_optimizer