[Cherry-pick] Add pure fp16 training with master weights. (#29301)

* Add pure fp16 training with master weights. (#27712) * add the weight decay func for the momentum op * Add the multi_precision function in Momentum Optimizer. * Make sure that the initial value of master weights are same with the fp16 weights. * add static loss scaling. * add the rescale_grad function in the pure fp16 training. * use the original momentum updating method. * Polish some codes, such as variable names. * add docstring for apis. * update the var creation details of _create_master_weight. * not modify codes about imperative momentum updating. * Fix the error of test_dist_sparse_tensor_load_momentum UT. * add unit test for multi precision fp16 training. * add more unit tests for CI. * Use lower threshold values for allclose comparing in test_multi_precision_fp16_train UT.

[Cherry-pick] Add pure fp16 training with master weights. (#29301)
* Add pure fp16 training with master weights. (#27712) * add the weight decay func for the momentum op * Add the multi_precision function in Momentum Optimizer. * Make sure that the initial value of master weights are same with the fp16 weights. * add static loss scaling. * add the rescale_grad function in the pure fp16 training. * use the original momentum updating method. * Polish some codes, such as variable names. * add docstring for apis. * update the var creation details of _create_master_weight. * not modify codes about imperative momentum updating. * Fix the error of test_dist_sparse_tensor_load_momentum UT. * add unit test for multi precision fp16 training. * add more unit tests for CI. * Use lower threshold values for allclose comparing in test_multi_precision_fp16_train UT.
d8ea8a06 · Zhen Wang · GitHub · 9cec0161 · d8ea8a06 · d8ea8a06
9 changed files
--- a/paddle/fluid/operators/optimizers/momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/momentum_op.cc
@@ -49,13 +49,17 @@ void MomentumOpMaker::Make() {
  AddInput("LearningRate",
           "(Tensor, default Tensor<float>) "
           "Input learning rate");
-
+  AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
  AddOutput("ParamOut",
            "(Tensor) This output is updated parameter. "
            "It shared memory with Input(Param).");
  AddOutput("VelocityOut",
            "(Tensor) This output is updated velocity. "
            "It shared memory with Input(Velocity).");
+  AddOutput("MasterParamOut",
+            "The updated FP32 master weight for AMP. "
+            "It shared memory with Input(MasterParam).")
+      .AsDispensable();

  AddAttr<float>("mu", "(float) Momentum coefficient");
  AddAttr<bool>("use_nesterov",
@@ -67,7 +71,17 @@ void MomentumOpMaker::Make() {
      "(string) regularization_method, right now only support l2decay or none")
      .SetDefault("");
  AddAttr<float>("regularization_coeff", "(float) regularization_coeff")
-      .SetDefault(0);
+      .SetDefault(0.0f);
+  AddAttr<bool>("multi_precision",
+                "(bool, default false) "
+                "Whether to use multi-precision during weight updating.")
+      .SetDefault(false);
+  AddAttr<float>(
+      "rescale_grad",
+      "(float, default 1.0) Multiply the gradient with `rescale_grad` "
+      "before updating. Often choose to be `1.0/batch_size`.")
+      .SetDefault(1.0f);
+
  AddComment(R"DOC(
 Momentum Optimizer.

@@ -109,4 +123,12 @@ REGISTER_OP_VERSION(momentum)
                     "l2decay or none",
                     std::string(""))
            .NewAttr("regularization_coeff", "(float) regularization_coeff",
-                     0.0f));
+                     0.0f)
+            .NewAttr(
+                "multi_precision",
+                "(bool) Whether to use multi-precision during weight updating.",
+                false)
+            .NewAttr("rescale_grad",
+                     "(float) Multiply the gradient with `rescale_grad` "
+                     "before updating. Often choose to be `1.0/batch_size`.",
+                     1.0f));
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -54,6 +54,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
    {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}},
    {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}},
    {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}},
+    {"momentum", {"Param", "Grad", "Velocity", "LearningRate"}},
 };

 // NOTE(zhiqiu): Like op_ins_map.
@@ -82,6 +83,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
    {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}},
    {"multiclass_nms3", {"Out", "NmsRoisNum"}},
    {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
+    {"momentum", {"ParamOut", "VelocityOut"}},
 };

 // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are

--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -16,6 +16,12 @@ from __future__ import print_function

 from ... import core
 from ... import layers
+from ... import global_scope
+from ...log_helper import get_logger
+import logging
+import numpy as np
+_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')


 def _rename_arg(op, old_name, new_name):
@@ -190,6 +196,127 @@ def _is_in_black_varnames(op, amp_lists):
    return False


+def cast_model_to_fp16(main_program):
+    """
+    Switch every FP32 variable used by the ops of ``main_program`` to FP16.
+
+    All blocks of the program are visited. For each op, the variables bound
+    to its input and output slots have their dtype changed from FP32 to FP16
+    in the variable description, and the op attributes ``in_dtype``,
+    ``out_dtype`` and ``dtype`` are rewritten from FP32 to FP16 as well.
+    For ``batch_norm``, ``fused_bn_add_activation`` and ``layer_norm`` ops
+    only the ``X``/``Z`` input slots and the ``Y`` output slot are converted;
+    their other slots are left as they are. ``create_py_reader`` and ``read``
+    ops are skipped entirely.
+
+    Args:
+        main_program (Program): The main program for training.
+    """
+    fp32 = core.VarDesc.VarType.FP32
+    fp16 = core.VarDesc.VarType.FP16
+    castable_var_types = (core.VarDesc.VarType.LOD_TENSOR,
+                          core.VarDesc.VarType.SELECTED_ROWS,
+                          core.VarDesc.VarType.LOD_TENSOR_ARRAY)
+    norm_op_types = ('batch_norm', 'fused_bn_add_activation', 'layer_norm')
+    reader_op_types = ('create_py_reader', 'read')
+    dtype_attr_names = ('in_dtype', 'out_dtype', 'dtype')
+    global_block = main_program.global_block()
+
+    def find_var(block, var_name):
+        # Look in the op's own block first; on ValueError retry in the
+        # global block (a failure there is not caught).
+        try:
+            return block.var(var_name)
+        except ValueError as e:
+            _logger.debug(
+                "-- {}, try to get it in the global block. --".format(e))
+            found = global_block.var(var_name)
+            if found is not None:
+                _logger.debug("-- var {} is got in the global block. --".
+                              format(var_name))
+            return found
+
+    def cast_vars(block, op, var_names, log_template):
+        # Convert each named variable of a supported type from FP32 to FP16.
+        for var_name in var_names:
+            var = find_var(block, var_name)
+            if var is None or var.type not in castable_var_types:
+                continue
+            if var.dtype == fp32:
+                var.desc.set_dtype(fp16)
+            _logger.debug(log_template.format(op.type, var_name, var.dtype))
+
+    for block in main_program.blocks:
+        for op in block.ops:
+            if op.type in reader_op_types:
+                continue
+            is_norm_op = op.type in norm_op_types
+
+            for slot in op.input_names:
+                if is_norm_op and slot not in ('X', 'Z'):
+                    continue
+                cast_vars(
+                    block, op,
+                    op.input(slot),
+                    "-- op type: {}, in var name: {}, in var dtype: {} --")
+
+            for slot in op.output_names:
+                if is_norm_op and slot != 'Y':
+                    continue
+                cast_vars(
+                    block, op,
+                    op.output(slot),
+                    "-- op type: {}, out var name: {}, out var dtype: {} --")
+
+            # Keep the dtype attributes in sync with the converted variables.
+            for attr_name in dtype_attr_names:
+                if op.has_attr(attr_name) and op.attr(attr_name) == fp32:
+                    op._set_attr(attr_name, fp16)
+
+
+def cast_parameters_to_fp16(place, main_program, scope=None):
+    """
+    Convert the values of the model parameters stored in a scope to FP16.
+
+    Every parameter of the global block is read from the scope, converted
+    with ``np.float16`` and written back on ``place``. Variables that feed an
+    input slot other than ``X``/``Z`` of a ``batch_norm``,
+    ``fused_bn_add_activation`` or ``layer_norm`` op are left unchanged.
+
+    Args:
+        place(fluid.CPUPlace|fluid.CUDAPlace): place on which the converted
+            tensors are set.
+        main_program (Program): The main program for training.
+        scope(fluid.Scope, optional): scope holding the parameter tensors.
+            When None, ``global_scope()`` is used. Default is None.
+    """
+    norm_op_types = ('batch_norm', 'fused_bn_add_activation', 'layer_norm')
+    data_slots = ('X', 'Z')
+
+    # Names of all variables consumed by a normalization op through a
+    # non-data slot; these are excluded from the conversion below.
+    kept_names = set()
+    for block in main_program.blocks:
+        for op in block.ops:
+            if op.type not in norm_op_types:
+                continue
+            for slot in op.input_names:
+                if slot in data_slots:
+                    continue
+                kept_names.update(op.input(slot))
+
+    params = main_program.global_block().all_parameters()
+    var_scope = global_scope() if scope is None else scope
+    for param in params:
+        if param.name in kept_names:
+            continue
+        tensor = var_scope.find_var(param.name).get_tensor()
+        tensor.set(np.float16(np.array(tensor)), place)
+
+
 def rewrite_program(main_prog, amp_lists):
    """
    Traverse all ops in current block and insert cast op according to 

--- a/python/paddle/fluid/contrib/optimizer.py
+++ b/python/paddle/fluid/contrib/optimizer.py
@@ -14,11 +14,13 @@
 from paddle.fluid.optimizer import Optimizer
 from paddle.fluid.regularizer import L1DecayRegularizer
 from paddle.fluid.regularizer import L2DecayRegularizer
-from paddle.fluid.regularizer import append_regularization_ops
-from paddle.fluid import framework
 from paddle.fluid import core
+from paddle.fluid import framework
 from paddle.fluid.framework import program_guard
-from paddle.fluid.clip import append_gradient_clip_ops
+from paddle.fluid import unique_name
+from paddle.fluid import layers
+from paddle.fluid.layer_helper import LayerHelper
+import warnings

 __all__ = ['Momentum']

@@ -61,6 +63,9 @@ class Momentum(Optimizer):
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
+        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
+            Often choose to be ``1.0/batch_size``.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

@@ -105,6 +110,8 @@ class Momentum(Optimizer):
                 use_nesterov=False,
                 regularization=None,
                 grad_clip=None,
+                 multi_precision=False,
+                 rescale_grad=1.0,
                 name=None):
        assert learning_rate is not None
        assert momentum is not None
@@ -124,11 +131,68 @@ class Momentum(Optimizer):
        if (isinstance(regularization, L2DecayRegularizer)):
            self._regularization_method = "l2_decay"
            self._regularization_coeff = regularization._regularization_coeff
+        self._multi_precision = multi_precision
+        self._rescale_grad = rescale_grad
+        self._master_weights = {}
+
+    def _create_master_weight(self, param):
+        """Create the FP32 master copy of ``param``.
+
+        A persistable float32 global variable with the shape of ``param`` is
+        created, and a ``cast`` op writing ``param`` into it is appended to
+        the global block of the startup program. The new variable is stored
+        in ``self._master_weights`` under ``param.name``.
+
+        Args:
+            param: parameter variable that needs a master weight.
+
+        Returns:
+            the newly created master weight variable.
+        """
+        assert isinstance(self.helper, LayerHelper)
+
+        master_name = unique_name.generate(param.name + "_fp32_master")
+        master_weight = layers.create_global_var(
+            name=master_name,
+            shape=param.shape,
+            value=0,
+            dtype='float32',
+            persistable=True)
+
+        # Initialize the master weight from the parameter at startup.
+        startup_block = self.helper.startup_program.global_block()
+        cast_attrs = {
+            "in_dtype": param.dtype,
+            "out_dtype": core.VarDesc.VarType.FP32
+        }
+        startup_block.append_op(
+            type="cast",
+            inputs={"X": [param]},
+            outputs={"Out": [master_weight]},
+            attrs=cast_attrs)
+
+        self._master_weights[param.name] = master_weight
+        return master_weight
+
+    def _get_accumulator(self, name, param):
+        """Return the accumulator registered under ``name`` for ``param``.
+
+        When multi-precision is enabled and ``param`` is FP16, the lookup
+        uses the name of the parameter's FP32 master weight instead of the
+        parameter's own name.
+
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be fetched
+
+        Returns:
+            accumulator variable for the parameter
+
+        Raises:
+            Exception: if no accumulator with that name exists for the
+                parameter (or its master weight).
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+
+        use_master = (self._multi_precision and
+                      param.dtype == core.VarDesc.VarType.FP16)
+        if use_master:
+            lookup_name = self._master_weights[param.name].name
+        else:
+            lookup_name = param.name
+
+        if (name in self._accumulators and
+                lookup_name in self._accumulators[name]):
+            return self._accumulators[name][lookup_name]
+        raise Exception("Accumulator {} does not exist for parameter {}".
+                        format(name, lookup_name))

    def _create_accumulators(self, block, parameters):
+        """Create the velocity accumulator for every parameter.
+
+        For an FP16 parameter with multi-precision enabled, an FP32 master
+        weight is created and the velocity accumulator is attached to that
+        master weight. For an FP16 parameter without multi-precision a
+        warning is issued and the accumulator is attached to the parameter
+        itself, as it is for all other parameters.
+
+        Args:
+            block: the block being optimized; must be a framework.Block.
+            parameters: iterable of parameter variables.
+        """
        assert isinstance(block, framework.Block)

        for p in parameters:
+            is_fp16 = p.dtype == core.VarDesc.VarType.FP16
+            if is_fp16 and self._multi_precision:
+                master_p = self._create_master_weight(p)
+                self._add_accumulator(self._velocity_acc_str, master_p)
+                continue
+            if is_fp16:
+                # Reaching here means multi_precision is off. The first
+                # literal ends with a space so the two sentences do not run
+                # together when the literals are concatenated.
+                warnings.warn(
+                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
+                    "Consider using multi_precision=True option of the Momentum optimizer."
+                )
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
@@ -136,6 +200,10 @@ class Momentum(Optimizer):

        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
+        find_master = self._multi_precision and param_and_grad[
+            0].dtype == core.VarDesc.VarType.FP16
+        master_weight = (self._master_weights[param_and_grad[0].name]
+                         if find_master else None)
        lr = self._create_param_lr(param_and_grad)

        if framework.in_dygraph_mode():
@@ -151,7 +219,9 @@ class Momentum(Optimizer):
            "mu": self._momentum,
            "use_nesterov": self._use_nesterov,
            "regularization_method": self._regularization_method,
-            "regularization_coeff": self._regularization_coeff
+            "regularization_coeff": self._regularization_coeff,
+            "multi_precision": find_master,
+            "rescale_grad": self._rescale_grad
        }
        inputs = {
            "Param": [param_and_grad[0]],
@@ -159,11 +229,15 @@ class Momentum(Optimizer):
            "Velocity": [velocity_acc],
            "LearningRate": [lr]
        }
-
        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc]
        }
+
+        if find_master:
+            inputs["MasterParam"] = master_weight
+            outputs["MasterParamOut"] = master_weight
+
        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,

--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
+# Collect every test_*.py in this directory as a test name (".py" stripped).
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

+# test_multi_precision_fp16_train is taken out of the generic list because it
+# is registered separately below with extra environment flags.
+list(REMOVE_ITEM TEST_OPS test_multi_precision_fp16_train)
+
 foreach(src ${TEST_OPS})
        py_test(${src} SRCS ${src}.py)
 endforeach()
+
+# Register the pure FP16 training test with its cuDNN / conv workspace flags.
+py_test_modules(test_multi_precision_fp16_train MODULES test_multi_precision_fp16_train ENVS FLAGS_cudnn_deterministic=true FLAGS_cudnn_batchnorm_spatial_persistent=true FLAGS_conv_workspace_size_limit=1000)
+
+# Per-test timeouts.
 set_tests_properties(test_image_classification_fp16 PROPERTIES TIMEOUT 120)
 set_tests_properties(test_weight_decay_extend PROPERTIES TIMEOUT 120)
+set_tests_properties(test_multi_precision_fp16_train PROPERTIES TIMEOUT 120)
--- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
+++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import unittest
+import numpy as np
+from paddle.fluid.contrib.mixed_precision.fp16_utils import cast_model_to_fp16
+from paddle.fluid.contrib.mixed_precision.fp16_utils import cast_parameters_to_fp16
+
+paddle.enable_static()
+
+
+def resnet_cifar10(input, depth=32):
+    """
+    Build a residual network body and return its pooled output.
+
+    The network is one conv+batch_norm layer followed by three stages of
+    ``(depth - 2) // 6`` basic blocks each, with 16, 32 and 64 output
+    channels and strides 1, 2 and 2, and a final average pooling layer.
+
+    Args:
+        input: input variable fed to the first convolution.
+        depth (int): network depth; ``(depth - 2)`` must be divisible by 6.
+
+    Returns:
+        The output variable of the final ``pool2d`` layer.
+    """
+
+    # conv2d without activation followed by batch_norm carrying the activation.
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    # Skip connection: a 1x1 conv+bn projection when the channel count
+    # changes, otherwise the input itself.
+    def shortcut(input, ch_in, ch_out, stride):
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+
+    # Two 3x3 conv+bn layers added to the shortcut, with relu on the sum.
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+    # Stack `count` blocks; only the first one uses `stride` and changes the
+    # channel count, the rest keep ch_out channels with stride 1.
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+
+    assert (depth - 2) % 6 == 0
+    # Number of basic blocks in each of the three stages.
+    n = (depth - 2) // 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+
+
+def compile(program, loss_name=None):
+    """
+    Wrap ``program`` in a data-parallel ``CompiledProgram``.
+
+    The execution strategy uses a single thread and drops the scope every
+    10000 iterations; the build strategy enables the bn+act, elementwise
+    add+act and bn+add+act fusions.
+
+    Args:
+        program: the program to compile.
+        loss_name (str, optional): forwarded to ``with_data_parallel``.
+
+    Returns:
+        The compiled program.
+    """
+    exec_strategy = paddle.static.ExecutionStrategy()
+    exec_strategy.num_threads = 1
+    exec_strategy.num_iteration_per_drop_scope = 10000
+
+    build_strategy = paddle.static.BuildStrategy()
+    for fusion_flag in ('fuse_bn_act_ops', 'fuse_elewise_add_act_ops',
+                        'fuse_bn_add_act_ops'):
+        setattr(build_strategy, fusion_flag, True)
+
+    return paddle.static.CompiledProgram(program).with_data_parallel(
+        loss_name=loss_name,
+        build_strategy=build_strategy,
+        exec_strategy=exec_strategy)
+
+
+def train(use_pure_fp16=True, use_nesterov=False):
+    """
+    Build a ResNet classifier on CIFAR-10, train it for a few batches on
+    ``CUDAPlace(0)`` and evaluate it on a few test batches.
+
+    Args:
+        use_pure_fp16 (bool): when True the model is converted with
+            ``cast_model_to_fp16``, the parameter values with
+            ``cast_parameters_to_fp16``, and the optimizer is created with
+            ``multi_precision`` enabled.
+        use_nesterov (bool): forwarded to the Momentum optimizer.
+
+    Returns:
+        tuple(list, list): the training losses of the first five batches and
+        the test losses of at most five test batches. If the training reader
+        yields fewer than five batches, no return statement is reached and
+        the result is None.
+    """
+    classdim = 10
+    data_shape = [3, 32, 32]
+    BATCH_SIZE = 128
+    PASS_NUM = 1
+
+    train_program = fluid.Program()
+    startup_prog = fluid.Program()
+    # Fixed seeds so that the FP16 and FP32 runs are comparable.
+    train_program.random_seed = 123
+    startup_prog.random_seed = 456
+    with fluid.program_guard(train_program, startup_prog):
+        images = fluid.layers.data(
+            name='pixel', shape=data_shape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        net = resnet_cifar10(images, 32)
+
+        logits = fluid.layers.fc(input=net, size=classdim, act="softmax")
+        if use_pure_fp16:
+            # Only the ops created so far are converted; the logits are cast
+            # back to float32 before the loss ops are added.
+            cast_model_to_fp16(fluid.default_main_program())
+            logits_fp32 = fluid.layers.cast(x=logits, dtype="float32")
+        else:
+            logits_fp32 = logits
+        cost = fluid.layers.softmax_with_cross_entropy(
+            logits_fp32, label, return_softmax=False)
+        sum_cost = fluid.layers.reduce_sum(cost)
+
+        # Test program
+        # (cloned before minimize() is called on the training program)
+        test_program = train_program.clone(for_test=True)
+
+        # The loss is a sum over the batch, so gradients are rescaled by
+        # 1 / BATCH_SIZE inside the optimizer.
+        optimizer = fluid.contrib.optimizer.Momentum(
+            learning_rate=0.001,
+            momentum=0.9,
+            use_nesterov=use_nesterov,
+            regularization=fluid.regularizer.L2Decay(1e-4),
+            multi_precision=use_pure_fp16,
+            rescale_grad=1.0 / BATCH_SIZE)
+
+        optimizer.minimize(sum_cost)
+
+    # no shuffle for unit test
+    train_reader = paddle.batch(
+        paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+
+    # NOTE(review): the body uses the enclosing `train_program` rather than
+    # the `main_program` argument; the only call site passes `train_program`.
+    def train_loop(main_program):
+        exe.run(startup_prog)
+        if use_pure_fp16:
+            # Convert the freshly initialized parameter values in the scope.
+            cast_parameters_to_fp16(place, train_program, fluid.global_scope())
+        compiled_program = compile(train_program, sum_cost.name)
+        loss = 0.0
+        for pass_id in range(PASS_NUM):
+            train_loss_list = []
+            for batch_id, data in enumerate(train_reader()):
+                loss, = exe.run(compiled_program,
+                                feed=feeder.feed(data),
+                                fetch_list=[sum_cost])
+                print('PassID {0:1}, Train Batch ID {1:04}, train loss {2:2.4}'.
+                      format(pass_id, batch_id + 1, float(loss)))
+                train_loss_list.append(float(loss))
+
+                # After the fifth training batch, evaluate and return.
+                if batch_id >= 4:  # For speeding up CI
+                    test_loss_list = []
+                    for tid, test_data in enumerate(test_reader()):
+                        loss_t, = exe.run(program=test_program,
+                                          feed=feeder.feed(test_data),
+                                          fetch_list=[sum_cost])
+                        test_loss_list.append(float(loss_t))
+                        print(
+                            'PassID {0:1}, Test Batch ID {1:04}, test loss {2:2.4}'.
+                            format(pass_id, tid + 1, float(loss_t)))
+                        if tid >= 4:
+                            break  # For speeding up CI
+                    return train_loss_list, test_loss_list
+
+    return train_loop(train_program)
+
+
+class TestImageMultiPrecision(unittest.TestCase):
+    """Compares pure FP16 training against FP32 training on ResNet."""
+
+    def test_resnet_pure_fp16(self):
+        """Train and test losses of both precisions must be close.
+
+        The check is skipped when Paddle is not compiled with CUDA and is
+        run once without and once with Nesterov momentum.
+        """
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        def do_test(use_nesterov=False):
+            suffix = "with Nesterov" if use_nesterov else "without Nesterov"
+            runs = (
+                (True, "-----------------FP16 Train {}-----------------"),
+                (False, "-----------------FP32 Train {}-----------------"), )
+            train_losses, test_losses = [], []
+            for pure_fp16, banner in runs:
+                # Each precision gets its own scope and default programs.
+                with self.scope_prog_guard():
+                    print(banner.format(suffix))
+                    train_loss, test_loss = train(
+                        use_pure_fp16=pure_fp16, use_nesterov=use_nesterov)
+                train_losses.append(train_loss)
+                test_losses.append(test_loss)
+
+            # Each entry holds the [fp16, fp32] losses and the failure text.
+            comparisons = (
+                (train_losses, 'Failed to train in pure FP16.'),
+                (test_losses, 'Failed to test in pure FP16.'), )
+            for (fp16_loss, fp32_loss), failure_msg in comparisons:
+                self.assertTrue(
+                    np.allclose(
+                        np.array(fp16_loss),
+                        np.array(fp32_loss),
+                        rtol=1e-02,
+                        atol=1e-05,
+                        equal_nan=True),
+                    msg=failure_msg)
+
+        for nesterov in (False, True):
+            do_test(use_nesterov=nesterov)
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        """Run the guarded body with a new scope and new default programs."""
+        with fluid.scope_guard(fluid.core.Scope()):
+            with fluid.program_guard(fluid.Program(), fluid.Program()):
+                yield
+
+
+class TestAmpWithNonIterableDataLoader(unittest.TestCase):
+    """Runs ``cast_model_to_fp16`` on a program fed by a non-iterable loader."""
+
+    def decorate_with_data_loader(self):
+        """Build a program with a DataLoader and a Switch, then cast it.
+
+        The ``mul`` ops of the global block get FP32 ``in_dtype``,
+        ``out_dtype`` and ``dtype`` attributes before the conversion.
+        """
+        main_prog = paddle.static.Program()
+        start_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, start_prog), \
+                paddle.fluid.unique_name.guard():
+            pixels = fluid.layers.data(
+                name='image', shape=[3, 224, 224], dtype='float32')
+            target = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            loader = fluid.io.DataLoader.from_generator(
+                feed_list=[pixels, target],
+                capacity=4,
+                iterable=False,
+                use_double_buffer=False)
+            const_zero = fluid.layers.fill_constant(
+                shape=[1], dtype='int64', value=0)
+            const_one = fluid.layers.fill_constant(
+                shape=[1], dtype='int64', value=1)
+            with fluid.layers.control_flow.Switch() as switch:
+                with switch.case(target != const_zero):
+                    fluid.layers.assign(input=const_zero, output=target)
+                with switch.default():
+                    fluid.layers.assign(input=const_one, output=target)
+
+            features = resnet_cifar10(pixels)
+            logits = fluid.layers.fc(input=features, size=10, act="softmax")
+
+        fp32 = fluid.core.VarDesc.VarType.FP32
+        for op in main_prog.global_block().ops:
+            if op.type != "mul":
+                continue
+            for attr_name in ('in_dtype', 'out_dtype', 'dtype'):
+                op._set_attr(attr_name, fp32)
+
+        cast_model_to_fp16(main_prog)
+
+    def test_non_iterable_dataloader(self):
+        """The conversion must complete without raising."""
+        self.decorate_with_data_loader()
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/pserver_pass.py
@@ -246,7 +246,8 @@ def _append_pserver_ops(optimize_block, opt_op, endpoint, grad_to_block_id,
    for key in opt_op.input_names:
        new_shape = None
        if key in [
-                "Param", "Grad", "LearningRate", "Beta1Tensor", "Beta2Tensor"
+                "Param", "Grad", "LearningRate", "MasterParam", "Beta1Tensor",
+                "Beta2Tensor"
        ]:
            continue
        var = origin_program.global_block().vars[opt_op.input(key)[0]]

--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -59,7 +59,7 @@ class TestMomentumOp1(OpTest):
        param = np.random.random((123, 321)).astype(self.dtype)
        grad = np.random.random((123, 321)).astype(self.dtype)
        velocity = np.zeros((123, 321)).astype(self.dtype)
-        learning_rate = np.array([0.001]).astype(self.dtype)
+        learning_rate = np.array([0.001]).astype(np.float32)
        mu = 0.0001
        use_nesterov = False

@@ -217,7 +217,7 @@ class TestSparseMomentumOp(unittest.TestCase):
                                        0.0).astype("float32")
        velocity_out.set(velocity_out_np_array, place)

-        # create and initialize LeraningRate Variable
+        # create and initialize LearningRate Variable
        lr = scope.var('LearningRate').get_tensor()
        lr_array = np.full((1), 2.0).astype("float32")
        lr.set(lr_array, place)
@@ -278,6 +278,115 @@ class TestSparseMomentumOp2(TestSparseMomentumOp):
        self.use_nesterov = True


+class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase):
+    """Runs the momentum op with a SelectedRows gradient, FP16 parameters and
+    an FP32 master parameter, and compares the result with a numpy reference.
+    """
+
+    def setUp(self):
+        self.init_args()
+        # An empty regularization_method is passed to the op together with a
+        # coefficient of 1.0.
+        self.regularization_method = ""
+        self.regularization_coeff = 1.0
+
+    def check_with_place(self, place):
+        """Run the op on ``place`` and assert ParamOut/VelocityOut exactly
+        equal the values computed by ``calculate_momentum_by_numpy``.
+        """
+        scope = core.Scope()
+        # problem sizes and optimizer hyper-parameters
+        height = 10
+        rows = [0, 4, 7]
+        row_numel = 12
+        mu = 1.0
+        use_nesterov = self.use_nesterov
+        regularization_method = self.regularization_method
+        regularization_coeff = self.regularization_coeff
+
+        # create and initialize Param Variable
+        param_array = np.full((height, row_numel), 5.0).astype("float32")
+        param_out_array = np.full((height, row_numel), 0.0).astype("float32")
+
+        # Param / ParamOut are stored as float16
+        param = scope.var('Param').get_tensor()
+        param.set(param_array.astype("float16"), place)
+        param_out = scope.var("ParamOut").get_tensor()
+        param_out.set(param_out_array.astype("float16"), place)
+
+        # MasterParam / MasterParamOut hold the same values in float32
+        master_param = scope.var('MasterParam').get_tensor()
+        master_param.set(param_array, place)
+        master_param_out = scope.var("MasterParamOut").get_tensor()
+        master_param_out.set(param_out_array, place)
+
+        # create and initialize Grad Variable (SelectedRows, float16 values)
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        grad_np_array = np.ones((len(rows), row_numel)).astype("float32")
+        grad_np_array[0, 0] = 2.0
+        grad_np_array[2, 8] = 4.0
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(grad_np_array.astype("float16"), place)
+
+        # create and initialize Velocity Variable (float32)
+        velocity = scope.var('Velocity').get_tensor()
+        velocity_np_array = np.ones((height, row_numel)).astype("float32")
+        velocity.set(velocity_np_array, place)
+        velocity_out = scope.var('VelocityOut').get_tensor()
+        velocity_out_np_array = np.full((height, row_numel),
+                                        0.0).astype("float32")
+        velocity_out.set(velocity_out_np_array, place)
+
+        # create and initialize LearningRate Variable
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.full((1), 2.0).astype("float32")
+        lr.set(lr_array, place)
+
+        # create and run operator
+        op = Operator(
+            "momentum",
+            Param='Param',
+            Grad='Grad',
+            Velocity='Velocity',
+            MasterParam='MasterParam',
+            ParamOut='ParamOut',
+            VelocityOut='VelocityOut',
+            MasterParamOut='MasterParamOut',
+            LearningRate='LearningRate',
+            mu=mu,
+            use_nesterov=use_nesterov,
+            regularization_method=regularization_method,
+            regularization_coeff=regularization_coeff,
+            multi_precision=True,
+            rescale_grad=1.0)
+        op.run(scope, place)
+
+        # get and compare result
+        # (only ParamOut and VelocityOut are read back; MasterParamOut is
+        # not checked)
+        param_out_np_array = np.array(param_out)
+        velocity_out_np_array = np.array(velocity_out)
+
+        # Expand the sparse gradient rows into a dense float32 gradient.
+        _grad_np_array = np.full((height, row_numel), 0.0).astype("float32")
+        for i in range(len(rows)):
+            _grad_np_array[rows[i]] = grad_np_array[i]
+
+        _param = param_array
+
+        # Reference result computed in numpy from the float32 inputs.
+        _param_out, _velocity_out = calculate_momentum_by_numpy(
+            param=_param,
+            grad=_grad_np_array,
+            mu=mu,
+            velocity=velocity_np_array,
+            use_nesterov=use_nesterov,
+            learning_rate=lr_array,
+            regularization_method=regularization_method,
+            regularization_coeff=regularization_coeff)
+
+        # Exact element-wise equality is required.
+        self.assertTrue((_velocity_out == velocity_out_np_array).all())
+        self.assertTrue((_param_out == param_out_np_array).all())
+
+    def init_args(self):
+        # Overridden by subclasses to change the tested variant.
+        self.use_nesterov = False
+
+    def test_sparse_momentum(self):
+        # The check only runs when Paddle is compiled with CUDA.
+        if core.is_compiled_with_cuda():
+            self.check_with_place(fluid.CUDAPlace(0))
+
+
+class TestSparseMomentumOpWithMultiPrecision2(
+        TestSparseMomentumOpWithMultiPrecision):
+    """Same multi-precision sparse momentum check with Nesterov enabled."""
+
+    def init_args(self):
+        self.use_nesterov = True
+
+
 class TestMomentumV2(unittest.TestCase):
    def test_momentum_dygraph(self):
        paddle.disable_static()
@@ -334,7 +443,7 @@ class TestMomentumOpWithDecay(OpTest):
        param = np.random.random((123, 321)).astype(self.dtype)
        grad = np.random.random((123, 321)).astype(self.dtype)
        velocity = np.zeros((123, 321)).astype(self.dtype)
-        learning_rate = np.array([0.001]).astype(self.dtype)
+        learning_rate = np.array([0.001]).astype(np.float32)
        mu = 0.0001
        use_nesterov = self.use_nesterov
        regularization_method = self.regularization_method