Unverified commit adaeee4d, authored by Z zhangbo9674, committed by GitHub

[AMP] Support pure fp16 training mode for dygraph (#35521)

* add pure fp16 major function in auto_cast & tracer

* support master weight in dygraph for pure fp16

* check mixed dtypes of fp16 & fp32 for check_finite_and_unscale op

* change pure fp16 function name

* fix some bugs in auto_cast

* refine auto_cast interface logic

* add param _casted_by_pure_fp16 for class Layer

* support state_dict hook for saving the model in a user-appointed dtype in pure_fp16_decorator

* refine pure_fp16_decorator as decorator

* add unittest

* add comment

* add comment

* support recompute

* add comment for auto_cast and decorator

* support to_static_state_dict for paddle.jit.save

* remove the limit on the number of models and optimizers

* add lookup_table in black_list

* fix momentum and layer state_dict

* fix bug in layer state_dict

* fix bug in layer state_dict_helper

* refine unittest

* refine test_momentum_op

* refine interface and some code

* refine amp_decorator interface

* refine pure fp16 interface

* refine master weight interface
Parent 68ae6345
......@@ -117,7 +117,7 @@ static inline std::shared_ptr<imperative::VarBase> CastToType(
imperative::NameVarBaseMap outs = {{"Out", {out}}};
{
AutoCastGuard guard(tracer, false);
AutoCastGuard guard(tracer, 0);
tracer->TraceOp("cast", ins, outs, std::move(attrs));
}
......@@ -225,5 +225,30 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type,
return new_ins;
}
NameVarBaseMap CastPureFp16Inputs(const std::string& op_type,
const NameVarBaseMap& ins) {
NameVarBaseMap new_ins(ins);
auto dst_type = framework::proto::VarType::FP16;
if (AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count(op_type) ||
AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) {
dst_type = framework::proto::VarType::FP32;
}
for (auto& pair : new_ins) {
if ((op_type == "batch_norm" || op_type == "layer_norm" ||
op_type == "sync_batch_norm") &&
pair.first != "X") {
continue;
}
VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
<< GetDtypeStr(*pair.second.cbegin()) << " to "
<< framework::DataTypeToString(dst_type);
for (auto& var : pair.second) {
var = (dst_type == framework::proto::VarType::FP32 ? CastToFP32(var)
: CastToFP16(var));
}
}
return new_ins;
}
} // namespace imperative
} // namespace paddle
......@@ -63,15 +63,16 @@ std::ostream& operator<<(std::ostream& os, AmpOperators& ops);
// NOTE(zhiqiu): AutoCastGuard is used for RAII.
class AutoCastGuard {
public:
AutoCastGuard(std::shared_ptr<Tracer> tracer, bool guard_mode)
AutoCastGuard(std::shared_ptr<Tracer> tracer, int guard_level)
: tracer_(tracer) {
pre_mode_ = tracer_->IsAutoCastEnabled();
if (pre_mode_ != guard_mode) {
tracer_->SetEnableAutoCast(guard_mode);
pre_amp_level_ = tracer_->AMPLevel();
if (pre_amp_level_ != guard_level) {
tracer_->SetAMPLevel(guard_level);
}
}
~AutoCastGuard() { tracer_->SetEnableAutoCast(pre_mode_); }
~AutoCastGuard() { tracer_->SetAMPLevel(pre_amp_level_); }
// forbid copy and operator=
AutoCastGuard(const AutoCastGuard& guard) = delete;
......@@ -79,11 +80,14 @@ class AutoCastGuard {
private:
std::shared_ptr<Tracer> tracer_;
bool pre_mode_;
int pre_amp_level_;
};
NameVarBaseMap AutoCastInputs(const std::string& op_type,
const NameVarBaseMap& ins);
NameVarBaseMap CastPureFp16Inputs(const std::string& op_type,
const NameVarBaseMap& ins);
} // namespace imperative
} // namespace paddle
......@@ -176,9 +176,12 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
: attr_checker->GetDefaultAttrMap();
NameVarBaseMap new_ins = ins;
if (enable_autocast_) {
if (amp_level_ == 1) {
VLOG(5) << "Auto mixed precision run operator: " << type;
new_ins = AutoCastInputs(type, ins);
} else if (amp_level_ == 2) {
VLOG(5) << "Pure fp16 run operator: " << type;
new_ins = CastPureFp16Inputs(type, ins);
}
try {
......
......@@ -105,9 +105,9 @@ class Tracer {
void SetHasGrad(bool has_grad) { has_grad_ = has_grad; }
void SetEnableAutoCast(bool enabled) { enable_autocast_ = enabled; }
void SetAMPLevel(int level) { amp_level_ = level; }
bool IsAutoCastEnabled() const { return enable_autocast_; }
int AMPLevel() const { return amp_level_; }
paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists(
const platform::Place& place);
......@@ -118,9 +118,9 @@ class Tracer {
bool enable_program_desc_tracing_{false};
std::unique_ptr<UniqueNameGenerator> generator_;
platform::Place expected_place_;
bool enable_autocast_{false};
GarbageCollectorMap gcs_;
static thread_local bool has_grad_;
int amp_level_{0};
};
// To access static variable current_tracer
......
......@@ -1947,8 +1947,8 @@ void BindImperative(py::module *m_ptr) {
.def_property("_enable_program_desc_tracing",
&imperative::Tracer::IsProgramDescTracingEnabled,
&imperative::Tracer::SetEnableProgramDescTracing)
.def_property("_enable_autocast", &imperative::Tracer::IsAutoCastEnabled,
&imperative::Tracer::SetEnableAutoCast)
.def_property("_amp_level", &imperative::Tracer::AMPLevel,
&imperative::Tracer::SetAMPLevel)
.def_property("_has_grad", &imperative::Tracer::HasGrad,
&imperative::Tracer::SetHasGrad)
.def_property(
......
......@@ -63,11 +63,15 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"moving_average_abs_max_scale", {"X", "InAccum", "InState"}},
{"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}},
{"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}},
{"momentum", {"Param", "Grad", "Velocity", "LearningRate"}},
{"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}},
{"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
{"matrix_rank", {"X", "TolTensor"}}};
{"matrix_rank", {"X", "TolTensor"}},
{"adam",
{"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow",
"Beta2Pow", "MasterParam"}},
};
// NOTE(zhiqiu): Like op_ins_map.
// Commonly, the outputs in auto-generated OP function are determined by the
......@@ -97,12 +101,15 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"Out", "OutScale", "OutAccum", "OutState"}},
{"multiclass_nms3", {"Out", "NmsRoisNum"}},
{"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
{"momentum", {"ParamOut", "VelocityOut"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"rnn", {"DropoutState", "Reserve", "Out", "State"}},
{"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"run_program", {"DOut"}},
{"adam",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
};
// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
......@@ -119,13 +126,14 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"sgd", {"ParamOut"}},
{"adam",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut",
"MasterParamOut"}},
{"adamw",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"average_accumulates",
{"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates",
"out_old_num_accumulates", "out_num_updates"}},
{"momentum", {"ParamOut", "VelocityOut"}},
{"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}},
{"sparse_momentum", {"ParamOut", "VelocityOut"}},
{"batch_norm", {"MeanOut", "VarianceOut"}},
{"sync_batch_norm", {"MeanOut", "VarianceOut"}},
......
......@@ -14,5 +14,6 @@
from .auto_cast import auto_cast # noqa: F401
from .grad_scaler import GradScaler # noqa: F401
from .auto_cast import decorate # noqa: F401
__all__ = ['auto_cast', 'GradScaler']
__all__ = ['auto_cast', 'GradScaler', 'decorate']
......@@ -13,18 +13,22 @@
# limitations under the License.
from paddle.fluid.dygraph.amp import amp_guard
from paddle.fluid.dygraph.amp import amp_decorate
__all__ = []
def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
def auto_cast(enable=True,
custom_white_list=None,
custom_black_list=None,
level='O1'):
"""
Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
imperative mode.
imperative mode. It is also used together with `decorate` to achieve pure fp16 in imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
......@@ -34,6 +38,8 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
level(str, optional): Auto mixed precision level. Accepted values are 'O1' and 'O2': 'O1' represents mixed precision, where the input data type of each operator is cast according to the white_list and black_list;
'O2' represents pure fp16, where all operator parameters and input data are cast to fp16, except for operators in the black_list, operators without an fp16 kernel, and batch norm. Default is 'O1' (amp).
Examples:
......@@ -61,6 +67,67 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}):
c = a + b
print(c.dtype) # FP16
with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'):
d = a + b
print(d.dtype) # FP16
"""
return amp_guard(enable, custom_white_list, custom_black_list, level)
def decorate(models,
optimizers=None,
level='O1',
master_weight=None,
save_dtype=None):
"""
Decorate models and optimizers for auto-mixed-precision. When level is 'O1' (amp), the decorator does nothing.
When level is 'O2' (pure fp16), the decorator casts all parameters of the models to FP16, except those of BatchNorm and LayerNorm layers.
Commonly, it is used together with `auto_cast` to achieve pure fp16 training in imperative mode.
Args:
models(Layer|list of Layer): The models defined by the user; must be either a single model or a list of models.
optimizers(Optimizer|list of Optimizer, optional): The optimizers defined by the user; must be either a single optimizer or a list of optimizers. Default is None.
level(str, optional): Auto mixed precision level. Accepted values are 'O1' and 'O2': 'O1' represents mixed precision, where the decorator does nothing;
'O2' represents pure fp16, where the decorator casts all parameters of the models to FP16, except those of BatchNorm and LayerNorm. Default is 'O1' (amp).
master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, the optimizer uses multi-precision in O2 level. Default is None.
save_dtype(float, optional): The dtype of the model parameters saved by `paddle.save` or `paddle.jit.save`; it should be float16, float32, float64 or None.
save_dtype does not change the model parameters' dtype, it only changes the state_dict dtype. When save_dtype is None, the save dtype is the same as the model dtype. Default is None.
Examples:
.. code-block:: python
# required: gpu
# Demo1: single model and optimizer:
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer = paddle.optimizer.SGD(parameters=model.parameters())
model, optimizer = paddle.amp.decorate(models=model, optimizers=optimizer, level='O2')
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = model(data)
print(output.dtype) # FP16
# required: gpu
# Demo2: multi models and optimizers:
model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())
models, optimizers = paddle.amp.decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')
data = paddle.rand([10, 3, 32, 32])
with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = models[0](data)
output2 = models[1](data)
print(output.dtype) # FP16
print(output2.dtype) # FP16
"""
return amp_guard(enable, custom_white_list, custom_black_list)
return amp_decorate(models, optimizers, level, master_weight, save_dtype)
......@@ -198,7 +198,11 @@ class _HPRecomputeFunction(PyLayer):
# TODO support AMP
tracer = framework._dygraph_tracer()
ctx.is_fw_autocast = tracer._enable_autocast
if tracer._amp_level == 0:
ctx.is_fw_autocast = False
else:
ctx.is_fw_autocast = True
ctx.amp_mode = 'O1'
ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()
with paddle.no_grad():
......@@ -258,7 +262,8 @@ class _HPRecomputeFunction(PyLayer):
with paddle.amp.auto_cast(
enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
custom_black_list=ctx.amp_black_list):
custom_black_list=ctx.amp_black_list,
level=ctx.amp_mode):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
......
......@@ -98,7 +98,11 @@ class RecomputeFunction(PyLayer):
# TODO support AMP
tracer = framework._dygraph_tracer()
ctx.is_fw_autocast = tracer._enable_autocast
if tracer._amp_level == 0:
ctx.is_fw_autocast = False
else:
ctx.is_fw_autocast = True
ctx.amp_mode = 'O1'
ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()
with paddle.no_grad():
......@@ -128,14 +132,16 @@ class RecomputeFunction(PyLayer):
with paddle.amp.auto_cast(
enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
custom_black_list=ctx.amp_black_list):
custom_black_list=ctx.amp_black_list,
level=ctx.amp_mode):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
else:
with paddle.amp.auto_cast(
enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
custom_black_list=ctx.amp_black_list):
custom_black_list=ctx.amp_black_list,
level=ctx.amp_mode):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
......
......@@ -203,19 +203,21 @@ class Momentum(Optimizer):
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
if framework.in_dygraph_mode():
_, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
param_and_grad[0], velocity_acc, 'mu', self._momentum,
'use_nesterov', self._use_nesterov, 'regularization_method',
self._regularization_method, 'regularization_coeff',
self._regularization_coeff)
return None
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
if framework.in_dygraph_mode():
_, _, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov,
'regularization_method', self._regularization_method,
'regularization_coeff', self._regularization_coeff,
'multi_precision', find_master)
return None
attrs = {
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
......
......@@ -19,8 +19,13 @@ import contextlib
from paddle.fluid.framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, dygraph_only, set_flags, get_flags
import warnings
import copy
import functools
import paddle
import operator
import types
import paddle.fluid as fluid
__all__ = ['amp_guard']
__all__ = ['amp_guard', 'amp_decorate']
# The set of ops that support fp16 calculation and are considered numerically-
# safe and performance-critical. These ops are always converted to fp16.
......@@ -64,15 +69,22 @@ AMP_RELATED_FLAGS_SETTING = {
'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
}
PURE_FP16_BLACK_LIST = {' '}
PURE_FP16_WHITE_LIST = {'lookup_table', 'lookup_table_v2'}
#NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list
# The reason why not use AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode.
def _update_list(custom_white_list, custom_black_list):
def _update_list(custom_white_list, custom_black_list, level='O1'):
"""
Update black and white list according to users' custom list.
"""
_white_list = copy.copy(WHITE_LIST)
_black_list = copy.copy(BLACK_LIST)
if level == 'O1':
_white_list = copy.copy(WHITE_LIST)
_black_list = copy.copy(BLACK_LIST)
else:
_white_list = copy.copy(PURE_FP16_WHITE_LIST)
_black_list = copy.copy(PURE_FP16_BLACK_LIST)
if custom_white_list and custom_black_list:
for op_name in custom_white_list:
if op_name in custom_black_list:
......@@ -97,28 +109,111 @@ def _in_amp_guard():
"""
tracer = _dygraph_tracer()
if tracer:
return tracer._enable_autocast
if tracer._amp_level == 1:
return True
else:
return False
else:
return False
@dygraph_only
def pure_fp16_initialize(enable_pure_fp16, models, optimizers):
if not enable_pure_fp16:
return models, optimizers
for idx in range(len(models)):
for layer in models[idx].sublayers(include_self=True):
layer._casted_by_pure_fp16 = True
if len(layer._sub_layers) == 0:
if (layer._dtype == 'float16') or isinstance(layer, (
paddle.nn.BatchNorm, paddle.nn.LayerNorm)):
continue
layer.to(dtype='float16')
for idx_opt in range(len(optimizers)):
# update _param_groups
if getattr(optimizers[idx_opt], '_param_groups', None) and isinstance(
optimizers[idx_opt]._param_groups[0], dict):
for param_group in optimizers[idx_opt]._param_groups:
for i, param in enumerate(param_group['params']):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
param_group['params'][
i] = layer._parameters_transform_map[id(
param)][0]
for param_group in optimizers[idx_opt]._parameter_list:
params = param_group['params']
for i, param in enumerate(params):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(
include_self=True):
if id(param) in layer._parameters_transform_map:
params[i] = layer._parameters_transform_map[id(
param)][0]
# update _parameter_list
else:
for i, param in enumerate(optimizers[idx_opt]._parameter_list):
for idx_model in range(len(models)):
for layer in models[idx_model].sublayers(include_self=True):
if id(param) in layer._parameters_transform_map:
optimizers[idx_opt]._parameter_list[
i] = layer._parameters_transform_map[id(param)][
0]
if hasattr(optimizers[idx_opt], '_param_groups'):
optimizers[idx_opt]._param_groups[
i] = layer._parameters_transform_map[id(
param)][0]
return models, optimizers
def check_models(models):
for model in models:
if not isinstance(model, paddle.nn.Layer):
raise RuntimeError(
"Current train mode is pure fp16, models should be paddle.nn.Layer, but receive {}.".
format(type(model)))
def check_optimizers(optimizers):
for optimizer in optimizers:
if not isinstance(optimizer, (paddle.optimizer.Optimizer,
paddle.fluid.optimizer.Optimizer)):
raise RuntimeError(
"Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.".
format(type(optimizer)))
@signature_safe_contextmanager
@dygraph_only
def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
def amp_guard(enable=True,
custom_white_list=None,
custom_black_list=None,
level='O1'):
"""
:api_attr: imperative
Create a context which enables auto-mixed-precision(AMP) of operators executed in imperative mode.
Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in
imperative mode.
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
imperative mode. It is also used together with `amp_decorate` to achieve pure fp16 in imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
custom_white_list(set|list, optional): The custom white_list.
custom_black_list(set|list, optional): The custom black_list.
custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support
fp16 calculation and are considered numerically-safe and performance-critical. These ops
will be converted to fp16.
custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
level(str, optional): Auto mixed precision level. Accepted values are 'O1' and 'O2': 'O1' represents mixed precision, where the input data type of each operator is cast according to the white_list and black_list;
'O2' represents pure fp16, where all operator parameters and input data are cast to fp16, except for operators in the black_list, operators without an fp16 kernel, and batch norm. Default is 'O1' (amp).
Examples:
......@@ -139,6 +234,11 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
print(conv.dtype) # FP32
"""
if not (level in ['O1', 'O2']):
raise ValueError(
"level should be O1 or O2, O1 represent AMP train mode, O2 represent Pure fp16 train mode."
)
tracer = _dygraph_tracer()
if not tracer:
raise ValueError(
......@@ -151,17 +251,27 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
% tracer._expected_place)
enable = False
# use default white_list and black_list if no custom lists provided
_white_list = WHITE_LIST
_black_list = BLACK_LIST
if level == 'O1':
amp_level = 1
_white_list = WHITE_LIST
_black_list = BLACK_LIST
else:
amp_level = 2
_white_list = PURE_FP16_WHITE_LIST
_black_list = PURE_FP16_BLACK_LIST
if custom_white_list or custom_black_list:
_white_list, _black_list = _update_list(custom_white_list,
custom_black_list)
custom_black_list, level)
if not enable:
amp_level = 0
if tracer:
# enable auto_cast
original_enable = tracer._enable_autocast
tracer._enable_autocast = enable
original_amp_level = tracer._amp_level
tracer._amp_level = amp_level
# set amp op list
original_white_list, original_black_list = tracer._get_amp_op_list()
tracer._set_amp_op_list(_white_list, _black_list)
......@@ -179,6 +289,141 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
yield
finally:
if tracer:
tracer._enable_autocast = original_enable
tracer._amp_level = original_amp_level
tracer._set_amp_op_list(original_white_list, original_black_list)
# set_flags(original_flags)
class StateDictHook(object):
def __init__(self, save_dtype):
self._save_dtype = save_dtype
def __call__(self, state_dict):
for key in state_dict:
param = state_dict[key]
with fluid.dygraph.guard():
param_applied = paddle.cast(param, self._save_dtype)
param_applied.name = param.name
state_dict[key] = param_applied
@dygraph_only
def amp_decorate(models,
optimizers=None,
level='O1',
master_weight=None,
save_dtype=None):
"""
Decorate models and optimizers for auto-mixed-precision. When level is 'O1' (amp), the decorator does nothing.
When level is 'O2' (pure fp16), the decorator casts all parameters of the models to FP16, except those of BatchNorm and LayerNorm layers.
Commonly, it is used together with `amp_guard` to achieve pure fp16 training in imperative mode.
Args:
models(Layer|list of Layer): The models defined by the user; must be either a single model or a list of models.
optimizers(Optimizer|list of Optimizer, optional): The optimizers defined by the user; must be either a single optimizer or a list of optimizers. Default is None.
level(str, optional): Auto mixed precision level. Accepted values are 'O1' and 'O2': 'O1' represents mixed precision, where the decorator does nothing;
'O2' represents pure fp16, where the decorator casts all parameters of the models to FP16, except those of BatchNorm and LayerNorm. Default is 'O1' (amp).
master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, the optimizer uses multi-precision in O2 level. Default is None.
save_dtype(float, optional): The dtype of the model parameters saved by `paddle.save` or `paddle.jit.save`; it should be float16, float32, float64 or None.
save_dtype does not change the model parameters' dtype, it only changes the state_dict dtype. When save_dtype is None, the save dtype is the same as the model dtype. Default is None.
Examples:
.. code-block:: python
# required: gpu
# Demo1: single model and optimizer:
import paddle
import paddle.fluid as fluid
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer = paddle.optimizer.SGD(parameters=model.parameters())
model, optimizer = fluid.dygraph.amp_decorate(models=model, optimizers=optimizer, level='O2')
data = paddle.rand([10, 3, 32, 32])
with fluid.dygraph.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = model(data)
print(output.dtype) # FP16
# required: gpu
# Demo2: multi models and optimizers:
model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
optimizer2 = paddle.optimizer.Adam(parameters=model2.parameters())
models, optimizers = fluid.dygraph.amp_decorate(models=[model, model2], optimizers=[optimizer, optimizer2], level='O2')
data = paddle.rand([10, 3, 32, 32])
with fluid.dygraph.amp_guard(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = models[0](data)
output2 = models[1](data)
print(output.dtype) # FP16
print(output2.dtype) # FP16
"""
if not (level in ['O1', 'O2']):
raise ValueError(
"level should be O1 or O2, O1 represent AMP train mode, O2 represent Pure fp16 train mode."
)
if level == 'O1':
return models, optimizers
models_is_list = False
if isinstance(models, paddle.nn.Layer):
models_is_list = False
models = [models]
check_models(models)
elif isinstance(models, list):
check_models(models)
models_is_list = True
else:
raise TypeError(
"models must be either a single model or a list of models.")
optimizers_is_list = False
if isinstance(optimizers, (paddle.optimizer.Optimizer,
paddle.fluid.optimizer.Optimizer)):
optimizers_is_list = False
optimizers = [optimizers]
check_optimizers(optimizers)
elif isinstance(optimizers, list):
check_optimizers(optimizers)
optimizers_is_list = True
else:
raise TypeError(
"optimizers must be either a single optimizer or a list of optimizers."
)
models, optimizers = pure_fp16_initialize(
enable_pure_fp16=True, models=models, optimizers=optimizers)
# support master_weight
for idx_opt in range(len(optimizers)):
if hasattr(optimizers[idx_opt], '_multi_precision'):
if master_weight is False:
optimizers[idx_opt]._multi_precision = False
else:
optimizers[idx_opt]._multi_precision = True
if save_dtype is not None:
if not (save_dtype in ['float16', 'float32', 'float64']):
raise ValueError(
"save_dtype can only be float16 float32 or float64, but your input save_dtype is %s."
% save_dtype)
for idx in range(len(models)):
for layer in models[idx].sublayers(include_self=True):
layer.register_state_dict_hook(StateDictHook(save_dtype))
if models_is_list:
if optimizers_is_list:
return models, optimizers
else:
return models, optimizers[0]
else:
if optimizers_is_list:
return models[0], optimizers
else:
return models[0], optimizers[0]
......@@ -216,17 +216,45 @@ class AmpScaler(object):
if getattr(optimizer, '_param_groups', None) and isinstance(
optimizer._param_groups[0], dict):
param_grads = []
param_grads_fp16 = []
param_grads_fp32 = []
for group in optimizer._param_groups:
for param in group['params']:
if param._grad_ivar() is not None:
param_grads.append(param._grad_ivar())
if param._grad_ivar(
).dtype == core.VarDesc.VarType.FP16:
param_grads_fp16.append(param._grad_ivar())
else:
param_grads_fp32.append(param._grad_ivar())
else:
param_grads = [
param._grad_ivar() for param in optimizer._parameter_list
if param._grad_ivar() is not None
]
_C_ops.check_finite_and_unscale(param_grads, self._scale, param_grads,
self._found_inf)
param_grads_fp16 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16
)
]
param_grads_fp32 = [
param._grad_ivar() for param in optimizer._parameter_list
if (param._grad_ivar() is not None
) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32
)
]
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
if len(param_grads_fp16):
_C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
param_grads_fp16,
temp_found_inf_fp16)
if len(param_grads_fp32):
_C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
param_grads_fp32,
temp_found_inf_fp32)
self._found_inf = temp_found_inf_fp16 or temp_found_inf_fp32
def _update(self):
"""
......
......@@ -779,10 +779,11 @@ def save(layer, path, input_spec=None, **configs):
dygraph_state_dict = None
if isinstance(inner_layer, Layer):
dygraph_state_dict = inner_layer.state_dict()
dygraph_state_dict = inner_layer.to_static_state_dict()
elif isinstance(attr_func, StaticFunction):
if attr_func._class_instance:
dygraph_state_dict = attr_func._class_instance.state_dict()
dygraph_state_dict = attr_func._class_instance.to_static_state_dict(
)
if dygraph_state_dict:
# NOTE(chenweihang): we maintain the mapping of variable name to
......@@ -790,15 +791,19 @@ def save(layer, path, input_spec=None, **configs):
# saved to inference program may not need by dygraph Layer,
# we only record the state_dict variable's structured name
state_names_dict = dict()
state_var_dict = dict()
for structured_name, var in six.iteritems(dygraph_state_dict):
state_names_dict[var.name] = structured_name
state_var_dict[var.name] = var
# 3. share parameters from Layer to scope & record var info
for param_or_buffer in concrete_program.parameters:
# share to scope
param_or_buffer_tensor = scope.var(
param_or_buffer.name).get_tensor()
src_tensor = param_or_buffer.value().get_tensor()
#src_tensor = param_or_buffer.value().get_tensor()
src_tensor = state_var_dict[param_or_buffer.name].value(
).get_tensor()
param_or_buffer_tensor._share_data_with(src_tensor)
# record var info
if param_or_buffer.name not in extra_var_info:
......
......@@ -121,6 +121,13 @@ class Layer(core.Layer):
self._forward_pre_hooks = collections.OrderedDict()
self._forward_post_hooks = collections.OrderedDict()
self._parameters_transform_map = {}
self._buffers_transform_map = {}
self._casted_by_pure_fp16 = False
self._state_dict_hooks = collections.OrderedDict()
def train(self):
"""
Sets this Layer and all its sublayers to training mode.
......@@ -1259,6 +1266,87 @@ class Layer(core.Layer):
final_str += ')'
return final_str
def register_state_dict_hook(self, hook):
hook_remove_helper = HookRemoveHelper(self._state_dict_hooks)
self._state_dict_hooks[hook_remove_helper._hook_id] = hook
return hook_remove_helper
def _state_dict_impl(self,
destination=None,
include_sublayers=True,
structured_name_prefix="",
include_non_persistable_buffer=False):
"""
Get all parameters and persistable buffers of the current layer and its sub-layers, and set them into a dict.
Parameters:
destination(dict, optional): If provided, all the parameters and persistable buffers will be set into this dict. Default: None
include_sublayers(bool, optional): If True, also include the parameters and persistable buffers from sublayers. Default: True
include_non_persistable_buffer(bool, optional): If True, also include the non-persistable buffers of the current layer and its sub-layers; this is used in pure fp16 and jit.save. Default: False
"""
if destination is None:
destination = collections.OrderedDict()
for name, data in self._parameters.items():
if data is not None:
destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items():
if not include_non_persistable_buffer:
if buffer is not None and name not in self._non_persistable_buffer_names_set:
destination[structured_name_prefix + name] = buffer
else:
if buffer is not None:
destination[structured_name_prefix + name] = buffer
if include_sublayers:
for layer_name, layer_item in self._sub_layers.items():
if layer_item is not None:
destination_temp = destination.copy()
destination_temp.update(
layer_item._state_dict_impl(
destination_temp, include_sublayers,
structured_name_prefix + layer_name + ".",
include_non_persistable_buffer))
destination = destination_temp
for state_dict_hook in self._state_dict_hooks.values():
hook_result = state_dict_hook(destination)
if hook_result is not None:
destination = hook_result
return destination
def to_static_state_dict(self,
destination=None,
include_sublayers=True,
structured_name_prefix=""):
'''
Get all parameters and buffers of the current layer and its sub-layers, and set them into a dict.
Parameters:
destination(dict, optional): If provided, all the parameters and persistable buffers will be set into this dict. Default: None
include_sublayers(bool, optional): If True, also include the parameters and persistable buffers from sublayers. Default: True
Returns:
dict: a dict containing all the parameters and persistable buffers.
Examples:
.. code-block:: python
import paddle
emb = paddle.nn.Embedding(10, 10)
state_dict = emb.to_static_state_dict()
paddle.save( state_dict, "paddle_dy.pdparams")
'''
return self._state_dict_impl(
destination=destination,
include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=True)
def state_dict(self,
destination=None,
include_sublayers=True,
......@@ -1269,7 +1357,7 @@ class Layer(core.Layer):
Parameters:
destination(dict, optional): If provided, all the parameters and persistable buffers will be set into this dict. Default: None
include_sublayers(bool, optional): If True, also include the parameters and persistable buffers from sublayers. Default: True
Returns:
dict: a dict containing all the parameters and persistable buffers.
......@@ -1284,26 +1372,11 @@ class Layer(core.Layer):
paddle.save( state_dict, "paddle_dy.pdparams")
'''
if destination is None:
destination = collections.OrderedDict()
for name, data in self._parameters.items():
if data is not None:
destination[structured_name_prefix + name] = data
for name, buffer in self._buffers.items():
if buffer is not None and name not in self._non_persistable_buffer_names_set:
destination[structured_name_prefix + name] = buffer
if include_sublayers:
for layer_name, layer_item in self._sub_layers.items():
if layer_item is not None:
destination_temp = destination.copy()
destination_temp.update(
layer_item.state_dict(
destination_temp, include_sublayers,
structured_name_prefix + layer_name + "."))
destination = destination_temp
return destination
return self._state_dict_impl(
destination=destination,
include_sublayers=include_sublayers,
structured_name_prefix=structured_name_prefix,
include_non_persistable_buffer=False)
@framework.deprecate_stat_dict
def set_state_dict(self, state_dict, use_structured_name=True):
......@@ -1404,8 +1477,11 @@ class Layer(core.Layer):
).stop_gradient
self._parameters[key]._set_grad_ivar(grad_applied)
self._parameters_transform_map[id(param)] = [param_applied, key]
for key, buf in self._buffers.items():
self._buffers[key] = func(buf, device, dtype, blocking)
self._buffers_transform_map[id(buf)] = [self._buffers[key], key]
def to(self, device=None, dtype=None, blocking=None):
'''
......@@ -1501,6 +1577,7 @@ class Layer(core.Layer):
return new_t
self._apply(transform, device, dtype, blocking)
self._dtype = dtype
# [aliases] Compatible with old method names
set_dict = set_state_dict
......
......@@ -1433,12 +1433,12 @@ class MomentumOptimizer(Optimizer):
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
master_weight = None
if framework.in_dygraph_mode():
_, _ = _C_ops.momentum(param_and_grad[0], param_and_grad[1],
velocity_acc, lr, param_and_grad[0],
velocity_acc, 'mu', self._momentum,
'use_nesterov', self._use_nesterov)
_, _, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov)
return None
attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
......@@ -1982,26 +1982,29 @@ class LarsMomentumOptimizer(Optimizer):
self._master_weights = {}
def _create_master_weight(self, param):
assert isinstance(self.helper, LayerHelper)
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + '_fp32_master'
var_name = unique_name.generate(var_name)
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
self._master_weights[param.name] = var
var_name = param.name + '_fp32_master'
var_name = unique_name.generate(var_name)
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
......@@ -2462,12 +2465,14 @@ class AdamOptimizer(Optimizer):
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_, _, _, _, _ = _C_ops.adam(
master_weight = None
_, _, _, _, _, _ = _C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
1000, 'beta1', _beta1, 'beta2', _beta2, 'use_global_beta_pow',
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'use_global_beta_pow',
self._use_global_beta_pow)
return None
......
......@@ -1099,7 +1099,6 @@ class TestJitSaveLoadSaveWithoutRunning(unittest.TestCase):
paddle.static.InputSpec(
shape=[None, IMAGE_SIZE], dtype='float32')
])
result_00 = layer_save(inps0)
result_01 = layer_save(inps1)
#load and save without running
......
......@@ -22,6 +22,8 @@ from ..fluid.layer_helper import LayerHelper
import warnings
from ..fluid.dygraph import base as imperative_base
from collections import defaultdict
import numpy as np
import time
import paddle
from paddle import _C_ops
......@@ -208,26 +210,29 @@ class Adam(Optimizer):
}
def _create_master_weight(self, param):
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
self._master_weights[param.name] = var
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
......@@ -317,12 +322,13 @@ class Adam(Optimizer):
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_, _, _, _, _ = _C_ops.adam(
_, _, _, _, _, _ = _C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
1000, 'beta1', _beta1, 'beta2', _beta2)
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'multi_precision', find_master)
return None
......
......@@ -297,13 +297,15 @@ class AdamW(Adam):
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_, _, _, _, _ = _C_ops.adamw(
_, _, _, _, _, _ = _C_ops.adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1,
moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon,
'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread',
1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff,
"lr_ratio", lr_ratio_)
beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
'beta2', _beta2, 'coeff', self._coeff, 'multi_precision',
find_master)
return None
......
......@@ -170,7 +170,7 @@ class Momentum(Optimizer):
'regularization_method': self._regularization_method,
'regularization_coeff': self._regularization_coeff,
}
'''
if framework.in_dygraph_mode():
self.helper = LayerHelper(self.__class__.__name__)
if isinstance(self._parameter_list[0], dict):
......@@ -180,6 +180,7 @@ class Momentum(Optimizer):
else:
for p in parameters:
self._add_accumulator(self._velocity_acc_str, p)
'''
def _update_regularization(self, weight_decay):
reg_method = ""
......@@ -194,26 +195,29 @@ class Momentum(Optimizer):
return reg_method, reg_coeff
def _create_master_weight(self, param):
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
self._master_weights[param.name] = var
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
......@@ -239,10 +243,15 @@ class Momentum(Optimizer):
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
'''
if framework.in_dygraph_mode():
return
'''
assert isinstance(block, framework.Block)
if isinstance(parameters, dict):
parameters = self._update_param_group(parameters)
for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
......@@ -291,21 +300,23 @@ class Momentum(Optimizer):
regularization_method = ""
regularization_coeff = 0
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
if framework.in_dygraph_mode():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
_, _ = _C_ops.momentum(
_, _, _ = _C_ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
param_and_grad[0], velocity_acc, 'mu', self._momentum,
'use_nesterov', self._use_nesterov, 'regularization_method',
regularization_method, 'regularization_coeff',
regularization_coeff)
return None
master_weight, param_and_grad[0], velocity_acc, master_weight,
'mu', self._momentum, 'use_nesterov', self._use_nesterov,
'regularization_method', regularization_method,
'regularization_coeff', regularization_coeff, 'multi_precision',
find_master)
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
return None
attrs = {
"mu": self._momentum,
......