Unverified commit b47478ef, authored by cc, committed via GitHub

[dygraph qat] Use layer to calculate output scale (#31861)

* Use layer to calculate output scale
* Add backward for moving_average_abs_max_scale and save output scales to the op's attrs
Parent c3974d0e
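In outline, this change replaces forward-hook based output-scale collection with a wrapper layer: each target sublayer is swapped for a wrapper whose forward runs the original layer and then a moving_average_abs_max_scale layer on its output. A minimal, plain-Python sketch of that wrapping idea (illustrative only; the names below are hypothetical, the actual implementation is the QuantizedOutputLayer added to quant_nn.py further down in this diff):

class OutputScaleWrapper:
    # Hypothetical sketch of the wrapper pattern used by this PR; the real
    # code is QuantizedOutputLayer(layer, moving_rate), which wraps a
    # MovingAverageAbsMaxScale layer around the original layer's output.
    def __init__(self, layer, scale_layer):
        self._layer = layer              # original sublayer
        self._scale_layer = scale_layer  # computes a moving-average abs-max scale

    def __call__(self, *args, **kwargs):
        out = self._layer(*args, **kwargs)
        return self._scale_layer(out)    # updates the scale, returns the output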
......@@ -649,13 +649,18 @@ class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel {
"MovingAverageAbsMaxScale");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
"MovingAverageAbsMaxScale");
if (ctx->HasOutput("OutState")) {
ctx->SetOutputDim("OutState", {1});
}
if (ctx->HasOutput("OutAccum")) {
ctx->SetOutputDim("OutAccum", {1});
}
if (ctx->HasOutput("Out")) {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScale", {1});
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
......@@ -673,6 +678,9 @@ class MovingAverageAbsMaxScaleOpMaker
AddInput("X", "(Tensor) Input is float data type.");
AddInput("InAccum", "Last accum.").AsDispensable();
AddInput("InState", "Last state.").AsDispensable();
AddOutput("Out",
"(Tensor) Output tensor is just equivalent to the input tensor.")
.AsDispensable();
AddOutput("OutScale", " Current scale");
AddOutput("OutState", "(Tensor) state buffer.").AsDispensable();
AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable();
......@@ -693,7 +701,7 @@ $$Out = X$$
}
};
class FakeQuantDequantGradOp : public framework::OperatorWithKernel {
class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -701,9 +709,9 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel {
auto out_grad_name = framework::GradVarName("Out");
auto x_grad_name = framework::GradVarName("X");
OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name,
"FakeQuantDequantGradOp");
"StrightThroughEstimatorGradOp");
OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name,
"FakeQuantDequantGradOp");
"StrightThroughEstimatorGradOp");
ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name));
}
......@@ -717,13 +725,13 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel {
};
template <typename T>
class FakeQuantDequantGradMaker : public framework::SingleGradOpMaker<T> {
class StrightThroughEstimatorMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("fake_quantize_dequantize_grad");
grad_op->SetType("stright_throuth_estimator_grad");
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
......@@ -744,11 +752,11 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max,
ops::FakeQuantizeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_quantize_dequantize_abs_max,
ops::FakeQuantOrWithDequantAbsMaxOp,
REGISTER_OPERATOR(
fake_quantize_dequantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp,
ops::FakeQuantOrWithDequantAbsMaxOpMaker,
ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_abs_max,
ops::FakeQuantizeDequantizeAbsMaxKernel<CPU, float>);
......@@ -769,11 +777,12 @@ REGISTER_OPERATOR(
REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max,
ops::FakeQuantizeMovingAverageAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(fake_quantize_dequantize_moving_average_abs_max,
REGISTER_OPERATOR(
fake_quantize_dequantize_moving_average_abs_max,
ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp,
ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fake_quantize_dequantize_moving_average_abs_max,
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CPU, float>);
......@@ -789,20 +798,22 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
REGISTER_OPERATOR(
moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleOp,
ops::MovingAverageAbsMaxScaleOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
ops::MovingAverageAbsMaxScaleKernel<CPU, float>);
REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp);
REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CPU, float>);
REGISTER_OPERATOR(stright_throuth_estimator_grad,
ops::StrightThroughEstimatorGradOp);
REGISTER_OP_CPU_KERNEL(stright_throuth_estimator_grad,
ops::StrightThroughEstimatorGradKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max,
REGISTER_OPERATOR(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CPU, float>);
......@@ -820,4 +831,8 @@ REGISTER_OP_VERSION(moving_average_abs_max_scale)
"Out",
"Delete output in order to make the inference model not "
"save moving_average_abs_max_scale operator. This will "
"make the quantitative model be correctly applied in inference."));
"make the quantitative model be correctly applied in inference."))
.AddCheckpoint(
R"ROC(Incompatible upgrade of output [Out])ROC",
paddle::framework::compatible::OpVersionDesc().NewOutput(
"Out", "In order to support dygraph qat, add output again."));
......@@ -543,8 +543,8 @@ REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale,
REGISTER_OP_CUDA_KERNEL(
fake_quantize_dequantize_moving_average_abs_max,
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(stright_throuth_estimator_grad,
ops::StrightThroughEstimatorGradKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CUDA, float>);
......@@ -314,6 +314,12 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
auto* in = context.Input<framework::Tensor>("X");
auto& dev_ctx = context.template device_context<DeviceContext>();
if (context.HasOutput("Out")) {
auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out);
}
bool is_test = context.Attr<bool>("is_test");
// testing
if (is_test) {
......@@ -344,16 +350,16 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
};
template <typename DeviceContext, typename T>
class FakeQuantDequantGradKernel : public framework::OpKernel<T> {
class StrightThroughEstimatorGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* d_out =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto x_grad_name = framework::GradVarName("X");
auto* d_x = context.Output<framework::LoDTensor>(x_grad_name);
PADDLE_ENFORCE_NOT_NULL(
d_x, platform::errors::PreconditionNotMet(
"FakeQuantDequantGradOp doesn't have the output named %s.",
PADDLE_ENFORCE_NOT_NULL(d_x, platform::errors::PreconditionNotMet(
"StrightThroughEstimatorGradKernel "
"doesn't have the output named %s.",
x_grad_name));
// Initialize dx as same as d_out
......
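For context, the renamed gradient op implements the straight-through estimator: the fake quantize-dequantize forward is treated as identity for backpropagation, so the backward kernel simply copies the upstream gradient ("Initialize dx as same as d_out" above). A minimal NumPy sketch of that behaviour, not the Paddle kernel:

import numpy as np

def straight_through_estimator_grad(d_out):
    # Backward of a fake quantize-dequantize step: pass the upstream
    # gradient through unchanged, i.e. dX = dOut.
    return np.copy(d_out)

d_out = np.array([0.3, -1.2, 0.7], dtype=np.float32)
print(straight_through_estimator_grad(d_out))  # [ 0.3 -1.2  0.7]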
......@@ -84,7 +84,8 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"matrix_nms", {"Out", "Index", "RoisNum"}},
{"distribute_fpn_proposals",
{"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}},
{"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}},
{"moving_average_abs_max_scale",
{"Out", "OutScale", "OutAccum", "OutState"}},
{"multiclass_nms3", {"Out", "NmsRoisNum"}},
{"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
{"momentum", {"ParamOut", "VelocityOut"}},
......@@ -137,7 +138,8 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"check_finite_and_unscale", {"Out", "FoundInfinite"}},
{"update_loss_scaling",
{"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
{"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}},
{"moving_average_abs_max_scale",
{"Out", "OutScale", "OutAccum", "OutState"}},
{"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"rnn", {"DropoutState"}},
......
......@@ -21,14 +21,14 @@ import warnings
import paddle
from paddle.fluid import dygraph, core, framework, unique_name
from paddle.fluid.executor import Executor
from paddle.fluid.executor import Executor, global_scope
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Constant
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
from paddle.fluid.io import load_inference_model, save_inference_model
from paddle.fluid.log_helper import get_logger
from . import quant_nn
from .. import quantization_pass
from . import quant_nn
from . import utils
__all__ = ['ImperativeQuantAware']
......@@ -201,7 +201,7 @@ class ImperativeQuantAware(object):
self._quantize_inputs = ImperativeQuantizeInputs(**kwargs)
self._calc_output_scale = ImperativeCalcOutputScale()
self._quantize_outputs = ImperativeQuantizeOutputs()
def quantize(self, model):
"""
......@@ -219,10 +219,10 @@ class ImperativeQuantAware(object):
assert isinstance(model, dygraph.Layer), \
"The model must be the instance of dygraph.Layer."
self._quantize_inputs.apply(model)
self._calc_output_scale.apply(model)
self._quantize_outputs.apply(model)
def save_quantized_model(self, layer, path, input_spec=None, **config):
self._calc_output_scale.save_quantized_model(layer, path, input_spec,
self._quantize_outputs.save_quantized_model(layer, path, input_spec,
**config)
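For reference, the intended dygraph QAT workflow around these two calls looks roughly as follows. This is a hedged sketch: MyNet is a placeholder model, the import path is assumed from the surrounding package layout, and the input_spec mirrors the LeNet example in the tests further below.

import paddle
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware

class MyNet(paddle.nn.Layer):
    # Placeholder model; Conv2D is one of the input-quantizable layer types.
    def __init__(self):
        super(MyNet, self).__init__()
        self.conv = paddle.nn.Conv2D(1, 6, 3)

    def forward(self, x):
        return self.conv(x)

quanter = ImperativeQuantAware()
model = MyNet()
quanter.quantize(model)  # quantize inputs and insert output-scale layers
# ... train the quantized model as usual ...
quanter.save_quantized_model(
    layer=model,
    path="./quantized_model/mynet",
    input_spec=[
        paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32')
    ])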
......@@ -323,10 +323,10 @@ class ImperativeQuantizeInputs(object):
idx += 1
target = name[last_idx:idx]
quant_layer = self._get_quantized_layer(layer)
quant_layer = self._get_input_quantized_layer(layer)
setattr(obj, target, quant_layer)
def _get_quantized_layer(self, layer):
def _get_input_quantized_layer(self, layer):
quant_layer_name = None
for key, value in utils.quant_input_layers_map.items():
if isinstance(layer, value):
......@@ -343,24 +343,26 @@ class ImperativeQuantizeInputs(object):
return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs)
class ImperativeCalcOutputScale(object):
class ImperativeQuantizeOutputs(object):
"""
Calculate the output scales for some layers.
"""
def __init__(self, moving_rate=0.9):
"""
Add the logic of calculating and setting output scales of some layers.
The constructor for ImperativeQuantizeOutputs.
Args:
moving_rate(float): The decay coefficient of moving average.
The default value is 0.9.
"""
super(ImperativeCalcOutputScale, self).__init__()
super(ImperativeQuantizeOutputs, self).__init__()
self._moving_rate = moving_rate
self._register_hook_handle_list = []
self._out_scale_dict = collections.OrderedDict()
def apply(self, model):
"""
Insert the `moving_average_abs_max_scale` op to calculate output
scale of specific layers in model.
Insert the `moving_average_abs_max_scale` layers to calculate the
output scales for specific layers in the dygraph model.
Args:
model(fluid.dygraph.Layer): The target model which would be
......@@ -372,14 +374,25 @@ class ImperativeCalcOutputScale(object):
assert isinstance(model, dygraph.Layer), \
"The model must be the instance of dygraph.Layer."
# Calculate the target ops's output scale, and don't consider
# the skip_quant attr
for _, layer in model.named_sublayers():
if self._is_target_layer(layer):
self._init_scale_params(layer)
hook_handle = layer.register_forward_post_hook(
self._calc_output_scale_hook)
self._register_hook_handle_list.append(hook_handle)
for name, layer in model.named_sublayers():
if not self._is_target_layer(layer):
continue
# TODO(jc): optimize this module
last_idx = 0
idx = 0
obj = model
while idx < len(name):
if (name[idx] == '.'):
if hasattr(obj, name[last_idx:idx]):
obj = getattr(obj, name[last_idx:idx])
last_idx = idx + 1
idx += 1
target = name[last_idx:idx]
quant_layer = quant_nn.__dict__["QuantizedOutputLayer"](
layer, self._moving_rate)
setattr(obj, target, quant_layer)
def save_quantized_model(self, layer, path, input_spec=None, **config):
"""
......@@ -409,33 +422,18 @@ class ImperativeCalcOutputScale(object):
Returns:
None
"""
assert isinstance(layer, dygraph.Layer), \
"The model must be the instance of dygraph.Layer."
self._gather_output_scale(layer)
with dygraph.guard():
layer.eval()
for handle in self._register_hook_handle_list:
handle.remove()
paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config)
if len(self._out_scale_dict) == 0:
warnings.warn("Warning: No Layer of the model while to be " \
"saved contains the out_threshold attribute, so the " \
"generated inference model would not contain the " \
"out_threshold.")
return
# load static model
is_dynamic_mode = False
if paddle.in_dynamic_mode():
is_dynamic_mode = True
paddle.enable_static()
place = core.CUDAPlace(0) if core.is_compiled_with_cuda() \
else core.CPUPlace()
place = core.CPUPlace()
scope = global_scope()
exe = Executor(place)
dirname = os.path.dirname(path)
......@@ -450,20 +448,10 @@ class ImperativeCalcOutputScale(object):
model_filename=model_filename,
params_filename=params_filename))
# TODO(jc): analyse whether the dygraph model has
# several blocks before applying qat
assert infer_program.num_blocks == 1, \
"Quantization aware training (QAT) requires the program " \
"only has a block for now. When the model has if-else or " \
"while, the program will have several blocks."
# set output scales to the static model
self._save_output_scale(infer_program)
self._save_output_scale(infer_program, scope)
# process skip quant
self._set_skip_quant_attr(infer_program)
# save the final quantized model that has output scales
save_inference_model(
dirname=dirname,
feeded_var_names=feed_target_names,
......@@ -476,141 +464,39 @@ class ImperativeCalcOutputScale(object):
if is_dynamic_mode:
paddle.disable_static()
def _gather_output_scale(self, layer):
"""
Gather all output scales to self._out_scale_dict
def _is_target_layer(self, layer):
"""
with dygraph.guard():
layer.eval()
for _, sub_layer in layer.named_sublayers():
if self._is_target_layer(sub_layer):
layer_name = sub_layer.full_name()
if hasattr(sub_layer, "_quant_out_scale"):
self._out_scale_dict[layer_name] = float(
sub_layer._quant_out_scale)
def _save_output_scale(self, infer_program):
Whether the layer needs to calculate output scales.
"""
Save all output scales to the corresponding ops in static
inference program.
Because the Layer in dygraph may correspond to multiple ops
in static program after being saved. To ensure correctness,
the outscale collected for output of dygraph Layer can only
be set to the last op in the corresponding ops in static program.
"""
assert infer_program.num_blocks == 1, \
"The inference program should only have a block."
global_block = infer_program.global_block()
target_ops = global_block.ops
scale_idx = 0
op_idx = 0
attr_name = "out_threshold"
for scale_name, scale_value in self._out_scale_dict.items():
while True:
if op_idx >= len(target_ops):
break
op = target_ops[op_idx]
if not self._is_scale_op_matched(scale_name, op, global_block):
op_idx += 1
else:
if op.type in utils.weight_op_types \
and op_idx + 1 < len(target_ops) \
and target_ops[op_idx+1].type == "elementwise_add":
target_ops[op_idx + 1]._set_attr(attr_name, scale_value)
op_idx += 2
else:
op._set_attr(attr_name, scale_value)
op_idx += 1
scale_idx += 1
break
if scale_idx != len(self._out_scale_dict):
_logger.warning("Warning: the model have %s output scales, "\
"but it only saves %s output scales." \
% (len(self._out_scale_dict), scale_idx))
def _is_target_layer(self, layer):
return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \
or ('quantized_' in layer.full_name() and \
or ('quantized' in layer.full_name() and \
'quantized_noweight' not in layer.full_name())
def _init_scale_params(self, layer, name=None):
def _save_output_scale(self, program, scope):
"""
Init the scale params for calculating output scales and save them in the
target layer.
After the users define the dygraph model, the hooks for calculating output
scales will not execute immediately. If the users load parameters form
checkpoint and save the quantized inference model immediately, the inference
model would not be saved successfully. Beacuse the dygraph_to_static requires
that the parameters created in __init__, but the uniqueness of hook make it
impossible to create parameters in __init__. To avoid this mistake, we define
the scale parameters in the beginning instead of hook.
Save all output scales to the corresponding ops in static
inference program and delete 'moving_average_abs_max_scale' ops.
"""
for block in program.blocks:
for op in block.ops:
if op.type == "moving_average_abs_max_scale":
in_var_name = op.input('X')[0]
out_var_name = op.output('Out')[0]
out_scale_name = op.output('OutScale')[0]
def _create_param(in_layer, first_name, last_name, dtype):
prefix = '{}.{}'.format(first_name, last_name) \
if first_name else 'outscale.{}'.format(last_name)
attr = ParamAttr(
name=unique_name.generate(prefix),
initializer=Constant(1),
trainable=False)
param = in_layer.create_parameter(shape=[1], attr=attr, dtype=dtype)
return param
dtype = layer._dtype if layer._dtype is not None else "float32"
if dtype not in ["float32", "float64"]:
return
layer._quant_out_scale = _create_param(layer, name, "scale", dtype)
layer._quant_out_scale.stop_gradient = True
layer._quant_out_state = _create_param(layer, name, "state", dtype)
layer._quant_out_state.stop_gradient = True
out_scale = utils.load_variable_data(scope, out_scale_name)
previous_op = utils.find_previous_op(block, in_var_name)
previous_op._set_attr("out_threshold", float(out_scale))
layer._quant_out_accum = _create_param(layer, name, "accum", dtype)
layer._quant_out_accum.stop_gradient = True
next_ops = utils.find_next_ops(block, out_var_name)
for next_op in next_ops:
next_op._rename_input(out_var_name, in_var_name)
def _is_scale_op_matched(self, scale_name, op, block):
def _set_skip_quant_attr(self, program):
"""
Based on the op name and attrs to judge whether the op in
program matches the scale_name. We must know the corresponding
name between dgraph and static model.
Label the skip quantized ops.
"""
fp_type = [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]
if op.type in quantization_pass._op_real_in_out_name.keys():
output_var_names = quantization_pass._get_op_output_var_names(op)
for output_var_name in output_var_names:
output_var_tensor = block.var(output_var_name)
if output_var_tensor.dtype not in fp_type:
return False
# corresponding_map: [name, op_types, function]
# Note that, the items have priority in corresponding_map
corresponding_map = [
['conv2d_tranpose', ['conv2d_transpose', \
'depthwise_conv2d_transpose'], None],
['conv2d', ['conv2d', 'depthwise_conv2d'], None],
['linear', ['matmul'], None],
['re_lu6', ['relu6'], None],
['p_re_lu', ['prelu'], None],
['leaky_re_lu', ['leaky_relu'], None],
['re_lu', ['relu'], None],
]
for item in corresponding_map:
if item[0] in scale_name:
return (op.type in item[1]) and \
(len(item) == 2 or item[2] is None or item[2](op))
return op.type in scale_name
def _set_skip_quant_attr(self, program):
block = program.global_block()
for block in program.blocks:
for op in block.ops:
if self._is_skip_quant_op(block, op):
op._set_attr("skip_quant", True)
......@@ -621,33 +507,11 @@ class ImperativeCalcOutputScale(object):
1. the type of input op should be conv2d, depthwise_conv2d or matmul
2. the previous ops of the input op are not fake_quantize_dequantize ops
"""
def _find_previous_op(block, var_name):
for op in block.ops:
if var_name in op.output_arg_names:
return op
target_op_types = ["conv2d", "depthwise_conv2d", "matmul"]
if in_op.type not in target_op_types:
return False
previous_ops = [_find_previous_op(block, arg_name) \
previous_ops = [utils.find_previous_op(block, arg_name) \
for arg_name in in_op.input_arg_names]
return any(op is not None and op.type not in utils.fake_quantize_dequantize_types \
for op in previous_ops )
def _calc_output_scale_hook(self, layer, input, output):
"""
Create the MovingAverageAbsMaxScale layer for the target layer if needed.
Execute MovingAverageAbsMaxScale layer to calculate the output scale.
"""
assert isinstance(output, (core.VarBase, framework.Variable)), \
"Multiple outputs are not currently supported in ImperativeOutScale."
fp_types = [core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64]
if output.dtype in fp_types:
if not hasattr(layer, "_out_scale"):
self._out_scale = quant_nn.MovingAverageAbsMaxScale(
layer, output.name, self._moving_rate, output.dtype)
# TODO (jc): consider the ops that have several outputs
self._out_scale(output)
return any(op is not None and op.type not in \
utils.fake_quantize_dequantize_types for op in previous_ops)
......@@ -507,59 +507,42 @@ class QuantizedNoweightLayer(layers.Layer):
class MovingAverageAbsMaxScale(layers.Layer):
def __init__(self, layer=None, name=None, moving_rate=0.9, dtype='float32'):
def __init__(self, name=None, moving_rate=0.9, dtype='float32'):
r"""
MovingAverageMaxScale layer is used to calculating the output quantization scale of Layer.
Its computational formula is described as below:
MovingAverageMaxScale layer is used to calculating the output quantization
scale of Layer. Its computational formula is described as below:
:math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)`
:math:`Out = X`
"""
super(MovingAverageAbsMaxScale, self).__init__()
self._moving_rate = moving_rate
self._dtype = dtype
self._layer = layer
if self._layer is None or not hasattr(self._layer, "_quant_out_scale"):
scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale'
scale_name = unique_name.generate(scale_prefix)
scale_attr = ParamAttr(
name=scale_name, initializer=Constant(1), trainable=False)
self._scale = self.create_parameter(
shape=[1], attr=scale_attr, dtype=self._dtype)
shape=[1], attr=scale_attr, dtype=dtype)
self._scale.stop_gradient = True
if self._layer is not None:
setattr(self._layer, "_quant_out_scale", self._scale)
else:
self._scale = self._layer._quant_out_scale
if self._layer is None or not hasattr(self._layer, "_quant_out_state"):
state_prefix = "{}.state".format(name) if name else 'outscale.state'
state_attr = ParamAttr(
name=unique_name.generate(state_prefix),
initializer=Constant(1),
trainable=False)
self._state = self.create_parameter(
shape=[1], attr=state_attr, dtype=self._dtype)
shape=[1], attr=state_attr, dtype=dtype)
self._state.stop_gradient = True
if self._layer is not None:
setattr(self._layer, "_quant_out_state", self._state)
else:
self._state = self._layer._quant_out_state
if self._layer is None or not hasattr(self._layer, "_quant_out_accum"):
accum_prefix = "{}.accum".format(name) if name else 'outscale.accum'
accum_attr = ParamAttr(
name=unique_name.generate(accum_prefix),
initializer=Constant(1),
trainable=False)
self._accum = self.create_parameter(
shape=[1], attr=accum_attr, dtype=self._dtype)
shape=[1], attr=accum_attr, dtype=dtype)
self._accum.stop_gradient = True
if self._layer is not None:
setattr(self._layer, "_quant_out_accum", self._accum)
else:
self._accum = self._layer._quant_out_accum
def forward(self, input):
if in_dygraph_mode():
......@@ -567,18 +550,30 @@ class MovingAverageAbsMaxScale(layers.Layer):
not self.training)
state = self._state if self.training else None
accum = self._accum if self.training else None
quant_out = _varbase_creator(
type=input.type,
name="{}.tmp".format(input.name),
shape=input.shape,
dtype=input.dtype,
persistable=False)
self._scale, _, _ = core.ops.moving_average_abs_max_scale(
input, accum, state, self._scale, state, accum, *attrs)
return self._scale
out, _, _, _ = core.ops.moving_average_abs_max_scale(
input, accum, state, quant_out, self._scale, state, accum,
*attrs)
return out
check_variable_and_dtype(input, 'input', ['float32', 'float64'],
'MovingAverageAbsMaxScale')
attrs = {'moving_rate': self._moving_rate, 'is_test': not self.training}
inputs = {"X": [input]}
outputs = {"OutScale": [self._scale]}
quant_out = self._helper.create_variable(
name="{}.tmp".format(input.name),
dtype=input.dtype,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=False)
outputs = {"Out": [quant_out], "OutScale": [self._scale]}
if self.training:
inputs['InState'] = [self._state]
......@@ -592,4 +587,22 @@ class MovingAverageAbsMaxScale(layers.Layer):
outputs=outputs,
attrs=attrs)
return self._scale
return quant_out
class QuantizedOutputLayer(layers.Layer):
def __init__(self, layer=None, moving_rate=0.9, dtype='float32'):
r"""
Add MovingAverageMaxScale layer to the behind of the input layer.
"""
super(QuantizedOutputLayer, self).__init__()
self._layer = layer
self._moving_average_abs_max_scale = \
MovingAverageAbsMaxScale(layer.full_name(), moving_rate, dtype)
def forward(self, input):
if isinstance(input, list):
assert len(input) == 1, \
"The QuantizedOutputLayer should only have one input."
out = self._layer(input)
return self._moving_average_abs_max_scale(out)
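To make the scale update concrete, here is a small NumPy sketch of the formula scale = (moving_rate*accum + max(abs(x))) / (moving_rate*state + 1), with accum and state initialized to 1. It is illustrative only and matches the reference computation in the unit test near the end of this diff.

import numpy as np

def moving_average_abs_max_scale(x, accum=1.0, state=1.0, moving_rate=0.9):
    # Fold the current abs-max of x into the running statistics and
    # return the updated scale together with the new accum/state.
    accum = moving_rate * accum + np.max(np.abs(x))
    state = moving_rate * state + 1
    return accum / state, accum, state

x = np.random.random((8, 16, 7, 7)).astype("float32")
scale, accum, state = moving_average_abs_max_scale(x)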
......@@ -13,22 +13,7 @@
# limitations under the License.
import paddle
op_real_in_out_name = {
"conv2d": [["Input", "Filter"], ["Output"]],
"depthwise_conv2d": [["Input", "Filter"], ["Output"]],
"pool2d": [["X"], ["Out"]],
"elementwise_add": [["X", "Y"], ["Out"]],
"softmax": [["X"], ["Out"]],
"relu": [["X"], ["Out"]],
"relu6": [["X"], ["Out"]],
"leaky_relu": [["X"], ["Out"]],
"prelu": [["X"], ["Out"]],
"tanh": [["X"], ["Out"]],
"batch_norm": [["X"], ["Y"]],
"sigmoid": [["X"], ["Out"]],
"swish": [["X"], ["Out"]],
}
import numpy as np
quant_input_layers_map = {
'Conv2D': paddle.nn.Conv2D,
......@@ -85,3 +70,33 @@ weight_op_types = [
"conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose",
"depthwise_conv2d_transpose"
]
def load_variable_data(scope, var_name):
'''
Load variable value from scope
'''
var_node = scope.find_var(var_name)
assert var_node is not None, \
"Can not find " + var_name + " in the scope."
return np.array(var_node.get_tensor())
def find_previous_op(block, var_name):
"""
Find the previous op for the input variable.
"""
for op in block.ops:
if var_name in op.output_arg_names:
return op
def find_next_ops(block, var_name):
"""
Find all followed ops for the input variable.
"""
res_ops = []
for op in block.ops:
if var_name in op.input_arg_names:
res_ops.append(op)
return res_ops
......@@ -478,30 +478,5 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
self.assertTrue(op_count == 14)
class TestSaveQuantizedModel_Warning(unittest.TestCase):
def test_warning(self):
path = "./dynamic_outscale_infer_model_with_warnings/lenet"
imperative_out_scale = ImperativeQuantAware()
with fluid.dygraph.guard():
lenet = ImperativeLenet()
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
imperative_out_scale.save_quantized_model(
layer=lenet,
path=path,
input_spec=[
paddle.static.InputSpec(
shape=[None, 1, 28, 28], dtype='float32')
])
warning_message = "Warning: No Layer of the model while to be " \
"saved contains the out_threshold attribute, so the " \
"generated inference model would not contain the " \
"out_threshold."
num = get_vaild_warning_num(warning_message, w)
assert num == 1
if __name__ == '__main__':
unittest.main()
......@@ -166,12 +166,14 @@ class TestMovingAverageAbsMaxScaleOp(OpTest):
accum[0] = 1
state = np.zeros(1).astype("float32")
state[0] = 1
x = np.random.random((8, 16, 7, 7)).astype("float32")
self.inputs = {
'X': np.random.random((8, 16, 7, 7)).astype("float32"),
'X': x,
'InAccum': accum,
'InState': state,
}
out = x
out_accum = np.zeros(1).astype("float32")
out_state = np.zeros(1).astype("float32")
out_scale = np.zeros(1).astype("float32")
......@@ -180,6 +182,7 @@ class TestMovingAverageAbsMaxScaleOp(OpTest):
out_state[0] = self.attrs['moving_rate'] * state[0] + 1
out_scale = out_accum / out_state
self.outputs = {
'Out': out,
'OutAccum': out_accum,
'OutState': out_state,
'OutScale': out_scale,
......