[cherry-pick] fix QuantizeLinear pass and support reduce_max in quantization (#44872)

* fix QuantizeLinear kernel and pass in QAT (#44784) * Add Reduce Max in Quant (#44825) Co-authored-by: N Chang Xu <molixu7@gmail.com>

[cherry-pick] fix QuantizeLinear pass and support reduce_max in quantization (#44872)
* fix QuantizeLinear kernel and pass in QAT (#44784) * Add Reduce Max in Quant (#44825) Co-authored-by: N Chang Xu <molixu7@gmail.com>
24b3bbde · Guanghua Yu · GitHub · 245005d4 · 24b3bbde · 24b3bbde
7 changed file
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -139,7 +139,8 @@ struct FindMovingAverageAbsMaxFunctor {
  void operator()(const DeviceContext &ctx,
                  const framework::Tensor &in_accum,
                  const framework::Tensor &in_state,
-                  const framework::Tensor &cur_scale,
+                  const T *cur_scale,
+                  const float rate,
                  framework::Tensor *out_state,
                  framework::Tensor *out_accum,
                  framework::Tensor *out_scale);

--- a/paddle/fluid/operators/quantize_linear_op.cc
+++ b/paddle/fluid/operators/quantize_linear_op.cc
@@ -93,6 +93,12 @@ class QuantizeLinearOp : public framework::OperatorWithKernel {
        ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
      }
    }
+    if (ctx->HasOutput("OutState")) {
+      ctx->SetOutputDim("OutState", {1});
+    }
+    if (ctx->HasOutput("OutAccum")) {
+      ctx->SetOutputDim("OutAccum", {1});
+    }
    ctx->ShareLoD("X", /*->*/ "Y");
  }

@@ -113,7 +119,25 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Y",
              "(Tensor) Output of quantized low level tensor, "
              "but also saved as float data type.");
-    AddOutput("OutScale", "(Tensor) Current scale").AsDispensable().AsExtra();
+    AddInput("InAccum", "Last accum.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddInput("InState", "Last state.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddOutput("OutState", "(Tensor) state buffer.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddOutput("OutAccum", "(Tensor) accum buffer.")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddOutput("OutScale", "(Tensor) Current scale")
+        .AsDispensable()
+        .AsExtra();  // only qat use
+    AddAttr<float>("moving_rate",
+                   "(float, default 0.9) moving rate.")  // only qat use
+        .SetDefault(0.9)
+        .AsExtra();
    AddAttr<int>("quant_axis",
                 "(int, default 0) The axis for quantization. "
                 "For conv2d, depthwise_conv2d, conv2d_transpose "
@@ -154,8 +178,7 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
                  "nearest ties to even and 1 is rounding to nearest "
                  "ties away from zero.but the received is %d",
                  round_type));
-        })
-        .AsExtra();
+        });
    AddAttr<bool>("is_test",
                  "(bool, default false) Set to true for inference only, false "
                  "for training. Some layers may run faster when this is true.")

--- a/paddle/fluid/operators/quantize_linear_op.h
+++ b/paddle/fluid/operators/quantize_linear_op.h
@@ -56,10 +56,31 @@ class QuantizeLinearKernel : public framework::OpKernel<T> {

    if (quant_axis < 0) {
      if (!is_test) {
-        auto* out_scale = context.Output<framework::Tensor>("OutScale");
-        T* out_s = out_scale->mutable_data<T>(context.GetPlace());
+        // training
+        auto* in_accum = context.Input<framework::Tensor>("InAccum");
+        auto* in_state = context.Input<framework::Tensor>("InState");
+        auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
+        T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
+
        FindAbsMaxFunctor<DeviceContext, T>()(
-            dev_ctx, in->data<T>(), in->numel(), out_s);
+            dev_ctx, in->data<T>(), in->numel(), cur_scale_data);
+
+        auto* out_state = context.Output<framework::Tensor>("OutState");
+        auto* out_accum = context.Output<framework::Tensor>("OutAccum");
+        auto* out_scale = context.Output<framework::Tensor>("OutScale");
+        out_state->mutable_data<T>(context.GetPlace());
+        out_accum->mutable_data<T>(context.GetPlace());
+        out_scale->mutable_data<T>(context.GetPlace());
+        float moving_rate = context.Attr<float>("moving_rate");
+
+        FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(dev_ctx,
+                                                           *in_accum,
+                                                           *in_state,
+                                                           cur_scale_data,
+                                                           moving_rate,
+                                                           out_state,
+                                                           out_accum,
+                                                           out_scale);
        ClipAndFakeQuantFunctor<DeviceContext, T>()(
            dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
      } else {

--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -418,8 +418,7 @@ class PostTrainingQuantization(object):
            self._update_program()

        # save out_threshold for quantized ops.
-        if not self._onnx_format:
-            self._save_output_threshold()
+        self._save_output_threshold()

        if any(op_type in self._quantizable_op_type
               for op_type in self._dynamic_quantize_op_type):
@@ -996,16 +995,23 @@ class PostTrainingQuantization(object):
        '''
        Save output threshold to the quantized op.
        '''
+        self._calibration_scales = {}

        def save_info(op_node, out_var_name, threshold_map, out_info_name,
                      quantized_type):
            assert out_var_name in threshold_map, \
                "The output ({}) of {} node does not have threshold.".format(
                out_var_name, op_node.type)
-            op_node._set_attr(out_info_name, threshold_map[var_name])
-            op_node._set_attr("with_quant_attr", True)
-            if op_node.type in self._quantizable_op_type:
-                op._set_attr("quantization_type", quantized_type)
+            if self._onnx_format:
+                # For easy extension, every var_node set a dict to save parameters of quant.
+                self._calibration_scales[var_name] = {}
+                self._calibration_scales[var_name]['scale'] = threshold_map[
+                    var_name]
+            else:
+                op_node._set_attr(out_info_name, threshold_map[var_name])
+                op_node._set_attr("with_quant_attr", True)
+                if op_node.type in self._quantizable_op_type:
+                    op._set_attr("quantization_type", quantized_type)

        def analysis_and_save_info(op_node, out_var_name):
            argname_index = utils._get_output_name_index(op_node, out_var_name)

--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -1785,6 +1785,7 @@ class InsertQuantizeLinear(object):
            equal to 0, it will quantization with per channel, else quantization with per layer.
            Default is -1.
        channel_wise(bool, optional): Whether quantization with per channel or not. Default is False.
+        moving_rate(float): the rate for 'moving average' method.
        is_test(bool, optional): Whether quantization with training or not. Default is True.
    """

@@ -1794,6 +1795,7 @@ class InsertQuantizeLinear(object):
                 quant_bits=8,
                 quant_axis=-1,
                 channel_wise=False,
+                 moving_rate=0.9,
                 is_test=True):
        self._place = place
        self._scope = scope
@@ -1801,15 +1803,16 @@ class InsertQuantizeLinear(object):
        self.quant_axis = quant_axis
        self.channel_wise = channel_wise
        self._is_test = is_test
+        self._moving_rate = moving_rate

-    def insert_quant_op(self, graph, var_node):
+    def insert_quant_op(self, graph, var_node, var_name=None):
        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
-
-        quant_var_node = graph.create_var_node(name=self._quantized_var_name(
-            var_node.name()),
-                                               var_type=var_node.type(),
-                                               shape=var_node.shape(),
-                                               var_dtype=var_node.dtype())
+        var_name = var_node.name() if not var_name else var_name
+        quant_var_node = graph.create_var_node(
+            name=self._quantized_var_name(var_name),
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
        data_type = 'float64' if var_node.dtype(
        ) == core.VarDesc.VarType.FP64 else 'float32'
        if self.channel_wise:
@@ -1821,7 +1824,7 @@ class InsertQuantizeLinear(object):
            scale_var_type = var_node.type()
            init_scale_value = np.array([_SCALE_DEFAULT_VALUE], dtype=data_type)
        scale_var_node = graph.create_persistable_node(
-            name=self._quantized_scale_name(var_node.name()),
+            name=self._quantized_scale_name(var_name),
            var_type=scale_var_type,
            shape=[scale_var_shape],
            var_dtype=var_node.dtype())
@@ -1844,13 +1847,39 @@ class InsertQuantizeLinear(object):
            inputs["ZeroPoint"] = zero_point_node

        attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
+        attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
        outputs = {"Y": quant_var_node}
        if not self._is_test:
-            attrs["is_test"] = self._is_test
-            attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
            scale_out_node = graph.create_var_node_from_desc(
                scale_var_node.var())
+            state_in_node = graph.create_persistable_node(
+                name=unique_name.generate('state'),
+                var_type=core.VarDesc.VarType.LOD_TENSOR,
+                var_dtype=var_node.dtype(),
+                shape=[1])
+            data_type = 'float64' if var_node.dtype(
+            ) == core.VarDesc.VarType.FP64 else 'float32'
+            _init_var_node(state_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
+            accum_in_node = graph.create_persistable_node(
+                name=unique_name.generate('accum'),
+                var_type=core.VarDesc.VarType.LOD_TENSOR,
+                var_dtype=var_node.dtype(),
+                shape=[1])
+            _init_var_node(accum_in_node, np.ones([1], dtype=data_type),
+                           self._scope, self._place)
+            state_out_node = graph.create_var_node_from_desc(
+                state_in_node.var())
+            accum_out_node = graph.create_var_node_from_desc(
+                accum_in_node.var())
+
            outputs["OutScale"] = scale_out_node
+            inputs['InState'] = state_in_node
+            inputs['InAccum'] = accum_in_node
+            outputs['OutState'] = state_out_node
+            outputs['OutAccum'] = accum_out_node
+            attrs["is_test"] = self._is_test
+            attrs['moving_rate'] = self._moving_rate

        quant_op_node = graph.create_op_node(op_type="quantize_linear",
                                             attrs=attrs,
@@ -1863,6 +1892,10 @@ class InsertQuantizeLinear(object):
            graph.link_to(zero_point_node, quant_op_node)
        graph.link_to(quant_op_node, quant_var_node)
        if not self._is_test:
+            graph.link_to(state_in_node, quant_op_node)
+            graph.link_to(accum_in_node, quant_op_node)
+            graph.link_to(quant_op_node, state_out_node)
+            graph.link_to(quant_op_node, accum_out_node)
            graph.link_to(quant_op_node, scale_out_node)
        return quant_var_node, scale_var_node

@@ -1891,8 +1924,7 @@ class InsertQuantizeLinear(object):
            inputs["ZeroPoint"] = zero_point_node

        attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
-        if not self._is_test:
-            attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
+        attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward

        quant_op_node = graph.create_op_node(op_type="dequantize_linear",
                                             attrs=attrs,
@@ -1931,10 +1963,10 @@ class InsertQuantizeLinear(object):
        return "%s@zero_point" % (var_name)


-class QuantizationTransformPassV2(object):
+class QuantizationTransformPassV2(QuantizationTransformPass):
    """
    Quantize the ops that have weights. Add quant and dequant ops for
-    the quantized ops's inputs.
+    the quantized ops's inputs. It is used in the new format of quantization.
    """

    def __init__(self,
@@ -2130,13 +2162,13 @@ class QuantizationTransformPassV2(object):
                if is_weight and self._weight_quantize_func is not None:
                    target_out_node = self._insert_func(
                        graph, self._weight_quantize_func, var_node, op)
-                    processed_vars.append(name)
+                    self.processed_vars.append(name)
                    continue
                elif not is_weight and self._act_quantize_func is not None:
                    target_out_node = self._insert_func(graph,
                                                        self._act_quantize_func,
                                                        var_node, op)
-                    processed_vars.append(name)
+                    self.processed_vars.append(name)
                    continue

                quant_bits = self._weight_bits if var_node.name() in self.persistable_vars \
@@ -2155,9 +2187,10 @@ class QuantizationTransformPassV2(object):
                    quant_bits=quant_bits,
                    quant_axis=quant_axis,
                    channel_wise=channel_wise,
+                    moving_rate=self._moving_rate,
                    is_test=self._is_test)
                quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
-                    graph, var_node)
+                    graph, var_node, var_name=name)
                dequant_var_node = insert_quant_pass.insert_dequant_op(
                    graph, quant_var_node, scale_var_node)

@@ -2182,24 +2215,6 @@ class QuantizationTransformPassV2(object):
                has_weight = True
        return has_weight

-    def _is_skip_quant(self, graph, op_node):
-        """
-        Analyse whether the op node skips quantization.
-        """
-        is_skip = False
-        if op_node.op().has_attr("skip_quant") and \
-            op_node.op().attr("skip_quant"):
-            is_skip = True
-        # if the inputs of mul and matmul are not all persistable, use
-        # AddQuantDequantPassV2 to quantize them.
-        if op_node.name() in ["mul", "matmul", "matmul_v2"] and \
-            _is_input_all_not_persistable(graph, op_node):
-            is_skip = True
-        if op_node.op().has_attr("quantization_type") and \
-            op_node.op().attr("quantization_type") == "qat_without_weight":
-            is_skip = True
-        return is_skip
-
    def apply(self, graph):
        """
        Quantize the graph for training process. According to weight and
@@ -2250,7 +2265,7 @@ class QuantizationTransformPassV2(object):
 class AddQuantDequantPassV2(object):
    """
    Quantize the ops that do not have weights, and add quant_linear and dequant_linear
-    op for the quantized ops's inputs.
+    op for the quantized ops's inputs. It is used in the new format of quantization.
    """

    # To be compatible with PaddleSlim, not remove _activation_type for now
@@ -2377,6 +2392,7 @@ class AddQuantDequantPassV2(object):
                                quant_bits=self._quant_bits,
                                quant_axis=-1,
                                channel_wise=False,
+                                moving_rate=self._moving_rate,
                                is_test=self._is_test)
                            quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
                                graph, in_node)

--- a/python/paddle/fluid/contrib/slim/quantization/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/utils.py
@@ -109,6 +109,7 @@ _act_supported_quantizable_op_type = [
    "square",
    "softplus",
    "shuffle_channel",
+    "reduce_max",
 ]

 _out_scale_op_list = list(
@@ -213,6 +214,7 @@ _op_real_in_out_name = {
    "square": [["X"], ["Out"]],
    "softplus": [["X"], ["Out"]],
    "shuffle_channel": [["X"], ["Out"]],
+    "reduce_max": [["X"], ["Out"]],
 }



--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -550,18 +550,41 @@ class TestquantizeOpTrain(TestquantizeOp):
    def setUp(self):
        self.set_args()
        self.op_type = "quantize_linear"
-        x = np.random.randn(31, 65).astype(self.data_type)
-        yq, scale = quantize_max_abs(x, self.max_range)
-        scale = np.array(scale).astype(self.data_type)
-        zero_point = np.zeros(scale.shape, dtype="int32")
-
-        self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
        self.attrs = {
            'bit_length': self.bit_length,
            'quant_axis': self.quant_axis,
+            'moving_rate': 0.9,
            'is_test': self.is_test
        }
-        self.outputs = {'Y': yq, 'OutScale': scale}
+
+        x = np.random.randn(31, 65).astype(self.data_type)
+        scale = np.array([0.001]).astype(self.data_type)
+        zero_point = np.zeros(scale.shape, dtype="int32")
+        in_accum = np.ones(1).astype(self.data_type)
+        in_state = np.ones(1).astype(self.data_type)
+        out_accum = np.zeros(1).astype(self.data_type)
+        out_state = np.zeros(1).astype(self.data_type)
+        out_accum[0] = self.attrs['moving_rate'] * in_accum[0] + np.max(
+            np.abs(x))
+        out_state[0] = self.attrs['moving_rate'] * in_state[0] + 1.0
+        out_scale = out_accum / out_state
+
+        round_out = np.round(x / out_scale * self.max_range)
+        quant_data = np.clip(round_out, -self.max_range - 1, self.max_range)
+
+        self.inputs = {
+            'X': x,
+            'Scale': scale,
+            'ZeroPoint': zero_point,
+            'InAccum': in_accum,
+            'InState': in_state,
+        }
+        self.outputs = {
+            'Y': quant_data,
+            'OutScale': out_scale,
+            'OutAccum': out_accum,
+            'OutState': out_state,
+        }

    def test_check_output(self):
        self.check_output()