Unverified commit 491b87b4, authored by Guanghua Yu and committed by GitHub

fix quantization clip and round Attribute (#43764)

Parent 2739bd73
......@@ -33,8 +33,10 @@ struct Compare {
template <typename T>
struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, const T* in,
const int num, T* out) {
void operator()(const platform::CPUDeviceContext &ctx,
const T *in,
const int num,
T *out) {
*out = std::abs(*(std::max_element(in + 0, in + num, Compare<T>())));
}
};
......@@ -43,24 +45,26 @@ template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in_tensor, const int quant_axis,
T* out_abs_max) {
void operator()(const platform::CPUDeviceContext &ctx,
const framework::Tensor &in_tensor,
const int quant_axis,
T *out_abs_max) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
quant_axis == 0 || quant_axis == 1,
true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
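// Shape example (illustration, not part of this commit): a conv2d weight of
// shape [cout, cin, kh, kw] is quantized with quant_axis == 0 (one abs-max per
// output channel), while a mul weight of shape [cin, cout] uses quant_axis == 1.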
auto* in_data = in_tensor.data<T>();
auto *in_data = in_tensor.data<T>();
auto in_dims = in_tensor.dims();
const int64_t channel = in_dims[quant_axis];
if (quant_axis == 0) {
const int64_t channel_size = in_tensor.numel() / channel;
for (int64_t i = 0; i < channel; i++) {
auto* start = in_data + i * channel_size;
auto* end = in_data + (i + 1) * channel_size;
auto *start = in_data + i * channel_size;
auto *end = in_data + (i + 1) * channel_size;
out_abs_max[i] =
std::abs(*(std::max_element(start, end, Compare<T>())));
}
......@@ -72,8 +76,8 @@ struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
const int64_t step_j = in_tensor.numel() / (in_dims[0] * in_dims[1]);
for (int64_t i = 0; i < in_dims[0]; i++) {
for (int64_t j = 0; j < in_dims[1]; j++) {
auto* start = in_data + i * step_i + j * step_j;
auto* end = in_data + i * step_i + (j + 1) * step_j;
auto *start = in_data + i * step_i + j * step_j;
auto *end = in_data + i * step_i + (j + 1) * step_j;
T abs_max = std::abs(*(std::max_element(start, end, Compare<T>())));
out_abs_max[j] = std::max(out_abs_max[j], abs_max);
}
......@@ -86,16 +90,30 @@ template struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int round_type,
framework::Tensor* out) {
void operator()(const platform::CPUDeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
framework::Tensor *out) {
T s = scale.data<T>()[0];
T inv_s = inverse(s);
platform::Transform<platform::CPUDeviceContext> trans;
trans(ctx, in.data<T>(), in.data<T>() + in.numel(),
out->mutable_data<T>(ctx.GetPlace()),
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), round_type, inv_s));
if (round_type == 0) {
trans(ctx,
in.data<T>(),
in.data<T>() + in.numel(),
out->mutable_data<T>(ctx.GetPlace()),
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), inv_s));
} else {
trans(ctx,
in.data<T>(),
in.data<T>() + in.numel(),
out->mutable_data<T>(ctx.GetPlace()),
phi::ClipFunctor<T>(-s, s));
auto out_e = framework::EigenVector<T>::Flatten(*out);
out_e.device(*ctx.eigen_device()) = (bin_cnt * inv_s * out_e).round();
}
}
};
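For reference, the two branches above compute the same affine fake quantization but interleave clip and round differently. A minimal host-side sketch of the scalar math (an illustration, not part of this commit; it assumes bin_cnt = 2^(bit_length - 1) - 1, e.g. 127 for 8 bits, and a positive scale s):

#include <algorithm>
#include <cmath>

// round_type == 0: scale, round with ties to even, then clip the integer code
// to [-bin_cnt - 1, bin_cnt] (for 8 bits: [-128, 127]).
float fake_quant_ties_to_even(float x, float s, int bin_cnt) {
  // std::nearbyint rounds to nearest with ties to even under the default FP
  // environment, matching roundWithTiesToEven used by QuantTensorFunctor.
  float v = std::nearbyint(bin_cnt * x / s);
  return std::min(std::max(v, -bin_cnt - 1.0f), static_cast<float>(bin_cnt));
}

// round_type == 1: clip to [-s, s] first, then scale and round ties away from
// zero (std::round), matching the ClipFunctor + Eigen round() path above.
float fake_quant_ties_away(float x, float s, int bin_cnt) {
  float clipped = std::min(std::max(x, -s), s);
  return std::round(bin_cnt * clipped / s);
}

The two paths differ only when the scaled value lands exactly on a .5 tie, e.g. 0.5 becomes 0 under ties-to-even and 1 under ties-away-from-zero.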
......@@ -103,19 +121,34 @@ template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int round_type,
framework::Tensor* out) {
void operator()(const platform::CPUDeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
framework::Tensor *out) {
T s = scale.data<T>()[0];
T inv_s = inverse(s);
platform::Transform<platform::CPUDeviceContext> trans;
trans(ctx, in.data<T>(), in.data<T>() + in.numel(),
out->mutable_data<T>(ctx.GetPlace()),
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), round_type, inv_s));
auto out_e = framework::EigenVector<T>::Flatten(*out);
out_e.device(*ctx.eigen_device()) = out_e * s / static_cast<T>(bin_cnt);
if (round_type == 0) {
trans(ctx,
in.data<T>(),
in.data<T>() + in.numel(),
out->mutable_data<T>(ctx.GetPlace()),
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), inv_s));
auto out_e = framework::EigenVector<T>::Flatten(*out);
out_e.device(*ctx.eigen_device()) = out_e * s / static_cast<T>(bin_cnt);
} else {
trans(ctx,
in.data<T>(),
in.data<T>() + in.numel(),
out->mutable_data<T>(ctx.GetPlace()),
phi::ClipFunctor<T>(-s, s));
auto out_e = framework::EigenVector<T>::Flatten(*out);
out_e.device(*ctx.eigen_device()) =
(bin_cnt * inv_s * out_e).round() * s / static_cast<T>(bin_cnt);
}
}
};
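The Dequant variant maps the rounded code back to the float domain with the same scale. A scalar sketch of the round_type == 1 round trip (an illustration matching the Eigen expressions above, under the same assumptions as the previous sketch):

#include <algorithm>
#include <cmath>

float fake_quant_dequant_ties_away(float x, float s, int bin_cnt) {
  float clipped = std::min(std::max(x, -s), s);    // phi::ClipFunctor(-s, s)
  float code = std::round(bin_cnt * clipped / s);  // (bin_cnt * inv_s * out).round()
  return code * s / bin_cnt;                       // rescale back to the float domain
}

The round_type == 0 path is simply the ties-to-even code from the previous sketch rescaled by s / bin_cnt.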
template struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext,
......@@ -123,20 +156,24 @@ template struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext,
template <typename T>
struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int round_type, const int quant_axis,
framework::Tensor* out) {
void operator()(const platform::CPUDeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
const int quant_axis,
framework::Tensor *out) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
quant_axis == 0 || quant_axis == 1,
true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
auto* scale_data = scale.data<T>();
auto* in_data = in.data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
auto *scale_data = scale.data<T>();
auto *in_data = in.data<T>();
auto *out_data = out->mutable_data<T>(ctx.GetPlace());
auto in_dims = in.dims();
const int64_t channel = in_dims[quant_axis];
platform::Transform<platform::CPUDeviceContext> trans;
......@@ -144,12 +181,31 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
const int64_t channel_size = in.numel() / channel;
for (int64_t i = 0; i < channel; i++) {
T s = scale_data[i];
auto* start = in_data + i * channel_size;
auto* end = in_data + (i + 1) * channel_size;
auto *start = in_data + i * channel_size;
auto *end = in_data + (i + 1) * channel_size;
T inv_s = inverse(s);
trans(
ctx, start, end, out_data + i * channel_size,
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), round_type, inv_s));
if (round_type == 0) {
trans(ctx,
start,
end,
out_data + i * channel_size,
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), inv_s));
} else {
trans(ctx,
start,
end,
out_data + i * channel_size,
phi::ClipFunctor<T>(-s, s));
}
}
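// For round_type == 1, the Transform above has only clipped each value to
// [-s, s]; the scale-and-round step is applied per channel in the second pass
// below, via an Eigen expression on a slice of the output tensor.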
if (round_type == 1) {
for (int64_t i = 0; i < channel; i++) {
T s = scale_data[i];
T inv_s = inverse(s);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
out_e.device(*ctx.eigen_device()) = (bin_cnt * inv_s * out_e).round();
}
}
} else if (quant_axis == 1) {
const int64_t step_i = in.numel() / in_dims[0];
......@@ -158,12 +214,21 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
for (int j = 0; j < in_dims[1]; j++) {
T s = scale_data[j];
T inv_s = inverse(s);
auto* start = in_data + i * step_i + j * step_j;
auto* end = in_data + i * step_i + (j + 1) * step_j;
auto* cur_out_data = out_data + i * step_i + j * step_j;
trans(ctx, start, end, cur_out_data,
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), round_type,
inv_s));
auto *start = in_data + i * step_i + j * step_j;
auto *end = in_data + i * step_i + (j + 1) * step_j;
auto *cur_out_data = out_data + i * step_i + j * step_j;
if (round_type == 0) {
trans(ctx,
start,
end,
cur_out_data,
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), inv_s));
} else {
trans(ctx, start, end, cur_out_data, phi::ClipFunctor<T>(-s, s));
for (int k = 0; k < step_j; k++) {
cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]);
}
}
}
}
}
......@@ -174,19 +239,23 @@ template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
float>;
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int round_type, const int quant_axis,
framework::Tensor* out) {
void operator()(const platform::CPUDeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
const int quant_axis,
framework::Tensor *out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
quant_axis == 0 || quant_axis == 1,
true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
auto* scale_data = scale.data<T>();
auto* in_data = in.data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
auto *scale_data = scale.data<T>();
auto *in_data = in.data<T>();
auto *out_data = out->mutable_data<T>(ctx.GetPlace());
auto in_dims = in.dims();
const int64_t channel = in_dims[quant_axis];
platform::Transform<platform::CPUDeviceContext> trans;
......@@ -194,15 +263,35 @@ struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
const int64_t channel_size = in.numel() / channel;
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
auto* start = in_data + i * channel_size;
auto* end = in_data + (i + 1) * channel_size;
T inv_s = inverse(s);
trans(
ctx, start, end, out_data + i * channel_size,
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), round_type, inv_s));
auto *start = in_data + i * channel_size;
auto *end = in_data + (i + 1) * channel_size;
if (round_type == 0) {
T inv_s = inverse(s);
trans(ctx,
start,
end,
out_data + i * channel_size,
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), inv_s));
} else {
trans(ctx,
start,
end,
out_data + i * channel_size,
phi::ClipFunctor<T>(-s, s));
}
}
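// Second pass over the channels: rescale the codes produced above back to the
// float domain (and, for round_type == 1, apply the round that the clip-only
// pass deferred).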
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
out_e.device(*ctx.eigen_device()) = out_e * s / static_cast<T>(bin_cnt);
if (round_type == 0) {
out_e.device(*ctx.eigen_device()) =
out_e * s / static_cast<T>(bin_cnt);
} else {
T inv_s = inverse(s);
out_e.device(*ctx.eigen_device()) =
(bin_cnt * inv_s * out_e).round() * s / static_cast<T>(bin_cnt);
}
}
} else if (quant_axis == 1) {
const int64_t step_i = in.numel() / in_dims[0];
......@@ -211,14 +300,25 @@ struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
for (int j = 0; j < in_dims[1]; j++) {
T s = scale_data[j];
T inv_s = inverse(s);
auto* start = in_data + i * step_i + j * step_j;
auto* end = in_data + i * step_i + (j + 1) * step_j;
auto* cur_out_data = out_data + i * step_i + j * step_j;
trans(ctx, start, end, cur_out_data,
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), round_type,
inv_s));
auto *start = in_data + i * step_i + j * step_j;
auto *end = in_data + i * step_i + (j + 1) * step_j;
auto *cur_out_data = out_data + i * step_i + j * step_j;
if (round_type == 0) {
trans(ctx,
start,
end,
cur_out_data,
QuantTensorFunctor<T>(static_cast<T>(bin_cnt), inv_s));
} else {
trans(ctx, start, end, cur_out_data, phi::ClipFunctor<T>(-s, s));
}
for (int k = 0; k < step_j; k++) {
cur_out_data[k] = cur_out_data[k] * s / static_cast<T>(bin_cnt);
if (round_type == 0) {
cur_out_data[k] = cur_out_data[k] * s / static_cast<T>(bin_cnt);
} else {
cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]) *
s / static_cast<T>(bin_cnt);
}
}
}
}
......@@ -230,12 +330,14 @@ template struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext,
float>;
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& cur_scale,
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) {
T* scale_arr = scales_arr->mutable_data<T>(ctx.GetPlace());
void operator()(const platform::CPUDeviceContext &ctx,
const framework::Tensor &cur_scale,
const framework::Tensor &last_scale,
const framework::Tensor &iter,
const int window_size,
framework::Tensor *scales_arr,
framework::Tensor *out_scale) {
T *scale_arr = scales_arr->mutable_data<T>(ctx.GetPlace());
int64_t it = iter.data<int64_t>()[0];
int idx = it % window_size;
T removed = scale_arr[idx];
......@@ -247,8 +349,8 @@ struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
max = cur;
} else if (fabs(removed - max) < 1e-6) {
int size = (it > window_size) ? window_size : it;
FindAbsMaxFunctor<platform::CPUDeviceContext, T>()(ctx, scale_arr, size,
&max);
FindAbsMaxFunctor<platform::CPUDeviceContext, T>()(
ctx, scale_arr, size, &max);
}
out_scale->mutable_data<T>(ctx.GetPlace())[0] = max;
}
......@@ -258,11 +360,14 @@ template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in_accum,
const framework::Tensor& in_state, const T* cur_scale,
const float rate, framework::Tensor* out_state,
framework::Tensor* out_accum, framework::Tensor* out_scale) {
void operator()(const platform::CPUDeviceContext &ctx,
const framework::Tensor &in_accum,
const framework::Tensor &in_state,
const T *cur_scale,
const float rate,
framework::Tensor *out_state,
framework::Tensor *out_accum,
framework::Tensor *out_scale) {
T accum = in_accum.data<T>()[0];
T state = in_state.data<T>()[0];
T scale = cur_scale[0];
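// The moving-average update that follows uses the same formula as the CUDA
// FindMovingAverageAbsMaxKernel later in this diff:
//   state = rate * state + 1;  accum = rate * accum + scale;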
......@@ -282,18 +387,22 @@ template struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext,
class FakeQuantOrWithDequantAbsMaxOp : public framework::OperatorWithKernel {
public:
FakeQuantOrWithDequantAbsMaxOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
FakeQuantOrWithDequantAbsMaxOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
"FakeQuantOrWithDequantAbsMaxOp");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(
ctx->HasInput("X"), "Input", "X", "FakeQuantOrWithDequantAbsMaxOp");
OP_INOUT_CHECK(ctx->HasOutput("Out"),
"Output",
"Out",
"FakeQuantOrWithDequantAbsMaxOp");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
OP_INOUT_CHECK(ctx->HasOutput("OutScale"),
"Output",
"OutScale",
"FakeQuantOrWithDequantAbsMaxOp");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScale", {1});
......@@ -302,7 +411,7 @@ class FakeQuantOrWithDequantAbsMaxOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context());
......@@ -320,8 +429,9 @@ class FakeQuantOrWithDequantAbsMaxOpMaker
AddOutput("OutScale", "(Tensor) Current scale");
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
.AddCustomChecker([](const int &bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16,
true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
......@@ -329,18 +439,22 @@ class FakeQuantOrWithDequantAbsMaxOpMaker
});
AddAttr<int>(
"round_type",
"(int, default 0) The round type of fp32 to int."
"(int, default 1) The round type of fp32 to int."
"0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2"
"1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
"round(2.5)=3")
.SetDefault(0)
.AddCustomChecker([](const int& round_type) {
PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true,
platform::errors::InvalidArgument(
"'round_type' should be between 0 and 1, but "
"the received is %d",
round_type));
});
.SetDefault(1)
.AddCustomChecker([](const int &round_type) {
PADDLE_ENFORCE_EQ(
round_type == 0 || round_type == 1,
true,
platform::errors::InvalidArgument(
"'round_type' should be 0 or 1, 0 rounding to "
"nearest ties to even and 1 is rounding to nearest "
"ties away from zero.but the received is %d",
round_type));
})
.AsExtra();
AddComment(R"DOC(
This is a Base Op which supports FakeQuantAbsMaxOpMaker and FakeQuantDequantAbsMaxOpMaker.
FakeQuantAbsMaxOp operator is used in the dynamic quantization.
......@@ -363,12 +477,16 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
"FakeChannelWiseQuantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(
ctx->HasInput("X"), "Input", "X", "FakeChannelWiseQuantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("Out"),
"Output",
"Out",
"FakeChannelWiseQuantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
OP_INOUT_CHECK(ctx->HasOutput("OutScale"),
"Output",
"OutScale",
"FakeChannelWiseQuantizeAbsMax");
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
......@@ -378,7 +496,7 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
......@@ -398,8 +516,9 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
.AddCustomChecker([](const int &quant_axis) {
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1,
true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
......@@ -407,8 +526,9 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
});
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
.AddCustomChecker([](const int &bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16,
true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
......@@ -416,18 +536,22 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
});
AddAttr<int>(
"round_type",
"(int, default 0) The round type of fp32 to int."
"(int, default 1) The round type of fp32 to int."
"0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2"
"1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
"round(2.5)=3")
.SetDefault(0)
.AddCustomChecker([](const int& round_type) {
PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true,
platform::errors::InvalidArgument(
"'round_type' should be between 0 and 1, but "
"the received is %d",
round_type));
});
.SetDefault(1)
.AddCustomChecker([](const int &round_type) {
PADDLE_ENFORCE_EQ(
round_type == 0 || round_type == 1,
true,
platform::errors::InvalidArgument(
"'round_type' should be 0 or 1, 0 rounding to "
"nearest ties to even and 1 is rounding to nearest "
"ties away from zero.but the received is %d",
round_type));
})
.AsExtra();
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
......@@ -450,12 +574,18 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOp
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"),
"Input",
"X",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
OP_INOUT_CHECK(ctx->HasOutput("Out"),
"Output",
"Out",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
OP_INOUT_CHECK(ctx->HasOutput("OutScale"),
"Output",
"OutScale",
"FakeChannelWiseQuantizeDequantizeAbsMax");
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
......@@ -465,7 +595,7 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOp
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
......@@ -485,8 +615,9 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
.AddCustomChecker([](const int &quant_axis) {
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1,
true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
......@@ -494,8 +625,9 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker
});
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
.AddCustomChecker([](const int &bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16,
true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
......@@ -503,18 +635,22 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker
});
AddAttr<int>(
"round_type",
"(int, default 0) The round type of fp32 to int."
"(int, default 1) The round type of fp32 to int."
"0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2"
"1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
"round(2.5)=3")
.SetDefault(0)
.AddCustomChecker([](const int& round_type) {
PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true,
platform::errors::InvalidArgument(
"'round_type' should be between 0 and 1, but "
"the received is %d",
round_type));
});
.SetDefault(1)
.AddCustomChecker([](const int &round_type) {
PADDLE_ENFORCE_EQ(
round_type == 0 || round_type == 1,
true,
platform::errors::InvalidArgument(
"'round_type' should be 0 or 1, 0 rounding to "
"nearest ties to even and 1 is rounding to nearest "
"ties away from zero.but the received is %d",
round_type));
})
.AsExtra();
AddComment(R"DOC(
The scale of FakeChannelWiseQuantize operator is a vector.
In detail, each channel of the input X has a scale value.
......@@ -530,17 +666,19 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$
class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
public:
FakeQuantizeRangeAbsMaxOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
FakeQuantizeRangeAbsMaxOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext* ctx) const override {
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FakeQuantizeRangeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
"FakeQuantizeRangeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
OP_INOUT_CHECK(
ctx->HasOutput("Out"), "Output", "Out", "FakeQuantizeRangeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"),
"Output",
"OutScale",
"FakeQuantizeRangeAbsMax");
if (ctx->HasOutput("OutScales")) {
int window_size = ctx->Attrs().Get<int>("window_size");
......@@ -553,7 +691,7 @@ class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context());
......@@ -574,8 +712,9 @@ class FakeQuantizeRangeAbsMaxOpMaker
.SetDefault(10000);
AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
.AddCustomChecker([](const int &bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16,
true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
......@@ -583,18 +722,22 @@ class FakeQuantizeRangeAbsMaxOpMaker
});
AddAttr<int>(
"round_type",
"(int, default 0) The round type of fp32 to int."
"(int, default 1) The round type of fp32 to int."
"0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2"
"1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
"round(2.5)=3")
.SetDefault(0)
.AddCustomChecker([](const int& round_type) {
PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true,
platform::errors::InvalidArgument(
"'round_type' should be between 0 and 1, but "
"the received is %d",
round_type));
});
.SetDefault(1)
.AddCustomChecker([](const int &round_type) {
PADDLE_ENFORCE_EQ(
round_type == 0 || round_type == 1,
true,
platform::errors::InvalidArgument(
"'round_type' should be 0 or 1, 0 rounding to "
"nearest ties to even and 1 is rounding to nearest "
"ties away from zero.but the received is %d",
round_type));
})
.AsExtra();
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
......@@ -614,17 +757,24 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOp
: public framework::OperatorWithKernel {
public:
FakeQuantOrWithDequantMovingAverageAbsMaxOp(
const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"),
"Input",
"X",
"FakeQuantOrWithDequantMovingAverageAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
OP_INOUT_CHECK(ctx->HasOutput("Out"),
"Output",
"Out",
"FakeQuantOrWithDequantMovingAverageAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
OP_INOUT_CHECK(ctx->HasOutput("OutScale"),
"Output",
"OutScale",
"FakeQuantOrWithDequantMovingAverageAbsMax");
if (ctx->HasOutput("OutState")) {
ctx->SetOutputDim("OutState", {1});
......@@ -639,7 +789,7 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOp
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context());
......@@ -662,8 +812,9 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker
.SetDefault(0.9);
AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
.AddCustomChecker([](const int &bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16,
true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
......@@ -671,18 +822,22 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker
});
AddAttr<int>(
"round_type",
"(int, default 0) The round type of fp32 to int."
"(int, default 1) The round type of fp32 to int."
"0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2"
"1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
"round(2.5)=3")
.SetDefault(0)
.AddCustomChecker([](const int& round_type) {
PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true,
platform::errors::InvalidArgument(
"'round_type' should be between 0 and 1, but "
"the received is %d",
round_type));
});
.SetDefault(1)
.AddCustomChecker([](const int &round_type) {
PADDLE_ENFORCE_EQ(
round_type == 0 || round_type == 1,
true,
platform::errors::InvalidArgument(
"'round_type' should be 0 or 1, 0 rounding to "
"nearest ties to even and 1 is rounding to nearest "
"ties away from zero.but the received is %d",
round_type));
})
.AsExtra();
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
......@@ -709,10 +864,12 @@ class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
"MovingAverageAbsMaxScale");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(
ctx->HasInput("X"), "Input", "X", "MovingAverageAbsMaxScale");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"),
"Output",
"OutScale",
"MovingAverageAbsMaxScale");
if (ctx->HasOutput("OutState")) {
......@@ -730,7 +887,7 @@ class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
......@@ -770,19 +927,23 @@ class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
void InferShape(framework::InferShapeContext *ctx) const override {
auto out_grad_name = framework::GradVarName("Out");
auto x_grad_name = framework::GradVarName("X");
OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name,
OP_INOUT_CHECK(ctx->HasInput(out_grad_name),
"Input",
out_grad_name,
"StrightThroughEstimatorGradOp");
OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name,
OP_INOUT_CHECK(ctx->HasOutput(x_grad_name),
"Output",
x_grad_name,
"StrightThroughEstimatorGradOp");
ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
return framework::OpKernelType(input_data_type, ctx.GetPlace());
......@@ -810,7 +971,8 @@ namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
fake_quantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp,
fake_quantize_abs_max,
ops::FakeQuantOrWithDequantAbsMaxOp,
ops::FakeQuantOrWithDequantAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
......@@ -818,7 +980,8 @@ REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max,
ops::FakeQuantizeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(
fake_quantize_dequantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp,
fake_quantize_dequantize_abs_max,
ops::FakeQuantOrWithDequantAbsMaxOp,
ops::FakeQuantOrWithDequantAbsMaxOpMaker,
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
......@@ -826,7 +989,8 @@ REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_abs_max,
ops::FakeQuantizeDequantizeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(
fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
fake_quantize_range_abs_max,
ops::FakeQuantizeRangeAbsMaxOp,
ops::FakeQuantizeRangeAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
......@@ -853,7 +1017,8 @@ REGISTER_OP_CPU_KERNEL(
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(
fake_channel_wise_quantize_abs_max, ops::FakeChannelWiseQuantizeAbsMaxOp,
fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeAbsMaxOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
......@@ -861,7 +1026,8 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max,
ops::FakeChannelWiseQuantizeAbsMaxKernel<CPU, float>);
REGISTER_OPERATOR(
moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleOp,
moving_average_abs_max_scale,
ops::MovingAverageAbsMaxScaleOp,
ops::MovingAverageAbsMaxScaleOpMaker,
ops::StrightThroughEstimatorMaker<paddle::framework::OpDesc>,
ops::StrightThroughEstimatorMaker<paddle::imperative::OpBase>);
......
......@@ -36,12 +36,12 @@ struct QuantizeDataType<paddle::platform::float16> {
};
template <typename T>
__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
__global__ void FindAbsMaxKernel(const T *in, const int n, T *out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
extern __shared__ char* shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
extern __shared__ char *shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T *>(shared_max_data_tmp);
if (gridDim.x > 1) {
T local_max_data = T(0);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
......@@ -73,14 +73,16 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
template <typename T>
struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx, const T* in,
const int num, T* out) {
void operator()(const platform::CUDADeviceContext &ctx,
const T *in,
const int num,
T *out) {
int block = 1024;
int grid = (block - 1 + num) / block;
grid = (grid > block) ? block : grid;
framework::Tensor max;
T* max_data = max.mutable_data<T>(phi::make_ddim({grid}), ctx.GetPlace());
T *max_data = max.mutable_data<T>(phi::make_ddim({grid}), ctx.GetPlace());
FindAbsMaxKernel<T>
<<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(in, num, max_data);
FindAbsMaxKernel<T>
......@@ -93,13 +95,15 @@ template struct FindAbsMaxFunctor<platform::CUDADeviceContext,
paddle::platform::float16>;
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
const int c, T* out) {
__global__ void FindChannelAbsMaxKernelQuantAxis0(const T *in,
const int n,
const int c,
T *out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
extern __shared__ char* shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
const T *in_c = in + blockIdx.x * channel_size;
extern __shared__ char *shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T *>(shared_max_data_tmp);
T local_max_data = T(0);
for (int i = tid; i < channel_size; i += blockDim.x) {
T tmp = static_cast<T>(
......@@ -122,17 +126,16 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
}
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
const int cin, const int cout,
T* out) {
extern __shared__ char* shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
__global__ void FindChannelAbsMaxKernelQuantAxis1(
const T *in, const int n, const int cin, const int cout, T *out) {
extern __shared__ char *shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T *>(shared_max_data_tmp);
int cout_wh_size = n / cin;
int wh_size = n / (cin * cout);
int tid = threadIdx.x;
int bid = blockIdx.x;
const T* in_current = in + tid * cout_wh_size + bid * wh_size;
const T *in_current = in + tid * cout_wh_size + bid * wh_size;
T local_max_data = T(0);
for (int i = 0; i < wh_size; i++) {
T tmp = static_cast<T>(
......@@ -162,24 +165,26 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_tensor, const int quant_axis,
T* out_abs_max) {
void operator()(const platform::CUDADeviceContext &ctx,
const framework::Tensor &in_tensor,
const int quant_axis,
T *out_abs_max) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
quant_axis == 0 || quant_axis == 1,
true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
const int num = in_tensor.numel();
auto in_dims = in_tensor.dims();
const T* in_data = in_tensor.data<T>();
const T *in_data = in_tensor.data<T>();
if (quant_axis == 0) {
int cout = in_dims[0];
int grid = cout;
int block = 1024;
FindChannelAbsMaxKernelQuantAxis0<T>
<<<grid, block, block * sizeof(T), ctx.stream()>>>(in_data, num, cout,
out_abs_max);
<<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, cout, out_abs_max);
} else if (quant_axis == 1) {
int cin = in_dims[0];
int cout = in_dims[1];
......@@ -213,9 +218,12 @@ struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
__global__ void ClipAndQuantKernel(const T* in, const T* scale,
const int bin_cnt, const int round_type,
const int n, T* out) {
__global__ void ClipAndQuantKernel(const T *in,
const T *scale,
const int bin_cnt,
const int round_type,
const int n,
T *out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
......@@ -227,25 +235,30 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale,
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
ComputeDataType x = static_cast<ComputeDataType>(in[i]);
x = bin_cnt_t * inv_s * x;
if (round_type == 0) {
x = bin_cnt_t * inv_s * x;
x = roundWithTiesToEven(x);
ComputeDataType max_bound = bin_cnt_t;
ComputeDataType min_bound = -bin_cnt_t - static_cast<ComputeDataType>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out[i] = static_cast<T>(x);
} else {
x = round(x);
ComputeDataType v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt_t * inv_s * v;
out[i] = static_cast<T>(round(v));
}
ComputeDataType max_bound = bin_cnt_t;
ComputeDataType min_bound = -bin_cnt_t - static_cast<ComputeDataType>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out[i] = static_cast<T>(x);
}
}
template <typename T>
__global__ void ClipAndQuantDequantKernel(const T* in, const T* scale,
__global__ void ClipAndQuantDequantKernel(const T *in,
const T *scale,
const int bin_cnt,
const int round_type, const int n,
T* out) {
const int round_type,
const int n,
T *out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
......@@ -257,33 +270,39 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale,
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
ComputeDataType x = static_cast<ComputeDataType>(in[i]);
x = bin_cnt_t * inv_s * x;
if (round_type == 0) {
x = bin_cnt_t * inv_s * x;
x = roundWithTiesToEven(x);
ComputeDataType max_bound = bin_cnt_t;
ComputeDataType min_bound = -bin_cnt_t - static_cast<ComputeDataType>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out[i] = static_cast<T>((x * s) / bin_cnt_t);
} else {
x = x > s ? s : x;
x = x < -s ? -s : x;
x = bin_cnt_t * inv_s * x;
x = round(x);
out[i] = static_cast<T>((x * s) / bin_cnt_t);
}
ComputeDataType max_bound = bin_cnt_t;
ComputeDataType min_bound = -bin_cnt_t - static_cast<ComputeDataType>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out[i] = static_cast<T>((x * s) / bin_cnt_t);
}
}
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int round_type,
framework::Tensor* out) {
void operator()(const platform::CUDADeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
framework::Tensor *out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
const T *in_data = in.data<T>();
const T *scale_data = scale.data<T>();
T *out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, round_type, num, out_data);
......@@ -294,17 +313,19 @@ template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int round_type,
framework::Tensor* out) {
void operator()(const platform::CUDADeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
framework::Tensor *out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
const T *in_data = in.data<T>();
const T *scale_data = scale.data<T>();
T *out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantDequantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, round_type, num, out_data);
......@@ -313,16 +334,18 @@ struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
// ChannelClipAndQuantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
__global__ void ChannelClipAndQuantKernelQuantAxis0(const T *in,
const T *scale,
const int bin_cnt,
const int round_type,
const int64_t n,
const int c, T* out) {
const int c,
T *out) {
int tid = threadIdx.x;
int64_t channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
const T *in_c = in + blockIdx.x * channel_size;
T *out_c = out + blockIdx.x * channel_size;
using ComputeDataType = typename QuantizeDataType<T>::type;
......@@ -332,25 +355,33 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
for (int64_t i = tid; i < channel_size; i += blockDim.x) {
ComputeDataType x = static_cast<ComputeDataType>(in_c[i]);
x = bin_cnt_t * inv_s * x;
if (round_type == 0) {
x = bin_cnt_t * inv_s * x;
x = roundWithTiesToEven(x);
ComputeDataType max_bound = bin_cnt_t;
ComputeDataType min_bound = -bin_cnt_t - static_cast<ComputeDataType>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out_c[i] = static_cast<T>(x);
} else {
x = round(x);
ComputeDataType v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt_t * inv_s * v;
out_c[i] = static_cast<T>(round(v));
}
ComputeDataType max_bound = bin_cnt_t;
ComputeDataType min_bound = -bin_cnt_t - static_cast<ComputeDataType>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out_c[i] = static_cast<T>(x);
}
}
// ChannelClipAndQuantKernel for quant_axis is N
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxisN(
const T* in, const T* scale, const int bin_cnt, const int round_type,
const int64_t n, const int nScale, const int quant_stride, T* out) {
__global__ void ChannelClipAndQuantKernelQuantAxisN(const T *in,
const T *scale,
const int bin_cnt,
const int round_type,
const int64_t n,
const int nScale,
const int quant_stride,
T *out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
using ComputeDataType = typename QuantizeDataType<T>::type;
ComputeDataType bin_cnt_t = static_cast<ComputeDataType>(bin_cnt);
......@@ -359,37 +390,44 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN(
static_cast<ComputeDataType>(scale[(i / quant_stride) % nScale]);
ComputeDataType inv_s = inverse(s);
ComputeDataType x = static_cast<ComputeDataType>(in[i]);
x = bin_cnt_t * inv_s * x;
if (round_type == 0) {
x = bin_cnt_t * inv_s * x;
x = roundWithTiesToEven(x);
ComputeDataType max_bound = bin_cnt_t;
ComputeDataType min_bound = -bin_cnt_t - static_cast<ComputeDataType>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out[i] = static_cast<T>(x);
} else {
x = round(x);
ComputeDataType v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt_t * inv_s * v;
out[i] = static_cast<T>(round(v));
}
ComputeDataType max_bound = bin_cnt_t;
ComputeDataType min_bound = -bin_cnt_t - static_cast<ComputeDataType>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out[i] = static_cast<T>(x);
}
}
template <typename T>
struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int round_type, const int quant_axis,
framework::Tensor* out) {
void operator()(const platform::CUDADeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
const int quant_axis,
framework::Tensor *out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
quant_axis == 0 || quant_axis == 1,
true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int64_t num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
const T *in_data = in.data<T>();
const T *scale_data = scale.data<T>();
T *out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
......@@ -411,9 +449,15 @@ struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
ChannelClipAndQuantKernelQuantAxisN<T><<<grid_size, block_size>>>(
in_data, scale_data, bin_cnt, round_type, num, in_dims[quant_axis],
quant_stride, out_data);
ChannelClipAndQuantKernelQuantAxisN<T>
<<<grid_size, block_size>>>(in_data,
scale_data,
bin_cnt,
round_type,
num,
in_dims[quant_axis],
quant_stride,
out_data);
}
}
};
......@@ -422,12 +466,14 @@ template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
float>;
template <typename T>
__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
const T* last_scale,
const int64_t* iter,
const int window_size, T* scale_arr,
T* out_scale, int* need_find_max,
int* out_size) {
__global__ void FindRangeAbsMaxAndFillArray(const T *cur_scale,
const T *last_scale,
const int64_t *iter,
const int window_size,
T *scale_arr,
T *out_scale,
int *need_find_max,
int *out_size) {
int it = iter[0];
int idx = it % window_size;
T removed = scale_arr[idx];
......@@ -446,45 +492,63 @@ __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& cur_scale,
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) {
void operator()(const platform::CUDADeviceContext &ctx,
const framework::Tensor &cur_scale,
const framework::Tensor &last_scale,
const framework::Tensor &iter,
const int window_size,
framework::Tensor *scales_arr,
framework::Tensor *out_scale) {
const auto gpu_place = ctx.GetPlace();
T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
T *scale_arr = scales_arr->mutable_data<T>(gpu_place);
T *out_scale_data = out_scale->mutable_data<T>(gpu_place);
framework::Tensor need_find_max, out_size;
int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
window_size, scale_arr, out_scale_data, find_max, out_size_data);
int *find_max = need_find_max.mutable_data<int>({1}, gpu_place);
int *out_size_data = out_size.mutable_data<int>({1}, gpu_place);
FindRangeAbsMaxAndFillArray<T>
<<<1, 1, 0, ctx.stream()>>>(cur_scale.data<T>(),
last_scale.data<T>(),
iter.data<int64_t>(),
window_size,
scale_arr,
out_scale_data,
find_max,
out_size_data);
int g_find_max;
memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
sizeof(int), ctx.stream());
memory::Copy(platform::CPUPlace(),
&g_find_max,
gpu_place,
find_max,
sizeof(int),
ctx.stream());
ctx.Wait();
if (g_find_max) {
int len;
memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
sizeof(int), ctx.stream());
memory::Copy(platform::CPUPlace(),
&len,
gpu_place,
out_size_data,
sizeof(int),
ctx.stream());
ctx.Wait();
FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
out_scale_data);
FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(
ctx, scale_arr, len, out_scale_data);
}
}
};
template <typename T>
__global__ void FindMovingAverageAbsMaxKernel(const T* in_state,
const T* in_accum,
const T* cur_scale, const T rate,
T* out_state, T* out_accum,
T* out_scale) {
__global__ void FindMovingAverageAbsMaxKernel(const T *in_state,
const T *in_accum,
const T *cur_scale,
const T rate,
T *out_state,
T *out_accum,
T *out_scale) {
T state = rate * (*in_state) + T(1.0f);
T accum = rate * (*in_accum) + (*cur_scale);
*out_state = state;
......@@ -496,92 +560,119 @@ template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_accum,
const framework::Tensor& in_state, const T* cur_scale,
const float rate, framework::Tensor* out_state,
framework::Tensor* out_accum, framework::Tensor* out_scale) {
void operator()(const platform::CUDADeviceContext &ctx,
const framework::Tensor &in_accum,
const framework::Tensor &in_state,
const T *cur_scale,
const float rate,
framework::Tensor *out_state,
framework::Tensor *out_accum,
framework::Tensor *out_scale) {
const auto gpu_place = ctx.GetPlace();
T rate_t = static_cast<T>(rate);
T* out_state_data = out_state->mutable_data<T>(gpu_place);
T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
out_state_data, out_accum_data, out_scale_data);
T *out_state_data = out_state->mutable_data<T>(gpu_place);
T *out_accum_data = out_accum->mutable_data<T>(gpu_place);
T *out_scale_data = out_scale->mutable_data<T>(gpu_place);
FindMovingAverageAbsMaxKernel<T>
<<<1, 1, 0, ctx.stream()>>>(in_state.data<T>(),
in_accum.data<T>(),
cur_scale,
rate_t,
out_state_data,
out_accum_data,
out_scale_data);
}
};
// ChannelClipAndQuantDequantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
const T* in, const T* scale, const int bin_cnt, const int round_type,
const int n, const int c, T* out) {
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(const T *in,
const T *scale,
const int bin_cnt,
const int round_type,
const int n,
const int c,
T *out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
const T *in_c = in + blockIdx.x * channel_size;
T *out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
x = bin_cnt * inv_s * x;
if (round_type == 0) {
x = bin_cnt * inv_s * x;
x = roundWithTiesToEven(x);
T max_bound = bin_cnt;
T min_bound = -bin_cnt - static_cast<T>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out_c[i] = (x * s) / bin_cnt;
} else {
x = round(x);
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
T max_bound = bin_cnt;
T min_bound = -bin_cnt - static_cast<T>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out_c[i] = (x * s) / bin_cnt;
}
}
// ChannelClipAndQuantDequantKernel for quant_axis is 1
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
const T* in, const T* scale, const int bin_cnt, const int round_type,
const int n, const int cin, const int cout, T* out) {
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in,
const T *scale,
const int bin_cnt,
const int round_type,
const int n,
const int cin,
const int cout,
T *out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
const T *in_c = in + blockIdx.x * wh_size;
T *out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
x = bin_cnt * inv_s * x;
if (round_type == 0) {
x = bin_cnt * inv_s * x;
x = roundWithTiesToEven(x);
T max_bound = bin_cnt;
T min_bound = -bin_cnt - static_cast<T>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out_c[i] = (x * s) / bin_cnt;
} else {
x = round(x);
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
T max_bound = bin_cnt;
T min_bound = -bin_cnt - static_cast<T>(1);
x = x > max_bound ? max_bound : x;
x = x < min_bound ? min_bound : x;
out_c[i] = (x * s) / bin_cnt;
}
}
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int round_type, const int quant_axis,
framework::Tensor* out) {
void operator()(const platform::CUDADeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
const int quant_axis,
framework::Tensor *out) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
quant_axis == 0 || quant_axis == 1,
true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
......@@ -589,25 +680,34 @@ struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
int num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
const T *in_data = in.data<T>();
const T *scale_data = scale.data<T>();
T *out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis0<T>
<<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
round_type, num, in_dims[0],
<<<grid, block, 0, ctx.stream()>>>(in_data,
scale_data,
bin_cnt,
round_type,
num,
in_dims[0],
out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis1<T>
<<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
round_type, num, in_dims[0],
in_dims[1], out_data);
<<<grid, block, 0, ctx.stream()>>>(in_data,
scale_data,
bin_cnt,
round_type,
num,
in_dims[0],
in_dims[1],
out_data);
}
}
};
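For readers tracing the two branches above, the element-wise math that both ChannelClipAndQuantDequant kernels apply per channel can be sketched in NumPy as follows; the helper name and scalar-scale signature are illustrative only and not part of this patch.

import numpy as np

def fake_quant_dequant_channel(x, s, bin_cnt, round_type):
    # Sketch of the per-element math for one channel with scale s (assumes s > 0).
    if round_type == 0:
        # scale, round half to even, clip, then dequantize
        q = np.round(x * bin_cnt / s)              # np.round ties to even
        q = np.clip(q, -bin_cnt - 1, bin_cnt)
        return q * s / bin_cnt
    # round_type == 1: clip to [-s, s], scale, round half away from zero, dequantize
    v = np.clip(x, -s, s) * bin_cnt / s
    q = np.where(v >= 0, np.floor(v + 0.5), np.ceil(v - 0.5))
    return q * s / bin_cnt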
......
......@@ -51,16 +51,11 @@ inline HOSTDEVICE T roundWithTiesToEven(T x) {
template <typename T>
class QuantTensorFunctor {
public:
explicit QuantTensorFunctor(const T bin_cnt, const int round_type,
const T inv_s)
: bin_cnt_(bin_cnt), round_type_(round_type), inv_s_(inv_s) {}
explicit QuantTensorFunctor(const T bin_cnt, const T inv_s)
: bin_cnt_(bin_cnt), inv_s_(inv_s) {}
HOSTDEVICE T operator()(const T x) const {
T out = bin_cnt_ * inv_s_ * x;
if (round_type_ == 0) {
out = roundWithTiesToEven(out);
} else if (round_type_ == 1) {
out = std::round(out);
}
out = roundWithTiesToEven(out);
T max_bound = bin_cnt_;
T min_bound = -bin_cnt_ - static_cast<T>(1);
out = out > max_bound ? max_bound : out;
......@@ -70,82 +65,101 @@ class QuantTensorFunctor {
private:
T bin_cnt_;
int round_type_;
T inv_s_;
};
template <typename DeviceContext, typename T>
struct FindAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const T* in, const int num, T* out);
void operator()(const DeviceContext &ctx, const T *in, const int num, T *out);
};
template <typename DeviceContext, typename T>
struct ClipAndFakeQuantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
const int round_type, framework::Tensor* out);
void operator()(const DeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
framework::Tensor *out);
};
template <typename DeviceContext, typename T>
struct ClipAndFakeQuantDequantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
int round_type, framework::Tensor* out);
void operator()(const DeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
int round_type,
framework::Tensor *out);
};
template <typename DeviceContext, typename T>
struct FindRangeAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& cur_scale,
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale);
void operator()(const DeviceContext &ctx,
const framework::Tensor &cur_scale,
const framework::Tensor &last_scale,
const framework::Tensor &iter,
const int window_size,
framework::Tensor *scales_arr,
framework::Tensor *out_scale);
};
template <typename DeviceContext, typename T>
struct FindChannelAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in_tensor,
const int quant_axis, T* out_abs_max);
void operator()(const DeviceContext &ctx,
const framework::Tensor &in_tensor,
const int quant_axis,
T *out_abs_max);
};
template <typename DeviceContext, typename T>
struct ChannelClipAndFakeQuantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
const int round_type, const int quant_axis,
framework::Tensor* out);
void operator()(const DeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
const int round_type,
const int quant_axis,
framework::Tensor *out);
};
template <typename DeviceContext, typename T>
struct ChannelClipFakeQuantDequantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
int round_type, const int quant_axis, framework::Tensor* out);
void operator()(const DeviceContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
int round_type,
const int quant_axis,
framework::Tensor *out);
};
template <typename DeviceContext, typename T>
struct FindMovingAverageAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
const framework::Tensor& in_state,
const framework::Tensor& cur_scale,
framework::Tensor* out_state, framework::Tensor* out_accum,
framework::Tensor* out_scale);
void operator()(const DeviceContext &ctx,
const framework::Tensor &in_accum,
const framework::Tensor &in_state,
const framework::Tensor &cur_scale,
framework::Tensor *out_state,
framework::Tensor *out_accum,
framework::Tensor *out_scale);
};
template <typename DeviceContext, typename T>
class FakeAbsMaxKernelBase : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_s = out_scale->mutable_data<T>(context.GetPlace());
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::Tensor>("X");
auto *out = context.Output<framework::Tensor>("Out");
auto *out_scale = context.Output<framework::Tensor>("OutScale");
T *out_s = out_scale->mutable_data<T>(context.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int round_type = context.Attr<int>("round_type");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
const T* in_data = in->data<T>();
auto &dev_ctx = context.template device_context<DeviceContext>();
const T *in_data = in->data<T>();
FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in_data, in->numel(), out_s);
RunClipFunctor(dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
}
......@@ -153,20 +167,25 @@ class FakeAbsMaxKernelBase : public framework::OpKernel<T> {
virtual ~FakeAbsMaxKernelBase() = default;
protected:
virtual void RunClipFunctor(const DeviceContext& dev_ctx,
const framework::Tensor& in,
const framework::Tensor& scale, int bin_cnt,
int round_type, framework::Tensor* out) const = 0;
virtual void RunClipFunctor(const DeviceContext &dev_ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
int bin_cnt,
int round_type,
framework::Tensor *out) const = 0;
};
template <typename DeviceContext, typename T>
class FakeQuantizeAbsMaxKernel : public FakeAbsMaxKernelBase<DeviceContext, T> {
protected:
void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
const framework::Tensor& scale, int bin_cnt,
int round_type, framework::Tensor* out) const override {
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, in, scale, bin_cnt,
round_type, out);
void RunClipFunctor(const DeviceContext &dev_ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
int bin_cnt,
int round_type,
framework::Tensor *out) const override {
ClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, in, scale, bin_cnt, round_type, out);
}
};
......@@ -174,9 +193,12 @@ template <typename DeviceContext, typename T>
class FakeQuantizeDequantizeAbsMaxKernel
: public FakeAbsMaxKernelBase<DeviceContext, T> {
protected:
void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
const framework::Tensor& scale, int bin_cnt,
int round_type, framework::Tensor* out) const override {
void RunClipFunctor(const DeviceContext &dev_ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
int bin_cnt,
int round_type,
framework::Tensor *out) const override {
ClipAndFakeQuantDequantFunctor<DeviceContext, T>()(
dev_ctx, in, scale, bin_cnt, round_type, out);
}
......@@ -185,11 +207,11 @@ class FakeQuantizeDequantizeAbsMaxKernel
template <typename DeviceContext, typename T>
class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
auto *out = context.Output<framework::Tensor>("Out");
auto *out_scale = context.Output<framework::Tensor>("OutScale");
out->mutable_data<T>(context.GetPlace());
int bit_length = context.Attr<int>("bit_length");
......@@ -198,11 +220,11 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
int quant_axis = context.Attr<int>("quant_axis");
bool is_test = context.Attr<bool>("is_test");
auto& dev_ctx = context.template device_context<DeviceContext>();
auto &dev_ctx = context.template device_context<DeviceContext>();
if (!is_test) {
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
out_scale_data);
T *out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
FindChannelAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, *in, quant_axis, out_scale_data);
}
ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, round_type, quant_axis, out);
......@@ -213,12 +235,12 @@ template <typename DeviceContext, typename T>
class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::Tensor>("X");
auto *out = context.Output<framework::Tensor>("Out");
auto *out_scale = context.Output<framework::Tensor>("OutScale");
T *out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
int bit_length = context.Attr<int>("bit_length");
......@@ -226,8 +248,8 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
int bin_cnt = std::pow(2, bit_length - 1) - 1;
int quant_axis = context.Attr<int>("quant_axis");
FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
out_scale_data);
FindChannelAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, *in, quant_axis, out_scale_data);
ChannelClipFakeQuantDequantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, round_type, quant_axis, out);
......@@ -237,60 +259,64 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
template <typename DeviceContext, typename T>
class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* in_scale = context.Input<framework::Tensor>("InScale");
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::Tensor>("X");
auto *in_scale = context.Input<framework::Tensor>("InScale");
auto* out = context.Output<framework::Tensor>("Out");
auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
bool is_test = context.Attr<bool>("is_test");
int bit_length = context.Attr<int>("bit_length");
int round_type = context.Attr<int>("round_type");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
auto &dev_ctx = context.template device_context<DeviceContext>();
// testing
if (is_test) {
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
bin_cnt, round_type, out);
ClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *in_scale, bin_cnt, round_type, out);
return;
}
// training
auto* out_scale = context.Output<framework::Tensor>("OutScale");
auto* out_scales = context.Output<framework::Tensor>("OutScales");
auto* iter = context.Input<framework::Tensor>("Iter");
auto *out_scale = context.Output<framework::Tensor>("OutScale");
auto *out_scales = context.Output<framework::Tensor>("OutScales");
auto *iter = context.Input<framework::Tensor>("Iter");
int window_size = context.Attr<int>("window_size");
out_scale->mutable_data<T>(context.GetPlace());
framework::Tensor cur_scale;
T* cur_scale_data = cur_scale.mutable_data<T>({1}, context.GetPlace());
FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
cur_scale_data);
FindRangeAbsMaxFunctor<DeviceContext, T>()(dev_ctx, cur_scale, *in_scale,
*iter, window_size, out_scales,
T *cur_scale_data = cur_scale.mutable_data<T>({1}, context.GetPlace());
FindAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, in->data<T>(), in->numel(), cur_scale_data);
FindRangeAbsMaxFunctor<DeviceContext, T>()(dev_ctx,
cur_scale,
*in_scale,
*iter,
window_size,
out_scales,
out_scale);
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
bin_cnt, round_type, out);
ClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
}
};
template <typename DeviceContext, typename T>
class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* in_scale = context.Input<framework::Tensor>("InScale");
auto* out = context.Output<framework::Tensor>("Out");
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::Tensor>("X");
auto *in_scale = context.Input<framework::Tensor>("InScale");
auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
bool is_test = context.Attr<bool>("is_test");
int bit_length = context.Attr<int>("bit_length");
int round_type = context.Attr<int>("round_type");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
auto& dev_ctx = context.template device_context<DeviceContext>();
auto &dev_ctx = context.template device_context<DeviceContext>();
// testing
if (is_test) {
......@@ -299,25 +325,30 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel<T> {
}
// training
auto* in_accum = context.Input<framework::Tensor>("InAccum");
auto* in_state = context.Input<framework::Tensor>("InState");
auto *in_accum = context.Input<framework::Tensor>("InAccum");
auto *in_state = context.Input<framework::Tensor>("InState");
auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
T *cur_scale_data = static_cast<T *>(cur_scale->ptr());
FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
cur_scale_data);
FindAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, in->data<T>(), in->numel(), cur_scale_data);
auto* out_state = context.Output<framework::Tensor>("OutState");
auto* out_accum = context.Output<framework::Tensor>("OutAccum");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
auto *out_state = context.Output<framework::Tensor>("OutState");
auto *out_accum = context.Output<framework::Tensor>("OutAccum");
auto *out_scale = context.Output<framework::Tensor>("OutScale");
out_state->mutable_data<T>(context.GetPlace());
out_accum->mutable_data<T>(context.GetPlace());
out_scale->mutable_data<T>(context.GetPlace());
float moving_rate = context.Attr<float>("moving_rate");
FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state,
out_accum, out_scale);
FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(dev_ctx,
*in_accum,
*in_state,
cur_scale_data,
moving_rate,
out_state,
out_accum,
out_scale);
RunClipFunctor(dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
}
......@@ -325,21 +356,26 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel<T> {
virtual ~FakeMovingAverageAbsMaxKernelBase() = default;
protected:
virtual void RunClipFunctor(const DeviceContext& dev_ctx,
const framework::Tensor& in,
const framework::Tensor& in_scale, int bin_cnt,
int round_type, framework::Tensor* out) const = 0;
virtual void RunClipFunctor(const DeviceContext &dev_ctx,
const framework::Tensor &in,
const framework::Tensor &in_scale,
int bin_cnt,
int round_type,
framework::Tensor *out) const = 0;
};
template <typename DeviceContext, typename T>
class FakeQuantizeMovingAverageAbsMaxKernel
: public FakeMovingAverageAbsMaxKernelBase<DeviceContext, T> {
protected:
void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
const framework::Tensor& in_scale, int bin_cnt,
int round_type, framework::Tensor* out) const override {
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, in, in_scale, bin_cnt,
round_type, out);
void RunClipFunctor(const DeviceContext &dev_ctx,
const framework::Tensor &in,
const framework::Tensor &in_scale,
int bin_cnt,
int round_type,
framework::Tensor *out) const override {
ClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, in, in_scale, bin_cnt, round_type, out);
}
};
......@@ -347,9 +383,12 @@ template <typename DeviceContext, typename T>
class FakeQuantizeDequantizeMovingAverageAbsMaxKernel
: public FakeMovingAverageAbsMaxKernelBase<DeviceContext, T> {
protected:
void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
const framework::Tensor& in_scale, int bin_cnt,
int round_type, framework::Tensor* out) const override {
void RunClipFunctor(const DeviceContext &dev_ctx,
const framework::Tensor &in,
const framework::Tensor &in_scale,
int bin_cnt,
int round_type,
framework::Tensor *out) const override {
ClipAndFakeQuantDequantFunctor<DeviceContext, T>()(
dev_ctx, in, in_scale, bin_cnt, round_type, out);
}
......@@ -358,12 +397,12 @@ class FakeQuantizeDequantizeMovingAverageAbsMaxKernel
template <typename DeviceContext, typename T>
class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto& dev_ctx = context.template device_context<DeviceContext>();
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<framework::Tensor>("X");
auto &dev_ctx = context.template device_context<DeviceContext>();
if (context.HasOutput("Out")) {
auto* out = context.Output<framework::Tensor>("Out");
auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out);
}
......@@ -375,40 +414,46 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
}
// training
auto* in_accum = context.Input<framework::Tensor>("InAccum");
auto* in_state = context.Input<framework::Tensor>("InState");
auto *in_accum = context.Input<framework::Tensor>("InAccum");
auto *in_state = context.Input<framework::Tensor>("InState");
auto cur_scale = memory::Alloc(dev_ctx, sizeof(T));
T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
T *cur_scale_data = static_cast<T *>(cur_scale->ptr());
FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
cur_scale_data);
FindAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, in->data<T>(), in->numel(), cur_scale_data);
auto* out_state = context.Output<framework::Tensor>("OutState");
auto* out_accum = context.Output<framework::Tensor>("OutAccum");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
auto *out_state = context.Output<framework::Tensor>("OutState");
auto *out_accum = context.Output<framework::Tensor>("OutAccum");
auto *out_scale = context.Output<framework::Tensor>("OutScale");
out_state->mutable_data<T>(context.GetPlace());
out_accum->mutable_data<T>(context.GetPlace());
out_scale->mutable_data<T>(context.GetPlace());
float moving_rate = context.Attr<float>("moving_rate");
FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state,
out_accum, out_scale);
FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(dev_ctx,
*in_accum,
*in_state,
cur_scale_data,
moving_rate,
out_state,
out_accum,
out_scale);
}
};
template <typename DeviceContext, typename T>
class StrightThroughEstimatorGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* d_out =
void Compute(const framework::ExecutionContext &context) const override {
auto *d_out =
context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto x_grad_name = framework::GradVarName("X");
auto* d_x = context.Output<framework::LoDTensor>(x_grad_name);
PADDLE_ENFORCE_NOT_NULL(d_x, platform::errors::PreconditionNotMet(
"StrightThroughEstimatorGradKernel "
"doesn't have the output named %s.",
x_grad_name));
auto *d_x = context.Output<framework::LoDTensor>(x_grad_name);
PADDLE_ENFORCE_NOT_NULL(d_x,
platform::errors::PreconditionNotMet(
"StrightThroughEstimatorGradKernel "
"doesn't have the output named %s.",
x_grad_name));
// Initialize dx as same as d_out
d_x->mutable_data<T>(context.GetPlace());
......
......@@ -26,14 +26,17 @@ namespace operators {
template <typename T>
struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, const int quant_axis, framework::Tensor* out) {
void operator()(const platform::CPUDeviceContext &dev_ctx,
const framework::Tensor *in,
const framework::Tensor *scale,
T max_range,
const int quant_axis,
framework::Tensor *out) {
// Dequant op is before quantized op
// Dequantize the weight of quantized op
auto in_dims = in->dims();
const int64_t channel = in_dims[quant_axis];
const T* scale_factor = scale->data<T>();
const T *scale_factor = scale->data<T>();
if (quant_axis == 0) {
for (int64_t i = 0; i < channel; i++) {
T s = scale_factor[i];
......@@ -41,7 +44,7 @@ struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, T> {
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
auto& dev = *dev_ctx.eigen_device();
auto &dev = *dev_ctx.eigen_device();
out_e.device(dev) = in_e * s / max_range;
}
} else if (quant_axis == 1) {
......@@ -51,12 +54,12 @@ struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, T> {
}
int64_t step_i = in->numel() / out_iter;
int64_t step_j = in->numel() / (out_iter * channel);
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
auto *in_data = in->data<T>();
auto *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
for (int64_t i = 0; i < out_iter; i++) {
for (int64_t j = 0; j < channel; j++) {
auto* cur_in = in_data + i * step_i + j * step_j;
auto* cur_out = out_data + i * step_i + j * step_j;
auto *cur_in = in_data + i * step_i + j * step_j;
auto *cur_out = out_data + i * step_i + j * step_j;
T s = scale_factor[j];
for (int64_t k = 0; k < step_j; k++) {
*cur_out = (*cur_in) * s / max_range;
......@@ -75,11 +78,11 @@ template struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, double>;
class QuantizeLinearOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasInput("ZeroPoint"), "Input", "ZeroPoint",
"QuantizeLinear");
OP_INOUT_CHECK(
ctx->HasInput("ZeroPoint"), "Input", "ZeroPoint", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "QuantizeLinear");
ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
......@@ -95,7 +98,7 @@ class QuantizeLinearOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
......@@ -116,9 +119,10 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
.AddCustomChecker([](const int &quant_axis) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1 || quant_axis == -1, true,
quant_axis == 0 || quant_axis == 1 || quant_axis == -1,
true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
......@@ -126,8 +130,9 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
});
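As a concrete illustration of the quant_axis description above (hypothetical weight shapes, not part of the op definition), the per-channel scale is an abs-max reduction over every axis except quant_axis:

import numpy as np

w = np.random.randn(8, 3, 3, 3).astype(np.float32)  # e.g. conv2d weight [cout, cin, kh, kw]

# quant_axis == 0: one abs-max scale per output channel (conv2d, depthwise_conv2d)
scales_axis0 = np.abs(w).reshape(w.shape[0], -1).max(axis=1)

# quant_axis == 1: one abs-max scale along the second axis (conv2d_transpose, mul weights)
scales_axis1 = np.abs(w).transpose(1, 0, 2, 3).reshape(w.shape[1], -1).max(axis=1)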
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
.AddCustomChecker([](const int &bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16,
true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
......@@ -140,13 +145,17 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
"1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
"round(2.5)=3")
.SetDefault(0)
.AddCustomChecker([](const int& round_type) {
PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true,
platform::errors::InvalidArgument(
"'round_type' should be between 0 and 1, but "
"the received is %d",
round_type));
});
.AddCustomChecker([](const int &round_type) {
PADDLE_ENFORCE_EQ(
round_type == 0 || round_type == 1,
true,
platform::errors::InvalidArgument(
"'round_type' should be 0 or 1, 0 rounding to "
"nearest ties to even and 1 is rounding to nearest "
"ties away from zero.but the received is %d",
round_type));
})
.AsExtra();
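A quick NumPy illustration of the two round_type modes documented above (not part of the operator definition): mode 0 matches NumPy's default half-to-even rounding, while mode 1 rounds ties away from zero.

import numpy as np

# round_type = 0: rounding to nearest, ties to even (NumPy default)
assert np.round(0.5) == 0.0 and np.round(1.5) == 2.0 and np.round(2.5) == 2.0

# round_type = 1: rounding to nearest, ties away from zero
def round_away_from_zero(v):
    return np.floor(v + 0.5) if v >= 0 else np.ceil(v - 0.5)

assert round_away_from_zero(1.5) == 2.0 and round_away_from_zero(2.5) == 3.0
assert round_away_from_zero(-2.5) == -3.0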
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
......@@ -170,14 +179,18 @@ namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
quantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker,
quantize_linear,
ops::QuantizeLinearOp,
ops::QuantizeLinearOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(quantize_linear, ops::QuantizeLinearKernel<CPU, float>);
REGISTER_OPERATOR(
dequantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker,
dequantize_linear,
ops::QuantizeLinearOp,
ops::QuantizeLinearOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
......
......@@ -121,8 +121,7 @@ class PostTrainingQuantization(object):
algo="KL",
hist_percent=0.99999,
quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
weight_round_algo='round',
round_type='TiesToEven',
round_type='round',
learning_rate=0.001,
is_full_quantize=False,
bias_correction=False,
......@@ -181,14 +180,10 @@ class PostTrainingQuantization(object):
quantizable_op_type(list[str], optional): List the type of ops
that will be quantized. Default is ["conv2d", "depthwise_conv2d",
"mul"].
weight_round_algo(str, optional): The method of converting the quantized weights
round_type(str, optional): The method of converting the quantized weights
value float->int. Currently supports ['round', 'adaround'] methods.
Default is `round`, which rounds to the nearest integer.
'adaround' refers to https://arxiv.org/abs/2004.10568.
round_type(str, optional): The method of converting the tensor value float->int.
Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods.
Default is `TiesToEven`, which is rounding to nearest ties to even.
'TiesAwayFromZero' is rounding to nearest ties away from zero.
learning_rate(float, optional): The learning rate of adaround method.
is_full_quantized(bool, optional): If set is_full_quantized as True,
apply quantization to all supported quantizable op type. If set
......@@ -269,10 +264,8 @@ class PostTrainingQuantization(object):
self._support_algo_type = [
'KL', 'hist', 'avg', 'mse', 'emd', 'abs_max', 'min_max'
]
assert round_type in ['TiesToEven', 'TiesAwayFromZero']
assert round_type in ['adaround', 'round']
self._round_type = round_type
assert weight_round_algo in ['adaround', 'round']
self._weight_round_algo = weight_round_algo
self._learning_rate = learning_rate
self._dynamic_quantize_op_type = ['lstm']
self._support_quantize_op_type = \
......@@ -414,7 +407,7 @@ class PostTrainingQuantization(object):
if self._algo in ["KL", "hist"]:
self._calculate_kl_hist_threshold()
if self._weight_round_algo == 'adaround':
if self._round_type == 'adaround':
self._adaround_apply()
self._reset_activation_persistable()
......@@ -651,7 +644,6 @@ class PostTrainingQuantization(object):
float(np.max(np.abs(var_tensor[i]))))
self._quantized_threshold[var_name] = abs_max_value
_logger.info("MSE searching stage ...")
distribution = np.round if self._round_type == 'TiesToEven' else utils.round_c
for var_name in self._quantized_act_var_name:
var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor = var_tensor.flatten()
......@@ -664,9 +656,14 @@ class PostTrainingQuantization(object):
scale = s * abs_max_value
s += 0.02
bins = 2**(self._activation_bits - 1) - 1
quant_var = np.clip(distribution(var_tensor / scale * bins),
-bins - 1, bins)
quant_dequant_var = quant_var / bins * scale
if self._onnx_format:
quant_var = np.clip(distribution(var_tensor / scale * bins),
-bins - 1, bins)
quant_dequant_var = quant_var / bins * scale
else:
quant_dequant_var = np.round(
np.clip(var_tensor, 0.0, scale) / scale *
bins) / bins * scale
mse_loss = ((var_tensor - quant_dequant_var)**2).mean()
if mse_loss <= self._best_calibration_loss[var_name]:
self._best_calibration_loss[var_name] = mse_loss
......@@ -691,7 +688,6 @@ class PostTrainingQuantization(object):
float(np.max(np.abs(var_tensor[i]))))
self._quantized_threshold[var_name] = abs_max_value
_logger.info("EMD searching stage ...")
distribution = np.round if self._round_type == 'TiesToEven' else utils.round_c
for var_name in self._quantized_act_var_name:
var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor = var_tensor.flatten()
......@@ -704,9 +700,14 @@ class PostTrainingQuantization(object):
scale = s * abs_max_value
s += 0.02
bins = 2**(self._activation_bits - 1) - 1
quant_var = np.clip(distribution(var_tensor / scale * bins),
-bins - 1, bins)
quant_dequant_var = quant_var / bins * scale
if self._onnx_format:
quant_var = np.clip(distribution(var_tensor / scale * bins),
-bins - 1, bins)
quant_dequant_var = quant_var / bins * scale
else:
quant_dequant_var = np.round(
np.clip(var_tensor, 0.0, scale) / scale *
bins) / bins * scale
emd_loss = np.abs(
np.mean(var_tensor) - np.mean(quant_dequant_var)) + np.abs(
np.std(var_tensor) - np.std(quant_dequant_var))
......@@ -918,8 +919,7 @@ class PostTrainingQuantization(object):
activation_bits=self._activation_bits,
activation_quantize_type=self._activation_quantize_type,
weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types,
round_type=self._round_type)
quantizable_op_type=major_quantizable_op_types)
else:
transform_pass = QuantizationTransformPassV2(
scope=self._scope,
......@@ -928,8 +928,7 @@ class PostTrainingQuantization(object):
activation_bits=self._activation_bits,
activation_quantize_type=self._activation_quantize_type,
weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types,
round_type=self._round_type)
quantizable_op_type=major_quantizable_op_types)
for sub_graph in graph.all_sub_graphs():
# Insert fake_quant/fake_dequantize op must in test graph, so
......@@ -946,15 +945,13 @@ class PostTrainingQuantization(object):
add_quant_dequant_pass = AddQuantDequantPass(
scope=self._scope,
place=self._place,
quantizable_op_type=minor_quantizable_op_types,
round_type=self._round_type)
quantizable_op_type=minor_quantizable_op_types)
else:
add_quant_dequant_pass = AddQuantDequantPassV2(
scope=self._scope,
place=self._place,
quantizable_op_type=minor_quantizable_op_types,
is_full_quantized=self._is_full_quantize,
round_type=self._round_type)
is_full_quantized=self._is_full_quantize)
for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True
......@@ -979,7 +976,6 @@ class PostTrainingQuantization(object):
place=self._place,
bias_correction=self._bias_correction,
weight_bits=self._weight_bits,
weight_round_algo=self._weight_round_algo,
round_type=self._round_type,
activation_bits=self._activation_bits,
weight_quantize_type=self._weight_quantize_type,
......
......@@ -119,7 +119,6 @@ class QuantizationTransformPass(object):
moving_rate=0.9,
skip_pattern=['skip_quant'],
quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'],
round_type='TiesToEven',
weight_quantize_func=None,
act_quantize_func=None,
weight_preprocess_func=None,
......@@ -157,10 +156,6 @@ class QuantizationTransformPass(object):
quantizable_op_type(list[str]): List the type of ops that will be quantized.
Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in
QuantizationFreezePass and ConvertToInt8Pass must be the same as this.
round_type(str, optional): The method of converting the tensor value float->int.
Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods.
Default is `TiesToEven`, which is rounding to nearest ties to even.
'TiesAwayFromZero' is rounding to nearest ties away from zero.
weight_quantize_func(function): Function that defines how to quantize weight.
Using this can quickly test if user's quantization method works or not.
In this function, user should both define quantization function and
......@@ -211,7 +206,6 @@ class QuantizationTransformPass(object):
self._weight_bits = weight_bits
self._activation_bits = activation_bits
self._skip_pattern = skip_pattern
self._round_type = round_type
self._weight_quantize_func = weight_quantize_func
self._act_quantize_func = act_quantize_func
self._weight_preprocess_func = weight_preprocess_func
......@@ -465,12 +459,10 @@ class QuantizationTransformPass(object):
_init_var_node(scale_var_node,
np.zeros(scale_var_node.shape(), dtype=data_type),
self._scope, self._place)
round_type = 0 if self._round_type == 'TiesToEven' else 1
quant_op_node = graph.create_op_node(
op_type='fake_quantize_abs_max',
attrs={
'bit_length': quant_bits,
'round_type': round_type,
'op_role': core.op_proto_and_checker_maker.OpRole.Forward
},
inputs={'X': var_node},
......@@ -525,11 +517,9 @@ class QuantizationTransformPass(object):
inputs['Iter'] = self._global_step
outputs['OutScales'] = scales_node
round_type = 0 if self._round_type == 'TiesToEven' else 1
attrs = {
'window_size': self._window_size,
'bit_length': quant_bits,
'round_type': round_type,
'is_test': self._is_test,
'op_role': core.op_proto_and_checker_maker.OpRole.Forward
}
......@@ -600,10 +590,8 @@ class QuantizationTransformPass(object):
outs['OutState'] = state_out_node
outs['OutAccum'] = accum_out_node
round_type = 0 if self._round_type == 'TiesToEven' else 1
attrs = {
'bit_length': quant_bits,
'round_type': round_type,
'moving_rate': self._moving_rate,
'is_test': self._is_test,
'op_role': core.op_proto_and_checker_maker.OpRole.Forward
......@@ -650,12 +638,10 @@ class QuantizationTransformPass(object):
_init_var_node(scale_var_node,
np.zeros(scale_var_node.shape(), dtype=data_type),
self._scope, self._place)
round_type = 0 if self._round_type == 'TiesToEven' else 1
quant_op_node = graph.create_op_node(
op_type='fake_channel_wise_quantize_abs_max',
attrs={
'bit_length': quant_bits,
'round_type': round_type,
'quant_axis': quant_axis,
'is_test': self._is_test,
'op_role': core.op_proto_and_checker_maker.OpRole.Forward
......@@ -949,8 +935,7 @@ class QuantizationFreezePass(object):
bias_correction=False,
weight_bits=8,
activation_bits=8,
weight_round_algo='round',
round_type='TiesToEven',
round_type='round',
weight_quantize_type='abs_max',
quantizable_op_type=None):
"""
......@@ -968,14 +953,10 @@ class QuantizationFreezePass(object):
https://arxiv.org/abs/1810.05723.
weight_bits(int): quantization bit number for weights.
activation_bits(int): quantization bit number for activation.
weight_round_algo(str, optional): The method of converting the quantized weights
round_type(str, optional): The method of converting the quantized weights
value float->int. Currently supports ['round', 'adaround'] methods.
Default is `round`, which rounds to the nearest integer.
'adaround' refers to https://arxiv.org/abs/2004.10568.
round_type(str, optional): The method of converting the tensor value float->int.
Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods.
Default is `TiesToEven`, which is rounding to nearest ties to even.
'TiesAwayFromZero' is rounding to nearest ties away from zero.
weight_quantize_type(str): quantization type for weights, support 'abs_max' and
'channel_wise_abs_max'. The 'range_abs_max' usually is not used for weight,
since weights are fixed once the model is well trained.
......@@ -991,7 +972,6 @@ class QuantizationFreezePass(object):
self._place = _get_paddle_place(place)
self._weight_bits = weight_bits
self._activation_bits = activation_bits
self._weight_round_algo = weight_round_algo
self._round_type = round_type
self._weight_quantize_type = weight_quantize_type
self._fake_quant_op_names = _fake_quant_op_list
......@@ -1039,7 +1019,7 @@ class QuantizationFreezePass(object):
scale_v = scale_v.tolist()
self._quant_var_scale_map[input_arg_name] = scale_v
# Quantize weight and restore
if self._weight_round_algo == 'round':
if self._round_type == 'round':
param_v = self._load_var(input_arg_name)
if any(
_check_grandchild_op_node(op_node, op)
......@@ -1049,7 +1029,8 @@ class QuantizationFreezePass(object):
quant_axis = 0
quantized_param_v = utils.quant_tensor(
param_v.copy(), scale_v, quant_axis,
self._weight_bits, self._round_type)
self._weight_bits)
quantized_param_v = np.round(quantized_param_v)
# Weight bias correction
if self._bias_correction == True:
quantized_param_v = utils.bias_correction_w(
......@@ -1058,6 +1039,7 @@ class QuantizationFreezePass(object):
scale_v,
quant_axis,
weight_bits=self._weight_bits)
quantized_param_v = np.round(quantized_param_v)
self._restore_var(input_arg_name, quantized_param_v)
self._remove_fake_quant_and_dequant_op(graph, op_node)
......@@ -1600,8 +1582,7 @@ class AddQuantDequantPass(object):
quant_bits=8,
skip_pattern=["skip_quant"],
quantizable_op_type=["elementwise_add", "pool2d"],
is_full_quantized=False,
round_type='TiesToEven'):
is_full_quantized=False):
"""
Constructor.
......@@ -1623,10 +1604,6 @@ class AddQuantDequantPass(object):
quantization to all supported quantizable op type. If set is_full_quantized
as False, only apply quantization to the op type according to the input
quantizable_op_type.
round_type(str, optional): The method of converting the tensor value float->int.
Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods.
Default is `TiesToEven`, which is rounding to nearest ties to even.
'TiesAwayFromZero' is rounding to nearest ties away from zero.
"""
self._scope = scope
self._place = _get_paddle_place(place)
......@@ -1634,7 +1611,6 @@ class AddQuantDequantPass(object):
self._quant_bits = quant_bits
self._is_test = None
self._skip_pattern = skip_pattern
self._round_type = round_type
if is_full_quantized:
self._quantizable_op_type = utils._act_supported_quantizable_op_type
......@@ -1769,10 +1745,8 @@ class AddQuantDequantPass(object):
outs['OutState'] = state_out_node
outs['OutAccum'] = accum_out_node
round_type = 0 if self._round_type == 'TiesToEven' else 1
attrs = {
'bit_length': quant_bits,
'round_type': round_type,
'moving_rate': self._moving_rate,
'is_test': self._is_test,
'op_role': core.op_proto_and_checker_maker.OpRole.Forward
......@@ -1812,10 +1786,6 @@ class InsertQuantizeLinear(object):
Default is -1.
channel_wise(bool, optional): Whether quantization with per channel or not. Default is False.
is_test(bool, optional): Whether quantization with training or not. Default is True.
round_type(str, optional): The method of converting the tensor value float->int.
Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods.
Default is `TiesToEven`, which is rounding to nearest ties to even.
'TiesAwayFromZero' is rounding to nearest ties away from zero.
"""
def __init__(self,
......@@ -1824,15 +1794,13 @@ class InsertQuantizeLinear(object):
quant_bits=8,
quant_axis=-1,
channel_wise=False,
is_test=True,
round_type='TiesToEven'):
is_test=True):
self._place = place
self._scope = scope
self.quant_bits = quant_bits
self.quant_axis = quant_axis
self.channel_wise = channel_wise
self._is_test = is_test
self._round_type = round_type
def insert_quant_op(self, graph, var_node):
assert var_node.is_var(), '{} is not a var'.format(var_node.name())
......@@ -1875,12 +1843,7 @@ class InsertQuantizeLinear(object):
if zero_point_node is not None:
inputs["ZeroPoint"] = zero_point_node
round_type = 0 if self._round_type == 'TiesToEven' else 1
attrs = {
"quant_axis": self.quant_axis,
"bit_length": self.quant_bits,
"round_type": round_type
}
attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
outputs = {"Y": quant_var_node}
if not self._is_test:
attrs["is_test"] = self._is_test
......@@ -1985,7 +1948,6 @@ class QuantizationTransformPassV2(object):
moving_rate=0.9,
skip_pattern=['skip_quant'],
quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'],
round_type='TiesToEven',
weight_quantize_func=None,
act_quantize_func=None,
weight_preprocess_func=None,
......@@ -2021,10 +1983,6 @@ class QuantizationTransformPassV2(object):
quantizable_op_type(list[str]): List the type of ops that will be quantized.
Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in
QuantizationFreezePass and ConvertToInt8Pass must be the same as this.
round_type(str, optional): The method of converting the tensor value float->int.
Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods.
Default is `TiesToEven`, which is rounding to nearest ties to even.
'TiesAwayFromZero' is rounding to nearest ties away from zero.
weight_quantize_func(function): Function that defines how to quantize weight.
Using this can quickly test if user's quantization method works or not.
In this function, user should both define quantization function and
......@@ -2074,7 +2032,6 @@ class QuantizationTransformPassV2(object):
self._weight_bits = weight_bits
self._activation_bits = activation_bits
self._skip_pattern = skip_pattern
self._round_type = round_type
self._weight_quantize_func = weight_quantize_func
self._act_quantize_func = act_quantize_func
self._weight_preprocess_func = weight_preprocess_func
......@@ -2198,8 +2155,7 @@ class QuantizationTransformPassV2(object):
quant_bits=quant_bits,
quant_axis=quant_axis,
channel_wise=channel_wise,
is_test=self._is_test,
round_type=self._round_type)
is_test=self._is_test)
quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
graph, var_node)
dequant_var_node = insert_quant_pass.insert_dequant_op(
......@@ -2307,8 +2263,7 @@ class AddQuantDequantPassV2(object):
quant_bits=8,
skip_pattern=["skip_quant"],
quantizable_op_type=["elementwise_add", "pool2d"],
is_full_quantized=False,
round_type='TiesToEven'):
is_full_quantized=False):
"""
Args:
scope(paddle.Scope): The scope is used to initialize these new parameters.
......@@ -2328,10 +2283,6 @@ class AddQuantDequantPassV2(object):
quantization to all supported quantizable op type. If set is_full_quantized
as False, only apply quantization to the op type according to the input
quantizable_op_type.
round_type(str, optional): The method of converting the tensor value float->int.
Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods.
Default is `TiesToEven`, which is rounding to nearest ties to even.
'TiesAwayFromZero' is rounding to nearest ties away from zero.
Examples:
.. code-block:: python
......@@ -2354,7 +2305,6 @@ class AddQuantDequantPassV2(object):
self._quant_bits = quant_bits
self._is_test = None
self._skip_pattern = skip_pattern
self._round_type = round_type
if is_full_quantized:
self._quantizable_op_type = utils._act_supported_quantizable_op_type
......@@ -2427,8 +2377,7 @@ class AddQuantDequantPassV2(object):
quant_bits=self._quant_bits,
quant_axis=-1,
channel_wise=False,
is_test=self._is_test,
round_type=self._round_type)
is_test=self._is_test)
quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
graph, in_node)
dequant_var_node = insert_quant_pass.insert_dequant_op(
......@@ -2511,8 +2460,6 @@ class ReplaceFakeQuantDequantPass(object):
"quant_axis") else -1
bit_length = op.op().attr("bit_length") if op.op().has_attr(
"bit_length") else 8
round_type = op.op().attr("round_type") if op.op().has_attr(
"round_type") else 0
zero_point_node = None
quanted_node = x_node
......@@ -2534,8 +2481,7 @@ class ReplaceFakeQuantDequantPass(object):
quant_op_node = graph.create_op_node(op_type="quantize_linear",
attrs={
"quant_axis": quant_axis,
"bit_length": bit_length,
"round_type": round_type
"bit_length": bit_length
},
inputs={
"X": x_node,
......@@ -2654,11 +2600,11 @@ class QuantWeightPass(object):
param_v = self._load_var(x_node.name())
quant_axis = _op.op().attr("quant_axis")
bits_length = _op.op().attr("bit_length")
round_type = _op.op().attr("round_type") if _op.op().has_attr(
"round_type") else 0
quantized_param_v = utils.quant_tensor(param_v.copy(), scale_v,
quant_axis, bits_length,
round_type)
quantized_param_v = utils.quant_tensor(param_v.copy(),
scale_v,
quant_axis,
bits_length,
onnx_format=True)
if self._bias_correction == True:
quantized_param_v = utils.bias_correction_w(
param_v,
......
......@@ -321,39 +321,41 @@ def set_variable_data(scope, place, var_name, np_value):
tensor.set(np_value, place)
def round_c_single_element(val):
dtype = type(val)
if val >= 0:
return dtype(np.floor(val + 0.5))
return dtype(np.ceil(val - 0.5))
def quant_tensor(x, scale, quant_axis=0, weight_bits=8, onnx_format=False):
# symmetry quant
def _clip(x, scale):
x[x > scale] = scale
x[x < -scale] = -scale
return x
# rounding to nearest ties away from zero
round_c = np.vectorize(round_c_single_element)
def quant_tensor(x,
scale,
quant_axis=0,
weight_bits=8,
round_type='TiesToEven'):
assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.'
distribution = np.round if round_type == 'TiesToEven' else round_c
bnt = (1 << (weight_bits - 1)) - 1
if isinstance(scale, list):
for i, s in enumerate(scale):
if s == 0.0:
s = 1e-8
if quant_axis == 0:
x[i] = distribution(x[i] / s * bnt)
x[i] = np.clip(x[i], -bnt - 1, bnt)
if onnx_format:
x[i] = np.round(x[i] / s * bnt)
x[i] = np.clip(x[i], -bnt - 1, bnt)
else:
x[i] = _clip(x[i], s)
x[i] = x[i] / s * bnt
else:
x[:, i] = distribution(x[:, i] / s * bnt)
x[:, i] = np.clip(x[:, i], -bnt - 1, bnt)
if onnx_format:
x[:, i] = np.round(x[:, i] / s * bnt)
x[:, i] = np.clip(x[:, i], -bnt - 1, bnt)
else:
x[:, i] = _clip(x[:, i], s)
x[:, i] = x[:, i] / s * bnt
else:
scale = 1e-8 if scale == 0.0 else scale
x = distribution(x / scale * bnt)
x = np.clip(x, -bnt - 1, bnt)
if onnx_format:
x = np.round(x / scale * bnt)
x = np.clip(x, -bnt - 1, bnt)
else:
x = _clip(x, scale)
x = x / scale * bnt
return x
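A minimal, self-contained restatement of the scalar-scale branch of quant_tensor above, to make explicit what onnx_format toggles; the function name and sample values are illustrative, and in the non-ONNX path the unrounded levels are rounded later by the caller (as QuantizationFreezePass does above).

import numpy as np

def quant_tensor_sketch(x, scale, weight_bits=8, onnx_format=False):
    # Mirrors the scalar-scale branch above (illustrative, not the library API).
    bnt = (1 << (weight_bits - 1)) - 1          # 127 for 8 bits
    scale = 1e-8 if scale == 0.0 else scale
    if onnx_format:
        x = np.round(x / scale * bnt)           # integer quantization levels
        return np.clip(x, -bnt - 1, bnt)
    x = np.clip(x, -scale, scale)               # _clip(x, scale)
    return x / scale * bnt                      # scaled levels, not rounded here

w = np.array([0.30, -1.20, 0.75], dtype=np.float32)
s = float(np.max(np.abs(w)))
print(quant_tensor_sketch(w.copy(), s, onnx_format=True))   # e.g. [  32. -127.   79.]
print(quant_tensor_sketch(w.copy(), s, onnx_format=False))  # e.g. [  31.75 -127.    79.375]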
......
......@@ -165,7 +165,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
model_path,
data_path,
algo="KL",
weight_round_algo="round",
round_type="round",
quantizable_op_type=["conv2d"],
is_full_quantize=False,
is_use_cache_file=False,
......@@ -185,7 +185,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
batch_nums=batch_nums,
algo=algo,
quantizable_op_type=quantizable_op_type,
weight_round_algo=weight_round_algo,
round_type=round_type,
is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model,
onnx_format=onnx_format,
......@@ -201,7 +201,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
data_url,
data_md5,
algo,
weight_round_algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
......@@ -224,7 +224,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
print("Start post training quantization for {0} on {1} samples ...".
format(model_name, quant_iterations))
self.generate_quantized_model(fp32_model_path, data_path, algo,
weight_round_algo, quantizable_op_type,
round_type, quantizable_op_type,
is_full_quantize, is_use_cache_file,
is_optimize_model, quant_iterations,
onnx_format)
......@@ -255,7 +255,7 @@ class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization):
data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz"
data_md5 = "add84c754e9b792fea1fbd728d134ab7"
algo = "avg"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["mul", "lstm"]
is_full_quantize = False
is_use_cache_file = False
......@@ -264,7 +264,7 @@ class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization):
infer_iterations = 100
quant_iterations = 10
self.run_test(model_name, model_url, model_md5, data_name, data_url,
data_md5, algo, weight_round_algo, quantizable_op_type,
data_md5, algo, round_type, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
diff_threshold, infer_iterations, quant_iterations)
......@@ -279,7 +279,7 @@ class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization):
data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz"
data_md5 = "add84c754e9b792fea1fbd728d134ab7"
algo = "avg"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["mul", "lstm"]
is_full_quantize = False
is_use_cache_file = False
......@@ -295,7 +295,7 @@ class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization):
data_url,
data_md5,
algo,
weight_round_algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
......
......@@ -108,7 +108,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
def generate_quantized_model(self,
model_path,
algo="KL",
weight_round_algo="round",
round_type="round",
quantizable_op_type=["conv2d"],
is_full_quantize=False,
is_use_cache_file=False,
......@@ -130,7 +130,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
batch_nums=batch_nums,
algo=algo,
quantizable_op_type=quantizable_op_type,
weight_round_algo=weight_round_algo,
round_type=round_type,
is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model,
bias_correction=bias_correction,
......@@ -145,7 +145,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
data_url,
data_md5,
algo,
weight_round_algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
......@@ -169,11 +169,10 @@ class TestPostTrainingQuantization(unittest.TestCase):
print("Start INT8 post training quantization for {0} on {1} images ...".
format(model_name, quant_iterations * batch_size))
self.generate_quantized_model(origin_model_path, algo,
weight_round_algo, quantizable_op_type,
is_full_quantize, is_use_cache_file,
is_optimize_model, batch_size,
quant_iterations, onnx_format,
self.generate_quantized_model(origin_model_path, algo, round_type,
quantizable_op_type, is_full_quantize,
is_use_cache_file, is_optimize_model,
batch_size, quant_iterations, onnx_format,
skip_tensor_list, bias_correction)
print("Start INT8 inference for {0} on {1} images ...".format(
......@@ -204,7 +203,7 @@ class TestPostTrainingKLForMnist(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "KL"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
......@@ -213,7 +212,7 @@ class TestPostTrainingKLForMnist(TestPostTrainingQuantization):
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, weight_round_algo,
self.run_test(model_name, data_url, data_md5, algo, round_type,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold, batch_size,
infer_iterations, quant_iterations)
......@@ -226,7 +225,7 @@ class TestPostTraininghistForMnist(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "hist"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
......@@ -235,7 +234,7 @@ class TestPostTraininghistForMnist(TestPostTrainingQuantization):
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, weight_round_algo,
self.run_test(model_name, data_url, data_md5, algo, round_type,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold, batch_size,
infer_iterations, quant_iterations)
......@@ -248,7 +247,7 @@ class TestPostTrainingmseForMnist(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "mse"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
......@@ -257,7 +256,7 @@ class TestPostTrainingmseForMnist(TestPostTrainingQuantization):
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, weight_round_algo,
self.run_test(model_name, data_url, data_md5, algo, round_type,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold, batch_size,
infer_iterations, quant_iterations)
......@@ -270,7 +269,7 @@ class TestPostTrainingemdForMnist(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "emd"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
......@@ -279,7 +278,7 @@ class TestPostTrainingemdForMnist(TestPostTrainingQuantization):
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, weight_round_algo,
self.run_test(model_name, data_url, data_md5, algo, round_type,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold, batch_size,
infer_iterations, quant_iterations)
......@@ -292,7 +291,7 @@ class TestPostTrainingavgForMnist(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "avg"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
......@@ -301,7 +300,7 @@ class TestPostTrainingavgForMnist(TestPostTrainingQuantization):
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, weight_round_algo,
self.run_test(model_name, data_url, data_md5, algo, round_type,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold, batch_size,
infer_iterations, quant_iterations)
......@@ -314,7 +313,7 @@ class TestPostTrainingAbsMaxForMnist(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "abs_max"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["conv2d", "mul"]
is_full_quantize = True
is_use_cache_file = False
......@@ -323,7 +322,7 @@ class TestPostTrainingAbsMaxForMnist(TestPostTrainingQuantization):
batch_size = 10
infer_iterations = 50
quant_iterations = 10
self.run_test(model_name, data_url, data_md5, algo, weight_round_algo,
self.run_test(model_name, data_url, data_md5, algo, round_type,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold, batch_size,
infer_iterations, quant_iterations)
......@@ -336,7 +335,7 @@ class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "mse"
weight_round_algo = "adaround"
round_type = "adaround"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
......@@ -350,7 +349,7 @@ class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization):
data_url,
data_md5,
algo,
weight_round_algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
......@@ -369,7 +368,7 @@ class TestPostTrainingKLAdaroundForMnist(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "KL"
weight_round_algo = "adaround"
round_type = "adaround"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
......@@ -378,7 +377,7 @@ class TestPostTrainingKLAdaroundForMnist(TestPostTrainingQuantization):
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(model_name, data_url, data_md5, algo, weight_round_algo,
self.run_test(model_name, data_url, data_md5, algo, round_type,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold, batch_size,
infer_iterations, quant_iterations)
......@@ -391,7 +390,7 @@ class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "mse"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
......@@ -405,7 +404,7 @@ class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization):
data_url,
data_md5,
algo,
weight_round_algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
......@@ -425,7 +424,7 @@ class TestPostTrainingmseForMnistONNXFormatFullQuant(
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "mse"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = True
is_use_cache_file = False
......@@ -439,7 +438,7 @@ class TestPostTrainingmseForMnistONNXFormatFullQuant(
data_url,
data_md5,
algo,
weight_round_algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
......@@ -458,7 +457,7 @@ class TestPostTrainingavgForMnistSkipOP(TestPostTrainingQuantization):
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "avg"
weight_round_algo = "round"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
......@@ -472,7 +471,7 @@ class TestPostTrainingavgForMnistSkipOP(TestPostTrainingQuantization):
data_url,
data_md5,
algo,
weight_round_algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
......
......@@ -242,7 +242,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
model_path,
quantizable_op_type,
algo="KL",
weight_round_algo="round",
round_type="round",
is_full_quantize=False,
is_use_cache_file=False,
is_optimize_model=False,
......@@ -264,7 +264,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
model_dir=model_path,
algo=algo,
quantizable_op_type=quantizable_op_type,
weight_round_algo=weight_round_algo,
round_type=round_type,
is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model,
onnx_format=onnx_format,
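For reference, a minimal sketch of how the renamed keyword surfaces in user code. Only the `algo`, `quantizable_op_type`, `round_type`, `is_full_quantize`, `optimize_model`, and `onnx_format` keywords mirror the constructor call in this diff; the import path, executor, calibration reader, and model paths below are assumptions for illustration.

```python
# Hedged sketch: executor, sample generator, and model paths are hypothetical;
# the point is that `round_type` replaces the earlier `weight_round_algo`.
import numpy as np
import paddle
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

def sample_generator():
    # hypothetical calibration reader yielding single MNIST-shaped samples
    for _ in range(10):
        yield [np.random.random((1, 28, 28)).astype("float32")]

ptq = PostTrainingQuantization(
    executor=exe,
    model_dir="./mnist_model",        # hypothetical model directory
    sample_generator=sample_generator,
    algo="mse",
    quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
    round_type="round",               # "round" or "adaround", as in the tests above
    is_full_quantize=False,
    optimize_model=False,
    onnx_format=False)
ptq.quantize()
ptq.save_quantized_model("./quantized_mnist_model")
```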
......@@ -275,7 +275,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
def run_test(self,
model,
algo,
weight_round_algo,
round_type,
data_urls,
data_md5s,
quantizable_op_type,
......@@ -299,10 +299,9 @@ class TestPostTrainingQuantization(unittest.TestCase):
print("Start INT8 post training quantization for {0} on {1} images ...".
format(model, sample_iterations * batch_size))
self.generate_quantized_model(model_cache_folder + "/model",
quantizable_op_type, algo,
weight_round_algo, is_full_quantize,
is_use_cache_file, is_optimize_model,
onnx_format)
quantizable_op_type, algo, round_type,
is_full_quantize, is_use_cache_file,
is_optimize_model, onnx_format)
print("Start INT8 inference for {0} on {1} images ...".format(
model, infer_iterations * batch_size))
......@@ -330,7 +329,7 @@ class TestPostTrainingKLForMobilenetv1(TestPostTrainingQuantization):
def test_post_training_kl_mobilenetv1(self):
model = "MobileNet-V1"
algo = "KL"
weight_round_algo = "round"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
]
......@@ -345,7 +344,7 @@ class TestPostTrainingKLForMobilenetv1(TestPostTrainingQuantization):
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.025
self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
self.run_test(model, algo, round_type, data_urls, data_md5s,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold)
......@@ -355,7 +354,7 @@ class TestPostTrainingavgForMobilenetv1(TestPostTrainingQuantization):
def test_post_training_avg_mobilenetv1(self):
model = "MobileNet-V1"
algo = "avg"
weight_round_algo = "round"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
]
......@@ -369,7 +368,7 @@ class TestPostTrainingavgForMobilenetv1(TestPostTrainingQuantization):
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.025
self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
self.run_test(model, algo, round_type, data_urls, data_md5s,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold)
......@@ -379,7 +378,7 @@ class TestPostTraininghistForMobilenetv1(TestPostTrainingQuantization):
def test_post_training_hist_mobilenetv1(self):
model = "MobileNet-V1"
algo = "hist"
weight_round_algo = "round"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
]
......@@ -393,7 +392,7 @@ class TestPostTraininghistForMobilenetv1(TestPostTrainingQuantization):
is_use_cache_file = False
is_optimize_model = True
diff_threshold = 0.03
self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
self.run_test(model, algo, round_type, data_urls, data_md5s,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold)
......@@ -403,7 +402,7 @@ class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization):
def test_post_training_abs_max_mobilenetv1(self):
model = "MobileNet-V1"
algo = "abs_max"
weight_round_algo = "round"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
]
......@@ -417,7 +416,7 @@ class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization):
is_optimize_model = False
# The accuracy diff of post-training quantization (abs_max) maybe bigger
diff_threshold = 0.05
self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
self.run_test(model, algo, round_type, data_urls, data_md5s,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold)
......@@ -427,7 +426,7 @@ class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization):
def test_post_training_onnx_format_mobilenetv1(self):
model = "MobileNet-V1"
algo = "avg"
weight_round_algo = "round"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
]
......@@ -444,7 +443,7 @@ class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization):
diff_threshold = 0.05
self.run_test(model,
algo,
weight_round_algo,
round_type,
data_urls,
data_md5s,
quantizable_op_type,
......
......@@ -25,7 +25,7 @@ class TestPostTrainingForResnet50(TestPostTrainingQuantization):
def test_post_training_resnet50(self):
model = "ResNet-50"
algo = "min_max"
weight_round_algo = "round"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
]
......@@ -35,7 +35,7 @@ class TestPostTrainingForResnet50(TestPostTrainingQuantization):
is_use_cache_file = False
is_optimize_model = False
diff_threshold = 0.025
self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
self.run_test(model, algo, round_type, data_urls, data_md5s,
quantizable_op_type, is_full_quantize, is_use_cache_file,
is_optimize_model, diff_threshold)
......@@ -45,7 +45,7 @@ class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingQuantization):
def test_post_training_resnet50(self):
model = "ResNet-50"
algo = "min_max"
weight_round_algo = "round"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
]
......@@ -58,7 +58,7 @@ class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingQuantization):
onnx_format = True
self.run_test(model,
algo,
weight_round_algo,
round_type,
data_urls,
data_md5s,
quantizable_op_type,
......
......@@ -49,7 +49,7 @@ class TestFakeQuantizeAbsMaxOp(OpTest):
dtype,
input_shape,
distribution,
round_type='TiesToEven'):
round_type='TiesAwayFromZero'):
input_data = distribution(input_shape).astype(dtype)
compute_type = get_compute_type(dtype)
scale = np.max(np.abs(input_data))
......@@ -58,12 +58,12 @@ class TestFakeQuantizeAbsMaxOp(OpTest):
if round_type == 'TiesToEven':
round_out = np.round(
input_data.astype(compute_type) * inv_scale * bnt)
output_data = np.clip(round_out, -bnt - 1, bnt)
self.attrs['round_type'] = 0
else:
round_out = round_c(
output_data = round_c(
input_data.astype(compute_type) * inv_scale * bnt)
self.attrs['round_type'] = 1
output_data = np.clip(round_out, -bnt - 1, bnt)
self.inputs = {'X': input_data}
self.outputs = {'Out': output_data, 'OutScale': scale}
self.dtype = dtype
......@@ -75,7 +75,7 @@ class TestFakeQuantizeAbsMaxOp(OpTest):
def test_fake_quantize_abs_max_round1(self):
self._fake_quantize_abs_max(np.float32, (124, 240),
np.random.random,
round_type='TiesAwayFromZero')
round_type='TiesToEven')
def test_fake_quantize_abs_max_float16(self):
self._fake_quantize_abs_max(np.float16, (124, 240), np.random.random)
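The two `round_type` values exercised above use different reference roundings: `round_type == 0` ('TiesToEven') rounds with `np.round` and then clips, while `round_type == 1` ('TiesAwayFromZero') uses the test file's `round_c` helper, assumed here to round halves away from zero; in the abs_max case the extra clip is dropped because dividing by the tensor's abs-max already bounds the scaled values. A small numpy comparison of the two tie-breaking rules:

```python
# Minimal sketch of the two tie-breaking rules; round_away_from_zero is a
# stand-in for the tests' round_c helper, which is assumed to behave this way.
import numpy as np

def round_away_from_zero(x):
    # round-half-away-from-zero, matching round_type == 1 ('TiesAwayFromZero')
    return np.sign(x) * np.floor(np.abs(x) + 0.5)

x = np.array([-2.5, -0.5, 0.5, 1.5, 2.5], dtype=np.float32)
print(np.round(x))               # ties to even:        [-2. -0.  0.  2.  2.]
print(round_away_from_zero(x))   # ties away from zero: [-3. -1.  1.  2.  3.]
```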
......@@ -110,12 +110,12 @@ class TestFakeChannelWiseQuantizeAbsMaxOp(OpTest):
if round_type == 'TiesToEven':
round_out = np.round(
input_data.astype(compute_type) / scale_broadcast * bnt)
output_data = np.clip(round_out, -bnt - 1, bnt)
self.attrs['round_type'] = 0
else:
round_out = round_c(
input_data.astype(compute_type) / scale_broadcast * bnt)
output_data = round_c(bnt * input_data.astype(compute_type) /
scale_broadcast)
self.attrs['round_type'] = 1
output_data = np.clip(round_out, -bnt - 1, bnt)
if quant_axis == 1:
scale_broadcast = np.transpose(scale_broadcast,
(1, ) + compute_axis)
......@@ -169,11 +169,15 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest):
round_out = np.round(
input_data.astype(compute_type) / out_scale[0] * bnt)
self.attrs['round_type'] = 0
output_data = np.clip(round_out, -bnt - 1, bnt)
else:
round_out = round_c(
input_data.astype(compute_type) / out_scale[0] * bnt)
if is_test:
clip_data = np.clip(input_data, -in_scale, in_scale)
else:
clip_data = input_data
output_data = round_c(
clip_data.astype(compute_type) / out_scale[0] * bnt)
self.attrs['round_type'] = 1
output_data = np.clip(round_out, -bnt - 1, bnt)
self.inputs = {
'X': input_data,
'Iter': np.zeros(1).astype(np.int64),
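In the range_abs_max branch above, the 'TiesAwayFromZero' reference now saturates the input to the running scale before rounding when `is_test` is set, instead of clipping the rounded integers afterwards. A minimal numpy sketch of that test-mode path, under the assumption that `out_scale[0]` equals the running `in_scale` in test mode:

```python
# Hedged sketch of the test-mode reference: clip to the running scale first,
# then quantize with round-half-away-from-zero (the round_c branch above).
import numpy as np

def range_abs_max_reference_test_mode(x, in_scale, bits=8):
    bnt = (1 << (bits - 1)) - 1                 # 127 for 8 bits
    clipped = np.clip(x, -in_scale, in_scale)   # saturate to the running scale
    scaled = clipped / in_scale * bnt
    return np.sign(scaled) * np.floor(np.abs(scaled) + 0.5)

x = np.array([-1.3, -0.2, 0.4, 0.9], dtype=np.float32)
print(range_abs_max_reference_test_mode(x, in_scale=1.0))  # [-127. -25. 51. 114.]
```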
......@@ -250,7 +254,7 @@ class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest):
distribution,
dequantize=False,
with_gradient=False,
round_type='TiesToEven'):
round_type='TiesAwayFromZero'):
input_data = distribution(input_shape).astype(dtype)
compute_type = get_compute_type(dtype)
bnt = (1 << (self.attrs['bit_length'] - 1)) - 1
......@@ -267,12 +271,12 @@ class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest):
if round_type == 'TiesToEven':
round_out = np.round(
input_data.astype(compute_type) / out_scale * bnt)
quant_data = np.clip(round_out, -bnt - 1, bnt)
self.attrs['round_type'] = 0
else:
round_out = round_c(
quant_data = round_c(
input_data.astype(compute_type) / out_scale * bnt)
self.attrs['round_type'] = 1
quant_data = np.clip(round_out, -bnt - 1, bnt)
if dequantize:
output_data = (quant_data * out_scale / bnt).astype(dtype)
self.op_type = 'fake_quantize_dequantize_moving_average_abs_max'
......@@ -307,10 +311,9 @@ class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest):
np.random.random)
def test_fake_quantize_moving_average_abs_max_round1(self):
self._fake_quantize_moving_average_abs_max(
np.float32, (8, 16, 7, 7),
np.random.random,
round_type='TiesAwayFromZero')
self._fake_quantize_moving_average_abs_max(np.float32, (8, 16, 7, 7),
np.random.random,
round_type='TiesToEven')
def test_fake_quantize_dequantize_moving_average_abs_max(self):
self._fake_quantize_moving_average_abs_max(np.float32, (8, 16, 7, 7),
......@@ -329,17 +332,17 @@ class TestFakeQuantizeDequantizeAbsMaxOp(OpTest):
dtype,
input_shape,
distribution,
round_type='TiesToEven'):
round_type='TiesAwayFromZero'):
input_data = distribution(input_shape).astype(dtype)
scale = np.max(np.abs(input_data)).astype(dtype)
bnt = (1 << (self.attrs['bit_length'] - 1)) - 1
if round_type == 'TiesToEven':
round_out = np.round(input_data / scale * bnt)
output_data = np.clip(round_out, -bnt - 1, bnt) * scale / bnt
self.attrs['round_type'] = 0
else:
round_out = round_c(input_data / scale * bnt)
output_data = round_c(input_data / scale * bnt) * scale / bnt
self.attrs['round_type'] = 1
output_data = np.clip(round_out, -bnt - 1, bnt) * scale / bnt
self.inputs = {'X': input_data}
self.outputs = {
'Out': output_data,
......@@ -357,7 +360,7 @@ class TestFakeQuantizeDequantizeAbsMaxOp(OpTest):
def test_fake_quantize_dequantize_abs_max_round1(self):
self._fake_quantize_dequantize_abs_max(np.float32, (124, 240),
np.random.random,
round_type='TiesAwayFromZero')
round_type='TiesToEven')
class TestChannelWiseFakeQuantizeDequantizeAbsMaxOp(OpTest):
......@@ -382,11 +385,13 @@ class TestChannelWiseFakeQuantizeDequantizeAbsMaxOp(OpTest):
scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True)
if round_type == 'TiesToEven':
round_out = np.round(bnt * output_data / scale_broadcast)
output_data = np.clip(round_out, -bnt - 1,
bnt) * scale_broadcast / bnt
self.attrs['round_type'] = 0
else:
round_out = round_c(bnt * output_data / scale_broadcast)
output_data = round_c(
bnt * output_data / scale_broadcast) * scale_broadcast / bnt
self.attrs['round_type'] = 1
output_data = np.clip(round_out, -bnt - 1, bnt) * scale_broadcast / bnt
if quant_axis == 1:
scale_broadcast = np.transpose(scale_broadcast,
(1, ) + compute_axis)
......
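Taken together, the per-tensor and channel-wise quantize-dequantize references above follow the same round trip: quantize against an abs-max scale, clip to the signed integer range, then rescale back. A compact numpy sketch of the per-tensor 'TiesToEven' variant:

```python
# Minimal sketch of the fake quantize-dequantize round trip (round_type == 0).
import numpy as np

def fake_quant_dequant_abs_max(x, bits=8):
    # quantize against the tensor's abs-max, clip, then dequantize back
    bnt = (1 << (bits - 1)) - 1
    scale = np.max(np.abs(x))
    q = np.clip(np.round(x / scale * bnt), -bnt - 1, bnt)
    return q * scale / bnt

x = np.random.uniform(-1, 1, (4, 4)).astype(np.float32)
y = fake_quant_dequant_abs_max(x)
print(np.max(np.abs(x - y)))   # quantization error, at most about scale / (2 * bnt)
```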