From 491b87b4fc78f2585bc20e5ab86df0f204800b32 Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Fri, 24 Jun 2022 17:27:59 +0800 Subject: [PATCH] fix quantization clip and round Attribute (#43764) --- paddle/fluid/operators/fake_quantize_op.cc | 550 ++++++++++++------ paddle/fluid/operators/fake_quantize_op.cu.h | 436 ++++++++------ paddle/fluid/operators/fake_quantize_op.h | 317 +++++----- paddle/fluid/operators/quantize_linear_op.cc | 65 ++- .../post_training_quantization.py | 52 +- .../slim/quantization/quantization_pass.py | 90 +-- .../fluid/contrib/slim/quantization/utils.py | 46 +- ...t_post_training_quantization_lstm_model.py | 16 +- .../test_post_training_quantization_mnist.py | 59 +- ..._post_training_quantization_mobilenetv1.py | 33 +- ...est_post_training_quantization_resnet50.py | 8 +- .../tests/unittests/test_fake_quantize_op.py | 51 +- 12 files changed, 997 insertions(+), 726 deletions(-) diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 94badfb1c24..61ee9d49ebe 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -33,8 +33,10 @@ struct Compare { template struct FindAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext& ctx, const T* in, - const int num, T* out) { + void operator()(const platform::CPUDeviceContext &ctx, + const T *in, + const int num, + T *out) { *out = std::abs(*(std::max_element(in + 0, in + num, Compare()))); } }; @@ -43,24 +45,26 @@ template struct FindAbsMaxFunctor; template struct FindChannelAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in_tensor, const int quant_axis, - T* out_abs_max) { + void operator()(const platform::CPUDeviceContext &ctx, + const framework::Tensor &in_tensor, + const int quant_axis, + T *out_abs_max) { // At present, channelwise quantization supports conv2d, depthwise_conv2d // conv2d_transpose and mul PADDLE_ENFORCE_EQ( - quant_axis == 0 || quant_axis == 1, true, + quant_axis == 0 || quant_axis == 1, + true, platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " "the received is %d", quant_axis)); - auto* in_data = in_tensor.data(); + auto *in_data = in_tensor.data(); auto in_dims = in_tensor.dims(); const int64_t channel = in_dims[quant_axis]; if (quant_axis == 0) { const int64_t channel_size = in_tensor.numel() / channel; for (int64_t i = 0; i < channel; i++) { - auto* start = in_data + i * channel_size; - auto* end = in_data + (i + 1) * channel_size; + auto *start = in_data + i * channel_size; + auto *end = in_data + (i + 1) * channel_size; out_abs_max[i] = std::abs(*(std::max_element(start, end, Compare()))); } @@ -72,8 +76,8 @@ struct FindChannelAbsMaxFunctor { const int64_t step_j = in_tensor.numel() / (in_dims[0] * in_dims[1]); for (int64_t i = 0; i < in_dims[0]; i++) { for (int64_t j = 0; j < in_dims[1]; j++) { - auto* start = in_data + i * step_i + j * step_j; - auto* end = in_data + i * step_i + (j + 1) * step_j; + auto *start = in_data + i * step_i + j * step_j; + auto *end = in_data + i * step_i + (j + 1) * step_j; T abs_max = std::abs(*(std::max_element(start, end, Compare()))); out_abs_max[j] = std::max(out_abs_max[j], abs_max); } @@ -86,16 +90,30 @@ template struct FindChannelAbsMaxFunctor; template struct ClipAndFakeQuantFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, const int round_type, - 
framework::Tensor* out) { + void operator()(const platform::CPUDeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + framework::Tensor *out) { T s = scale.data()[0]; T inv_s = inverse(s); platform::Transform trans; - trans(ctx, in.data(), in.data() + in.numel(), - out->mutable_data(ctx.GetPlace()), - QuantTensorFunctor(static_cast(bin_cnt), round_type, inv_s)); + if (round_type == 0) { + trans(ctx, + in.data(), + in.data() + in.numel(), + out->mutable_data(ctx.GetPlace()), + QuantTensorFunctor(static_cast(bin_cnt), inv_s)); + } else { + trans(ctx, + in.data(), + in.data() + in.numel(), + out->mutable_data(ctx.GetPlace()), + phi::ClipFunctor(-s, s)); + auto out_e = framework::EigenVector::Flatten(*out); + out_e.device(*ctx.eigen_device()) = (bin_cnt * inv_s * out_e).round(); + } } }; @@ -103,19 +121,34 @@ template struct ClipAndFakeQuantFunctor; template struct ClipAndFakeQuantDequantFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, const int round_type, - framework::Tensor* out) { + void operator()(const platform::CPUDeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + framework::Tensor *out) { T s = scale.data()[0]; T inv_s = inverse(s); platform::Transform trans; - trans(ctx, in.data(), in.data() + in.numel(), - out->mutable_data(ctx.GetPlace()), - QuantTensorFunctor(static_cast(bin_cnt), round_type, inv_s)); - auto out_e = framework::EigenVector::Flatten(*out); - out_e.device(*ctx.eigen_device()) = out_e * s / static_cast(bin_cnt); + if (round_type == 0) { + trans(ctx, + in.data(), + in.data() + in.numel(), + out->mutable_data(ctx.GetPlace()), + QuantTensorFunctor(static_cast(bin_cnt), inv_s)); + auto out_e = framework::EigenVector::Flatten(*out); + out_e.device(*ctx.eigen_device()) = out_e * s / static_cast(bin_cnt); + } else { + trans(ctx, + in.data(), + in.data() + in.numel(), + out->mutable_data(ctx.GetPlace()), + phi::ClipFunctor(-s, s)); + auto out_e = framework::EigenVector::Flatten(*out); + out_e.device(*ctx.eigen_device()) = + (bin_cnt * inv_s * out_e).round() * s / static_cast(bin_cnt); + } } }; template struct ClipAndFakeQuantDequantFunctor struct ChannelClipAndFakeQuantFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, const int round_type, const int quant_axis, - framework::Tensor* out) { + void operator()(const platform::CPUDeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + const int quant_axis, + framework::Tensor *out) { // At present, channelwise quantization supports conv2d, depthwise_conv2d // conv2d_transpose and mul PADDLE_ENFORCE_EQ( - quant_axis == 0 || quant_axis == 1, true, + quant_axis == 0 || quant_axis == 1, + true, platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " "the received is %d", quant_axis)); - auto* scale_data = scale.data(); - auto* in_data = in.data(); - auto* out_data = out->mutable_data(ctx.GetPlace()); + auto *scale_data = scale.data(); + auto *in_data = in.data(); + auto *out_data = out->mutable_data(ctx.GetPlace()); auto in_dims = in.dims(); const int64_t channel = in_dims[quant_axis]; platform::Transform trans; @@ -144,12 +181,31 @@ struct ChannelClipAndFakeQuantFunctor { const int64_t 
channel_size = in.numel() / channel; for (int64_t i = 0; i < channel; i++) { T s = scale_data[i]; - auto* start = in_data + i * channel_size; - auto* end = in_data + (i + 1) * channel_size; + auto *start = in_data + i * channel_size; + auto *end = in_data + (i + 1) * channel_size; T inv_s = inverse(s); - trans( - ctx, start, end, out_data + i * channel_size, - QuantTensorFunctor(static_cast(bin_cnt), round_type, inv_s)); + if (round_type == 0) { + trans(ctx, + start, + end, + out_data + i * channel_size, + QuantTensorFunctor(static_cast(bin_cnt), inv_s)); + } else { + trans(ctx, + start, + end, + out_data + i * channel_size, + phi::ClipFunctor(-s, s)); + } + } + if (round_type == 1) { + for (int64_t i = 0; i < channel; i++) { + T s = scale_data[i]; + T inv_s = inverse(s); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + auto out_e = framework::EigenVector::Flatten(one_channel_out); + out_e.device(*ctx.eigen_device()) = (bin_cnt * inv_s * out_e).round(); + } } } else if (quant_axis == 1) { const int64_t step_i = in.numel() / in_dims[0]; @@ -158,12 +214,21 @@ struct ChannelClipAndFakeQuantFunctor { for (int j = 0; j < in_dims[1]; j++) { T s = scale_data[j]; T inv_s = inverse(s); - auto* start = in_data + i * step_i + j * step_j; - auto* end = in_data + i * step_i + (j + 1) * step_j; - auto* cur_out_data = out_data + i * step_i + j * step_j; - trans(ctx, start, end, cur_out_data, - QuantTensorFunctor(static_cast(bin_cnt), round_type, - inv_s)); + auto *start = in_data + i * step_i + j * step_j; + auto *end = in_data + i * step_i + (j + 1) * step_j; + auto *cur_out_data = out_data + i * step_i + j * step_j; + if (round_type == 0) { + trans(ctx, + start, + end, + cur_out_data, + QuantTensorFunctor(static_cast(bin_cnt), inv_s)); + } else { + trans(ctx, start, end, cur_out_data, phi::ClipFunctor(-s, s)); + for (int k = 0; k < step_j; k++) { + cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]); + } + } } } } @@ -174,19 +239,23 @@ template struct ChannelClipAndFakeQuantFunctor; template struct ChannelClipFakeQuantDequantFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, const int round_type, const int quant_axis, - framework::Tensor* out) { + void operator()(const platform::CPUDeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + const int quant_axis, + framework::Tensor *out) { PADDLE_ENFORCE_EQ( - quant_axis == 0 || quant_axis == 1, true, + quant_axis == 0 || quant_axis == 1, + true, platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " "the received is %d", quant_axis)); - auto* scale_data = scale.data(); - auto* in_data = in.data(); - auto* out_data = out->mutable_data(ctx.GetPlace()); + auto *scale_data = scale.data(); + auto *in_data = in.data(); + auto *out_data = out->mutable_data(ctx.GetPlace()); auto in_dims = in.dims(); const int64_t channel = in_dims[quant_axis]; platform::Transform trans; @@ -194,15 +263,35 @@ struct ChannelClipFakeQuantDequantFunctor { const int64_t channel_size = in.numel() / channel; for (int i = 0; i < channel; i++) { T s = scale_data[i]; - auto* start = in_data + i * channel_size; - auto* end = in_data + (i + 1) * channel_size; - T inv_s = inverse(s); - trans( - ctx, start, end, out_data + i * channel_size, - QuantTensorFunctor(static_cast(bin_cnt), round_type, inv_s)); + auto *start = in_data + i * channel_size; + auto *end = in_data + (i + 1) * 
channel_size; + if (round_type == 0) { + T inv_s = inverse(s); + trans(ctx, + start, + end, + out_data + i * channel_size, + QuantTensorFunctor(static_cast(bin_cnt), inv_s)); + } else { + trans(ctx, + start, + end, + out_data + i * channel_size, + phi::ClipFunctor(-s, s)); + } + } + for (int i = 0; i < channel; i++) { + T s = scale_data[i]; framework::Tensor one_channel_out = out->Slice(i, i + 1); auto out_e = framework::EigenVector::Flatten(one_channel_out); - out_e.device(*ctx.eigen_device()) = out_e * s / static_cast(bin_cnt); + if (round_type == 0) { + out_e.device(*ctx.eigen_device()) = + out_e * s / static_cast(bin_cnt); + } else { + T inv_s = inverse(s); + out_e.device(*ctx.eigen_device()) = + (bin_cnt * inv_s * out_e).round() * s / static_cast(bin_cnt); + } } } else if (quant_axis == 1) { const int64_t step_i = in.numel() / in_dims[0]; @@ -211,14 +300,25 @@ struct ChannelClipFakeQuantDequantFunctor { for (int j = 0; j < in_dims[1]; j++) { T s = scale_data[j]; T inv_s = inverse(s); - auto* start = in_data + i * step_i + j * step_j; - auto* end = in_data + i * step_i + (j + 1) * step_j; - auto* cur_out_data = out_data + i * step_i + j * step_j; - trans(ctx, start, end, cur_out_data, - QuantTensorFunctor(static_cast(bin_cnt), round_type, - inv_s)); + auto *start = in_data + i * step_i + j * step_j; + auto *end = in_data + i * step_i + (j + 1) * step_j; + auto *cur_out_data = out_data + i * step_i + j * step_j; + if (round_type == 0) { + trans(ctx, + start, + end, + cur_out_data, + QuantTensorFunctor(static_cast(bin_cnt), inv_s)); + } else { + trans(ctx, start, end, cur_out_data, phi::ClipFunctor(-s, s)); + } for (int k = 0; k < step_j; k++) { - cur_out_data[k] = cur_out_data[k] * s / static_cast(bin_cnt); + if (round_type == 0) { + cur_out_data[k] = cur_out_data[k] * s / static_cast(bin_cnt); + } else { + cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]) * + s / static_cast(bin_cnt); + } } } } @@ -230,12 +330,14 @@ template struct ChannelClipFakeQuantDequantFunctor; template struct FindRangeAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& cur_scale, - const framework::Tensor& last_scale, - const framework::Tensor& iter, const int window_size, - framework::Tensor* scales_arr, framework::Tensor* out_scale) { - T* scale_arr = scales_arr->mutable_data(ctx.GetPlace()); + void operator()(const platform::CPUDeviceContext &ctx, + const framework::Tensor &cur_scale, + const framework::Tensor &last_scale, + const framework::Tensor &iter, + const int window_size, + framework::Tensor *scales_arr, + framework::Tensor *out_scale) { + T *scale_arr = scales_arr->mutable_data(ctx.GetPlace()); int64_t it = iter.data()[0]; int idx = it % window_size; T removed = scale_arr[idx]; @@ -247,8 +349,8 @@ struct FindRangeAbsMaxFunctor { max = cur; } else if (fabs(removed - max) < 1e-6) { int size = (it > window_size) ? 
window_size : it; - FindAbsMaxFunctor()(ctx, scale_arr, size, - &max); + FindAbsMaxFunctor()( + ctx, scale_arr, size, &max); } out_scale->mutable_data(ctx.GetPlace())[0] = max; } @@ -258,11 +360,14 @@ template struct FindRangeAbsMaxFunctor; template struct FindMovingAverageAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in_accum, - const framework::Tensor& in_state, const T* cur_scale, - const float rate, framework::Tensor* out_state, - framework::Tensor* out_accum, framework::Tensor* out_scale) { + void operator()(const platform::CPUDeviceContext &ctx, + const framework::Tensor &in_accum, + const framework::Tensor &in_state, + const T *cur_scale, + const float rate, + framework::Tensor *out_state, + framework::Tensor *out_accum, + framework::Tensor *out_scale) { T accum = in_accum.data()[0]; T state = in_state.data()[0]; T scale = cur_scale[0]; @@ -282,18 +387,22 @@ template struct FindMovingAverageAbsMaxFunctorHasInput("X"), "Input", "X", - "FakeQuantOrWithDequantAbsMaxOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK( + ctx->HasInput("X"), "Input", "X", "FakeQuantOrWithDequantAbsMaxOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), + "Output", + "Out", "FakeQuantOrWithDequantAbsMaxOp"); - OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", + OP_INOUT_CHECK(ctx->HasOutput("OutScale"), + "Output", + "OutScale", "FakeQuantOrWithDequantAbsMaxOp"); ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("OutScale", {1}); @@ -302,7 +411,7 @@ class FakeQuantOrWithDequantAbsMaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context()); @@ -320,8 +429,9 @@ class FakeQuantOrWithDequantAbsMaxOpMaker AddOutput("OutScale", "(Tensor) Current scale"); AddAttr("bit_length", "(int, default 8)") .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, + .AddCustomChecker([](const int &bit_length) { + PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, + true, platform::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", @@ -329,18 +439,22 @@ class FakeQuantOrWithDequantAbsMaxOpMaker }); AddAttr( "round_type", - "(int, default 0) The round type of fp32 to int." + "(int, default 1) The round type of fp32 to int." "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" "1: rounding to nearest ties away from zero. 
Eg: round(1.5)=2, " "round(2.5)=3") - .SetDefault(0) - .AddCustomChecker([](const int& round_type) { - PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true, - platform::errors::InvalidArgument( - "'round_type' should be between 0 and 1, but " - "the received is %d", - round_type)); - }); + .SetDefault(1) + .AddCustomChecker([](const int &round_type) { + PADDLE_ENFORCE_EQ( + round_type == 0 || round_type == 1, + true, + platform::errors::InvalidArgument( + "'round_type' should be 0 or 1, 0 rounding to " + "nearest ties to even and 1 is rounding to nearest " + "ties away from zero.but the received is %d", + round_type)); + }) + .AsExtra(); AddComment(R"DOC( This is a Base Op which supports FakeQuantAbsMaxOpMaker and FakeQuantDequantAbsMaxOpMaker. FakeQuantAbsMaxOp operator is used in the dynamic quantization. @@ -363,12 +477,16 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "FakeChannelWiseQuantizeAbsMax"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK( + ctx->HasInput("X"), "Input", "X", "FakeChannelWiseQuantizeAbsMax"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), + "Output", + "Out", "FakeChannelWiseQuantizeAbsMax"); - OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", + OP_INOUT_CHECK(ctx->HasOutput("OutScale"), + "Output", + "OutScale", "FakeChannelWiseQuantizeAbsMax"); int quant_axis = ctx->Attrs().Get("quant_axis"); ctx->SetOutputDim("Out", ctx->GetInputDim("X")); @@ -378,7 +496,7 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } @@ -398,8 +516,9 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker "For conv2d, depthwise_conv2d, conv2d_transpose " "and mul, the quant_axis is equal to the cout axis.") .SetDefault(0) - .AddCustomChecker([](const int& quant_axis) { - PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true, + .AddCustomChecker([](const int &quant_axis) { + PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, + true, platform::errors::InvalidArgument( "'quant_axis' should be 0 or 1, but " "the received is %d", @@ -407,8 +526,9 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker }); AddAttr("bit_length", "(int, default 8)") .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, + .AddCustomChecker([](const int &bit_length) { + PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, + true, platform::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", @@ -416,18 +536,22 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker }); AddAttr( "round_type", - "(int, default 0) The round type of fp32 to int." + "(int, default 1) The round type of fp32 to int." "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" "1: rounding to nearest ties away from zero. 
Eg: round(1.5)=2, " "round(2.5)=3") - .SetDefault(0) - .AddCustomChecker([](const int& round_type) { - PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true, - platform::errors::InvalidArgument( - "'round_type' should be between 0 and 1, but " - "the received is %d", - round_type)); - }); + .SetDefault(1) + .AddCustomChecker([](const int &round_type) { + PADDLE_ENFORCE_EQ( + round_type == 0 || round_type == 1, + true, + platform::errors::InvalidArgument( + "'round_type' should be 0 or 1, 0 rounding to " + "nearest ties to even and 1 is rounding to nearest " + "ties away from zero.but the received is %d", + round_type)); + }) + .AsExtra(); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") @@ -450,12 +574,18 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOp public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), + "Input", + "X", "FakeChannelWiseQuantizeDequantizeAbsMax"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + OP_INOUT_CHECK(ctx->HasOutput("Out"), + "Output", + "Out", "FakeChannelWiseQuantizeDequantizeAbsMax"); - OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", + OP_INOUT_CHECK(ctx->HasOutput("OutScale"), + "Output", + "OutScale", "FakeChannelWiseQuantizeDequantizeAbsMax"); int quant_axis = ctx->Attrs().Get("quant_axis"); ctx->SetOutputDim("Out", ctx->GetInputDim("X")); @@ -465,7 +595,7 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOp protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } @@ -485,8 +615,9 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker "For conv2d, depthwise_conv2d, conv2d_transpose " "and mul, the quant_axis is equal to the cout axis.") .SetDefault(0) - .AddCustomChecker([](const int& quant_axis) { - PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true, + .AddCustomChecker([](const int &quant_axis) { + PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, + true, platform::errors::InvalidArgument( "'quant_axis' should be 0 or 1, but " "the received is %d", @@ -494,8 +625,9 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker }); AddAttr("bit_length", "(int, default 8)") .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, + .AddCustomChecker([](const int &bit_length) { + PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, + true, platform::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", @@ -503,18 +635,22 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker }); AddAttr( "round_type", - "(int, default 0) The round type of fp32 to int." + "(int, default 1) The round type of fp32 to int." "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" "1: rounding to nearest ties away from zero. 
Eg: round(1.5)=2, " "round(2.5)=3") - .SetDefault(0) - .AddCustomChecker([](const int& round_type) { - PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true, - platform::errors::InvalidArgument( - "'round_type' should be between 0 and 1, but " - "the received is %d", - round_type)); - }); + .SetDefault(1) + .AddCustomChecker([](const int &round_type) { + PADDLE_ENFORCE_EQ( + round_type == 0 || round_type == 1, + true, + platform::errors::InvalidArgument( + "'round_type' should be 0 or 1, 0 rounding to " + "nearest ties to even and 1 is rounding to nearest " + "ties away from zero.but the received is %d", + round_type)); + }) + .AsExtra(); AddComment(R"DOC( The scale of FakeChannelWiseQuantize operator is a vector. In detail, each channel of the input X has a scale value. @@ -530,17 +666,19 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$ class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { public: - FakeQuantizeRangeAbsMaxOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + FakeQuantizeRangeAbsMaxOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FakeQuantizeRangeAbsMax"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "FakeQuantizeRangeAbsMax"); - OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", + OP_INOUT_CHECK( + ctx->HasOutput("Out"), "Output", "Out", "FakeQuantizeRangeAbsMax"); + OP_INOUT_CHECK(ctx->HasOutput("OutScale"), + "Output", + "OutScale", "FakeQuantizeRangeAbsMax"); if (ctx->HasOutput("OutScales")) { int window_size = ctx->Attrs().Get("window_size"); @@ -553,7 +691,7 @@ class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context()); @@ -574,8 +712,9 @@ class FakeQuantizeRangeAbsMaxOpMaker .SetDefault(10000); AddAttr("bit_length", "(int, default 8), quantization bit number.") .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, + .AddCustomChecker([](const int &bit_length) { + PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, + true, platform::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", @@ -583,18 +722,22 @@ class FakeQuantizeRangeAbsMaxOpMaker }); AddAttr( "round_type", - "(int, default 0) The round type of fp32 to int." + "(int, default 1) The round type of fp32 to int." "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" "1: rounding to nearest ties away from zero. 
Eg: round(1.5)=2, " "round(2.5)=3") - .SetDefault(0) - .AddCustomChecker([](const int& round_type) { - PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true, - platform::errors::InvalidArgument( - "'round_type' should be between 0 and 1, but " - "the received is %d", - round_type)); - }); + .SetDefault(1) + .AddCustomChecker([](const int &round_type) { + PADDLE_ENFORCE_EQ( + round_type == 0 || round_type == 1, + true, + platform::errors::InvalidArgument( + "'round_type' should be 0 or 1, 0 rounding to " + "nearest ties to even and 1 is rounding to nearest " + "ties away from zero.but the received is %d", + round_type)); + }) + .AsExtra(); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") @@ -614,17 +757,24 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOp : public framework::OperatorWithKernel { public: FakeQuantOrWithDequantMovingAverageAbsMaxOp( - const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), + "Input", + "X", "FakeQuantOrWithDequantMovingAverageAbsMax"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + OP_INOUT_CHECK(ctx->HasOutput("Out"), + "Output", + "Out", "FakeQuantOrWithDequantMovingAverageAbsMax"); - OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", + OP_INOUT_CHECK(ctx->HasOutput("OutScale"), + "Output", + "OutScale", "FakeQuantOrWithDequantMovingAverageAbsMax"); if (ctx->HasOutput("OutState")) { ctx->SetOutputDim("OutState", {1}); @@ -639,7 +789,7 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOp protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context()); @@ -662,8 +812,9 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker .SetDefault(0.9); AddAttr("bit_length", "(int, default 8), quantization bit number.") .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, + .AddCustomChecker([](const int &bit_length) { + PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, + true, platform::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", @@ -671,18 +822,22 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker }); AddAttr( "round_type", - "(int, default 0) The round type of fp32 to int." + "(int, default 1) The round type of fp32 to int." "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" "1: rounding to nearest ties away from zero. 
Eg: round(1.5)=2, " "round(2.5)=3") - .SetDefault(0) - .AddCustomChecker([](const int& round_type) { - PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true, - platform::errors::InvalidArgument( - "'round_type' should be between 0 and 1, but " - "the received is %d", - round_type)); - }); + .SetDefault(1) + .AddCustomChecker([](const int &round_type) { + PADDLE_ENFORCE_EQ( + round_type == 0 || round_type == 1, + true, + platform::errors::InvalidArgument( + "'round_type' should be 0 or 1, 0 rounding to " + "nearest ties to even and 1 is rounding to nearest " + "ties away from zero.but the received is %d", + round_type)); + }) + .AsExtra(); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") @@ -709,10 +864,12 @@ class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "MovingAverageAbsMaxScale"); - OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK( + ctx->HasInput("X"), "Input", "X", "MovingAverageAbsMaxScale"); + OP_INOUT_CHECK(ctx->HasOutput("OutScale"), + "Output", + "OutScale", "MovingAverageAbsMaxScale"); if (ctx->HasOutput("OutState")) { @@ -730,7 +887,7 @@ class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } @@ -770,19 +927,23 @@ class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { auto out_grad_name = framework::GradVarName("Out"); auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, + OP_INOUT_CHECK(ctx->HasInput(out_grad_name), + "Input", + out_grad_name, "StrightThroughEstimatorGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, + OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), + "Output", + x_grad_name, "StrightThroughEstimatorGradOp"); ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name)); } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); return framework::OpKernelType(input_data_type, ctx.GetPlace()); @@ -810,7 +971,8 @@ namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR( - fake_quantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp, + fake_quantize_abs_max, + ops::FakeQuantOrWithDequantAbsMaxOp, ops::FakeQuantOrWithDequantAbsMaxOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); @@ -818,7 +980,8 @@ REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel); REGISTER_OPERATOR( - fake_quantize_dequantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp, + 
fake_quantize_dequantize_abs_max, + ops::FakeQuantOrWithDequantAbsMaxOp, ops::FakeQuantOrWithDequantAbsMaxOpMaker, ops::StrightThroughEstimatorMaker, ops::StrightThroughEstimatorMaker); @@ -826,7 +989,8 @@ REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_abs_max, ops::FakeQuantizeDequantizeAbsMaxKernel); REGISTER_OPERATOR( - fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp, + fake_quantize_range_abs_max, + ops::FakeQuantizeRangeAbsMaxOp, ops::FakeQuantizeRangeAbsMaxOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); @@ -853,7 +1017,8 @@ REGISTER_OP_CPU_KERNEL( ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); REGISTER_OPERATOR( - fake_channel_wise_quantize_abs_max, ops::FakeChannelWiseQuantizeAbsMaxOp, + fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxOp, ops::FakeChannelWiseQuantizeAbsMaxOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); @@ -861,7 +1026,8 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max, ops::FakeChannelWiseQuantizeAbsMaxKernel); REGISTER_OPERATOR( - moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleOp, + moving_average_abs_max_scale, + ops::MovingAverageAbsMaxScaleOp, ops::MovingAverageAbsMaxScaleOpMaker, ops::StrightThroughEstimatorMaker, ops::StrightThroughEstimatorMaker); diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 46aa3fbfe3b..3b1877f2bc8 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -36,12 +36,12 @@ struct QuantizeDataType { }; template -__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { +__global__ void FindAbsMaxKernel(const T *in, const int n, T *out) { int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; - extern __shared__ char* shared_max_data_tmp[]; - auto shared_max_data = reinterpret_cast(shared_max_data_tmp); + extern __shared__ char *shared_max_data_tmp[]; + auto shared_max_data = reinterpret_cast(shared_max_data_tmp); if (gridDim.x > 1) { T local_max_data = T(0); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { @@ -73,14 +73,16 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { template struct FindAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext& ctx, const T* in, - const int num, T* out) { + void operator()(const platform::CUDADeviceContext &ctx, + const T *in, + const int num, + T *out) { int block = 1024; int grid = (block - 1 + num) / block; grid = (grid > block) ? 
block : grid; framework::Tensor max; - T* max_data = max.mutable_data(phi::make_ddim({grid}), ctx.GetPlace()); + T *max_data = max.mutable_data(phi::make_ddim({grid}), ctx.GetPlace()); FindAbsMaxKernel <<>>(in, num, max_data); FindAbsMaxKernel @@ -93,13 +95,15 @@ template struct FindAbsMaxFunctor; template -__global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, - const int c, T* out) { +__global__ void FindChannelAbsMaxKernelQuantAxis0(const T *in, + const int n, + const int c, + T *out) { int tid = threadIdx.x; int channel_size = n / c; - const T* in_c = in + blockIdx.x * channel_size; - extern __shared__ char* shared_max_data_tmp[]; - auto shared_max_data = reinterpret_cast(shared_max_data_tmp); + const T *in_c = in + blockIdx.x * channel_size; + extern __shared__ char *shared_max_data_tmp[]; + auto shared_max_data = reinterpret_cast(shared_max_data_tmp); T local_max_data = T(0); for (int i = tid; i < channel_size; i += blockDim.x) { T tmp = static_cast( @@ -122,17 +126,16 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, } template -__global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, - const int cin, const int cout, - T* out) { - extern __shared__ char* shared_max_data_tmp[]; - auto shared_max_data = reinterpret_cast(shared_max_data_tmp); +__global__ void FindChannelAbsMaxKernelQuantAxis1( + const T *in, const int n, const int cin, const int cout, T *out) { + extern __shared__ char *shared_max_data_tmp[]; + auto shared_max_data = reinterpret_cast(shared_max_data_tmp); int cout_wh_size = n / cin; int wh_size = n / (cin * cout); int tid = threadIdx.x; int bid = blockIdx.x; - const T* in_current = in + tid * cout_wh_size + bid * wh_size; + const T *in_current = in + tid * cout_wh_size + bid * wh_size; T local_max_data = T(0); for (int i = 0; i < wh_size; i++) { T tmp = static_cast( @@ -162,24 +165,26 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, template struct FindChannelAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& in_tensor, const int quant_axis, - T* out_abs_max) { + void operator()(const platform::CUDADeviceContext &ctx, + const framework::Tensor &in_tensor, + const int quant_axis, + T *out_abs_max) { PADDLE_ENFORCE_EQ( - quant_axis == 0 || quant_axis == 1, true, + quant_axis == 0 || quant_axis == 1, + true, platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " "the received is %d", quant_axis)); const int num = in_tensor.numel(); auto in_dims = in_tensor.dims(); - const T* in_data = in_tensor.data(); + const T *in_data = in_tensor.data(); if (quant_axis == 0) { int cout = in_dims[0]; int grid = cout; int block = 1024; FindChannelAbsMaxKernelQuantAxis0 - <<>>(in_data, num, cout, - out_abs_max); + <<>>( + in_data, num, cout, out_abs_max); } else if (quant_axis == 1) { int cin = in_dims[0]; int cout = in_dims[1]; @@ -213,9 +218,12 @@ struct FindChannelAbsMaxFunctor { template struct FindChannelAbsMaxFunctor; template -__global__ void ClipAndQuantKernel(const T* in, const T* scale, - const int bin_cnt, const int round_type, - const int n, T* out) { +__global__ void ClipAndQuantKernel(const T *in, + const T *scale, + const int bin_cnt, + const int round_type, + const int n, + T *out) { int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; @@ -227,25 +235,30 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale, for (int i = bid; i < n; i += blockDim.x * gridDim.x) { ComputeDataType 
x = static_cast(in[i]); - x = bin_cnt_t * inv_s * x; if (round_type == 0) { + x = bin_cnt_t * inv_s * x; x = roundWithTiesToEven(x); + ComputeDataType max_bound = bin_cnt_t; + ComputeDataType min_bound = -bin_cnt_t - static_cast(1); + x = x > max_bound ? max_bound : x; + x = x < min_bound ? min_bound : x; + out[i] = static_cast(x); } else { - x = round(x); + ComputeDataType v = x > s ? s : x; + v = v < -s ? -s : v; + v = bin_cnt_t * inv_s * v; + out[i] = static_cast(round(v)); } - ComputeDataType max_bound = bin_cnt_t; - ComputeDataType min_bound = -bin_cnt_t - static_cast(1); - x = x > max_bound ? max_bound : x; - x = x < min_bound ? min_bound : x; - out[i] = static_cast(x); } } template -__global__ void ClipAndQuantDequantKernel(const T* in, const T* scale, +__global__ void ClipAndQuantDequantKernel(const T *in, + const T *scale, const int bin_cnt, - const int round_type, const int n, - T* out) { + const int round_type, + const int n, + T *out) { int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; @@ -257,33 +270,39 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale, for (int i = bid; i < n; i += blockDim.x * gridDim.x) { ComputeDataType x = static_cast(in[i]); - x = bin_cnt_t * inv_s * x; if (round_type == 0) { + x = bin_cnt_t * inv_s * x; x = roundWithTiesToEven(x); + ComputeDataType max_bound = bin_cnt_t; + ComputeDataType min_bound = -bin_cnt_t - static_cast(1); + x = x > max_bound ? max_bound : x; + x = x < min_bound ? min_bound : x; + out[i] = static_cast((x * s) / bin_cnt_t); } else { + x = x > s ? s : x; + x = x < -s ? -s : x; + x = bin_cnt_t * inv_s * x; x = round(x); + out[i] = static_cast((x * s) / bin_cnt_t); } - ComputeDataType max_bound = bin_cnt_t; - ComputeDataType min_bound = -bin_cnt_t - static_cast(1); - x = x > max_bound ? max_bound : x; - x = x < min_bound ? 
min_bound : x; - out[i] = static_cast((x * s) / bin_cnt_t); } } template struct ClipAndFakeQuantFunctor { - void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, const int round_type, - framework::Tensor* out) { + void operator()(const platform::CUDADeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + framework::Tensor *out) { int num = in.numel(); int block = 1024; int grid = (block - 1 + num) / block; - const T* in_data = in.data(); - const T* scale_data = scale.data(); - T* out_data = out->mutable_data(ctx.GetPlace()); + const T *in_data = in.data(); + const T *scale_data = scale.data(); + T *out_data = out->mutable_data(ctx.GetPlace()); ClipAndQuantKernel<<>>( in_data, scale_data, bin_cnt, round_type, num, out_data); @@ -294,17 +313,19 @@ template struct ClipAndFakeQuantFunctor; template struct ClipAndFakeQuantDequantFunctor { - void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, const int round_type, - framework::Tensor* out) { + void operator()(const platform::CUDADeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + framework::Tensor *out) { int num = in.numel(); int block = 1024; int grid = (block - 1 + num) / block; - const T* in_data = in.data(); - const T* scale_data = scale.data(); - T* out_data = out->mutable_data(ctx.GetPlace()); + const T *in_data = in.data(); + const T *scale_data = scale.data(); + T *out_data = out->mutable_data(ctx.GetPlace()); ClipAndQuantDequantKernel<<>>( in_data, scale_data, bin_cnt, round_type, num, out_data); @@ -313,16 +334,18 @@ struct ClipAndFakeQuantDequantFunctor { // ChannelClipAndQuantKernel for quant_axis is 0 template -__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, +__global__ void ChannelClipAndQuantKernelQuantAxis0(const T *in, + const T *scale, const int bin_cnt, const int round_type, const int64_t n, - const int c, T* out) { + const int c, + T *out) { int tid = threadIdx.x; int64_t channel_size = n / c; - const T* in_c = in + blockIdx.x * channel_size; - T* out_c = out + blockIdx.x * channel_size; + const T *in_c = in + blockIdx.x * channel_size; + T *out_c = out + blockIdx.x * channel_size; using ComputeDataType = typename QuantizeDataType::type; @@ -332,25 +355,33 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, for (int64_t i = tid; i < channel_size; i += blockDim.x) { ComputeDataType x = static_cast(in_c[i]); - x = bin_cnt_t * inv_s * x; if (round_type == 0) { + x = bin_cnt_t * inv_s * x; x = roundWithTiesToEven(x); + ComputeDataType max_bound = bin_cnt_t; + ComputeDataType min_bound = -bin_cnt_t - static_cast(1); + x = x > max_bound ? max_bound : x; + x = x < min_bound ? min_bound : x; + out_c[i] = static_cast(x); } else { - x = round(x); + ComputeDataType v = x > s ? s : x; + v = v < -s ? -s : v; + v = bin_cnt_t * inv_s * v; + out_c[i] = static_cast(round(v)); } - ComputeDataType max_bound = bin_cnt_t; - ComputeDataType min_bound = -bin_cnt_t - static_cast(1); - x = x > max_bound ? max_bound : x; - x = x < min_bound ? 
min_bound : x; - out_c[i] = static_cast(x); } } // ChannelClipAndQuantKernel for quant_axis is N template -__global__ void ChannelClipAndQuantKernelQuantAxisN( - const T* in, const T* scale, const int bin_cnt, const int round_type, - const int64_t n, const int nScale, const int quant_stride, T* out) { +__global__ void ChannelClipAndQuantKernelQuantAxisN(const T *in, + const T *scale, + const int bin_cnt, + const int round_type, + const int64_t n, + const int nScale, + const int quant_stride, + T *out) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; using ComputeDataType = typename QuantizeDataType::type; ComputeDataType bin_cnt_t = static_cast(bin_cnt); @@ -359,37 +390,44 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN( static_cast(scale[(i / quant_stride) % nScale]); ComputeDataType inv_s = inverse(s); ComputeDataType x = static_cast(in[i]); - x = bin_cnt_t * inv_s * x; if (round_type == 0) { + x = bin_cnt_t * inv_s * x; x = roundWithTiesToEven(x); + ComputeDataType max_bound = bin_cnt_t; + ComputeDataType min_bound = -bin_cnt_t - static_cast(1); + x = x > max_bound ? max_bound : x; + x = x < min_bound ? min_bound : x; + out[i] = static_cast(x); } else { - x = round(x); + ComputeDataType v = x > s ? s : x; + v = v < -s ? -s : v; + v = bin_cnt_t * inv_s * v; + out[i] = static_cast(round(v)); } - ComputeDataType max_bound = bin_cnt_t; - ComputeDataType min_bound = -bin_cnt_t - static_cast(1); - x = x > max_bound ? max_bound : x; - x = x < min_bound ? min_bound : x; - out[i] = static_cast(x); } } template struct ChannelClipAndFakeQuantFunctor { - void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, const int round_type, const int quant_axis, - framework::Tensor* out) { + void operator()(const platform::CUDADeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + const int quant_axis, + framework::Tensor *out) { PADDLE_ENFORCE_EQ( - quant_axis == 0 || quant_axis == 1, true, + quant_axis == 0 || quant_axis == 1, + true, platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " "the received is %d", quant_axis)); int64_t num = in.numel(); auto in_dims = in.dims(); - const T* in_data = in.data(); - const T* scale_data = scale.data(); - T* out_data = out->mutable_data(ctx.GetPlace()); + const T *in_data = in.data(); + const T *scale_data = scale.data(); + T *out_data = out->mutable_data(ctx.GetPlace()); if (quant_axis == 0) { int grid = in_dims[0]; @@ -411,9 +449,15 @@ struct ChannelClipAndFakeQuantFunctor { const int64_t grid_size = std::min(max_blocks, (num + block_size - 1) / block_size); - ChannelClipAndQuantKernelQuantAxisN<<>>( - in_data, scale_data, bin_cnt, round_type, num, in_dims[quant_axis], - quant_stride, out_data); + ChannelClipAndQuantKernelQuantAxisN + <<>>(in_data, + scale_data, + bin_cnt, + round_type, + num, + in_dims[quant_axis], + quant_stride, + out_data); } } }; @@ -422,12 +466,14 @@ template struct ChannelClipAndFakeQuantFunctor; template -__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale, - const T* last_scale, - const int64_t* iter, - const int window_size, T* scale_arr, - T* out_scale, int* need_find_max, - int* out_size) { +__global__ void FindRangeAbsMaxAndFillArray(const T *cur_scale, + const T *last_scale, + const int64_t *iter, + const int window_size, + T *scale_arr, + T *out_scale, + int *need_find_max, + int *out_size) { int it = iter[0]; int idx = it % 
window_size; T removed = scale_arr[idx]; @@ -446,45 +492,63 @@ __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale, template struct FindRangeAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& cur_scale, - const framework::Tensor& last_scale, - const framework::Tensor& iter, const int window_size, - framework::Tensor* scales_arr, framework::Tensor* out_scale) { + void operator()(const platform::CUDADeviceContext &ctx, + const framework::Tensor &cur_scale, + const framework::Tensor &last_scale, + const framework::Tensor &iter, + const int window_size, + framework::Tensor *scales_arr, + framework::Tensor *out_scale) { const auto gpu_place = ctx.GetPlace(); - T* scale_arr = scales_arr->mutable_data(gpu_place); - T* out_scale_data = out_scale->mutable_data(gpu_place); + T *scale_arr = scales_arr->mutable_data(gpu_place); + T *out_scale_data = out_scale->mutable_data(gpu_place); framework::Tensor need_find_max, out_size; - int* find_max = need_find_max.mutable_data({1}, gpu_place); - int* out_size_data = out_size.mutable_data({1}, gpu_place); - - FindRangeAbsMaxAndFillArray<<<1, 1, 0, ctx.stream()>>>( - cur_scale.data(), last_scale.data(), iter.data(), - window_size, scale_arr, out_scale_data, find_max, out_size_data); + int *find_max = need_find_max.mutable_data({1}, gpu_place); + int *out_size_data = out_size.mutable_data({1}, gpu_place); + + FindRangeAbsMaxAndFillArray + <<<1, 1, 0, ctx.stream()>>>(cur_scale.data(), + last_scale.data(), + iter.data(), + window_size, + scale_arr, + out_scale_data, + find_max, + out_size_data); int g_find_max; - memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max, - sizeof(int), ctx.stream()); + memory::Copy(platform::CPUPlace(), + &g_find_max, + gpu_place, + find_max, + sizeof(int), + ctx.stream()); ctx.Wait(); if (g_find_max) { int len; - memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data, - sizeof(int), ctx.stream()); + memory::Copy(platform::CPUPlace(), + &len, + gpu_place, + out_size_data, + sizeof(int), + ctx.stream()); ctx.Wait(); - FindAbsMaxFunctor()(ctx, scale_arr, len, - out_scale_data); + FindAbsMaxFunctor()( + ctx, scale_arr, len, out_scale_data); } } }; template -__global__ void FindMovingAverageAbsMaxKernel(const T* in_state, - const T* in_accum, - const T* cur_scale, const T rate, - T* out_state, T* out_accum, - T* out_scale) { +__global__ void FindMovingAverageAbsMaxKernel(const T *in_state, + const T *in_accum, + const T *cur_scale, + const T rate, + T *out_state, + T *out_accum, + T *out_scale) { T state = rate * (*in_state) + T(1.0f); T accum = rate * (*in_accum) + (*cur_scale); *out_state = state; @@ -496,92 +560,119 @@ template struct FindRangeAbsMaxFunctor; template struct FindMovingAverageAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& in_accum, - const framework::Tensor& in_state, const T* cur_scale, - const float rate, framework::Tensor* out_state, - framework::Tensor* out_accum, framework::Tensor* out_scale) { + void operator()(const platform::CUDADeviceContext &ctx, + const framework::Tensor &in_accum, + const framework::Tensor &in_state, + const T *cur_scale, + const float rate, + framework::Tensor *out_state, + framework::Tensor *out_accum, + framework::Tensor *out_scale) { const auto gpu_place = ctx.GetPlace(); T rate_t = static_cast(rate); - T* out_state_data = out_state->mutable_data(gpu_place); - T* out_accum_data = out_accum->mutable_data(gpu_place); - T* out_scale_data = 
out_scale->mutable_data(gpu_place); - - FindMovingAverageAbsMaxKernel<<<1, 1, 0, ctx.stream()>>>( - in_state.data(), in_accum.data(), cur_scale, rate_t, - out_state_data, out_accum_data, out_scale_data); + T *out_state_data = out_state->mutable_data(gpu_place); + T *out_accum_data = out_accum->mutable_data(gpu_place); + T *out_scale_data = out_scale->mutable_data(gpu_place); + + FindMovingAverageAbsMaxKernel + <<<1, 1, 0, ctx.stream()>>>(in_state.data(), + in_accum.data(), + cur_scale, + rate_t, + out_state_data, + out_accum_data, + out_scale_data); } }; // ChannelClipAndQuantDequantKernel for quant_axis is 0 template -__global__ void ChannelClipAndQuantDequantKernelQuantAxis0( - const T* in, const T* scale, const int bin_cnt, const int round_type, - const int n, const int c, T* out) { +__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(const T *in, + const T *scale, + const int bin_cnt, + const int round_type, + const int n, + const int c, + T *out) { int tid = threadIdx.x; int channel_size = n / c; - const T* in_c = in + blockIdx.x * channel_size; - T* out_c = out + blockIdx.x * channel_size; + const T *in_c = in + blockIdx.x * channel_size; + T *out_c = out + blockIdx.x * channel_size; T s = scale[blockIdx.x]; T inv_s = inverse(s); for (int i = tid; i < channel_size; i += blockDim.x) { T x = in_c[i]; - x = bin_cnt * inv_s * x; if (round_type == 0) { + x = bin_cnt * inv_s * x; x = roundWithTiesToEven(x); + T max_bound = bin_cnt; + T min_bound = -bin_cnt - static_cast(1); + x = x > max_bound ? max_bound : x; + x = x < min_bound ? min_bound : x; + out_c[i] = (x * s) / bin_cnt; } else { - x = round(x); + T v = x > s ? s : x; + v = v < -s ? -s : v; + v = bin_cnt * inv_s * v; + out_c[i] = round(v) * s / bin_cnt; } - T max_bound = bin_cnt; - T min_bound = -bin_cnt - static_cast(1); - x = x > max_bound ? max_bound : x; - x = x < min_bound ? min_bound : x; - out_c[i] = (x * s) / bin_cnt; } } // ChannelClipAndQuantDequantKernel for quant_axis is 1 template -__global__ void ChannelClipAndQuantDequantKernelQuantAxis1( - const T* in, const T* scale, const int bin_cnt, const int round_type, - const int n, const int cin, const int cout, T* out) { +__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in, + const T *scale, + const int bin_cnt, + const int round_type, + const int n, + const int cin, + const int cout, + T *out) { T s = scale[blockIdx.x % cout]; T inv_s = inverse(s); int wh_size = n / (cin * cout); - const T* in_c = in + blockIdx.x * wh_size; - T* out_c = out + blockIdx.x * wh_size; + const T *in_c = in + blockIdx.x * wh_size; + T *out_c = out + blockIdx.x * wh_size; for (int i = threadIdx.x; i < wh_size; i += blockDim.x) { T x = in_c[i]; - x = bin_cnt * inv_s * x; if (round_type == 0) { + x = bin_cnt * inv_s * x; x = roundWithTiesToEven(x); + T max_bound = bin_cnt; + T min_bound = -bin_cnt - static_cast(1); + x = x > max_bound ? max_bound : x; + x = x < min_bound ? min_bound : x; + out_c[i] = (x * s) / bin_cnt; } else { - x = round(x); + T v = x > s ? s : x; + v = v < -s ? -s : v; + v = bin_cnt * inv_s * v; + out_c[i] = round(v) * s / bin_cnt; } - T max_bound = bin_cnt; - T min_bound = -bin_cnt - static_cast(1); - x = x > max_bound ? max_bound : x; - x = x < min_bound ? 
min_bound : x; - out_c[i] = (x * s) / bin_cnt; } } template struct ChannelClipFakeQuantDequantFunctor { - void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, const int round_type, const int quant_axis, - framework::Tensor* out) { + void operator()(const platform::CUDADeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + const int quant_axis, + framework::Tensor *out) { // At present, channelwise quantization supports conv2d, depthwise_conv2d // conv2d_transpose and mul PADDLE_ENFORCE_EQ( - quant_axis == 0 || quant_axis == 1, true, + quant_axis == 0 || quant_axis == 1, + true, platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " "the received is %d", quant_axis)); @@ -589,25 +680,34 @@ struct ChannelClipFakeQuantDequantFunctor { int num = in.numel(); auto in_dims = in.dims(); - const T* in_data = in.data(); - const T* scale_data = scale.data(); - T* out_data = out->mutable_data(ctx.GetPlace()); + const T *in_data = in.data(); + const T *scale_data = scale.data(); + T *out_data = out->mutable_data(ctx.GetPlace()); if (quant_axis == 0) { int grid = in_dims[0]; int block = 1024; ChannelClipAndQuantDequantKernelQuantAxis0 - <<>>(in_data, scale_data, bin_cnt, - round_type, num, in_dims[0], + <<>>(in_data, + scale_data, + bin_cnt, + round_type, + num, + in_dims[0], out_data); } else if (quant_axis == 1) { int grid = in_dims[0] * in_dims[1]; int block = 1024; ChannelClipAndQuantDequantKernelQuantAxis1 - <<>>(in_data, scale_data, bin_cnt, - round_type, num, in_dims[0], - in_dims[1], out_data); + <<>>(in_data, + scale_data, + bin_cnt, + round_type, + num, + in_dims[0], + in_dims[1], + out_data); } } }; diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 2956478f44a..6931ac4325b 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -51,16 +51,11 @@ inline HOSTDEVICE T roundWithTiesToEven(T x) { template class QuantTensorFunctor { public: - explicit QuantTensorFunctor(const T bin_cnt, const int round_type, - const T inv_s) - : bin_cnt_(bin_cnt), round_type_(round_type), inv_s_(inv_s) {} + explicit QuantTensorFunctor(const T bin_cnt, const T inv_s) + : bin_cnt_(bin_cnt), inv_s_(inv_s) {} HOSTDEVICE T operator()(const T x) const { T out = bin_cnt_ * inv_s_ * x; - if (round_type_ == 0) { - out = roundWithTiesToEven(out); - } else if (round_type_ == 1) { - out = std::round(out); - } + out = roundWithTiesToEven(out); T max_bound = bin_cnt_; T min_bound = -bin_cnt_ - static_cast(1); out = out > max_bound ? 
max_bound : out; @@ -70,82 +65,101 @@ class QuantTensorFunctor { private: T bin_cnt_; - int round_type_; T inv_s_; }; template struct FindAbsMaxFunctor { - void operator()(const DeviceContext& ctx, const T* in, const int num, T* out); + void operator()(const DeviceContext &ctx, const T *in, const int num, T *out); }; template struct ClipAndFakeQuantFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& scale, const int bin_cnt, - const int round_type, framework::Tensor* out); + void operator()(const DeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + framework::Tensor *out); }; template struct ClipAndFakeQuantDequantFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& scale, const int bin_cnt, - int round_type, framework::Tensor* out); + void operator()(const DeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + int round_type, + framework::Tensor *out); }; template struct FindRangeAbsMaxFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& cur_scale, - const framework::Tensor& last_scale, - const framework::Tensor& iter, const int window_size, - framework::Tensor* scales_arr, framework::Tensor* out_scale); + void operator()(const DeviceContext &ctx, + const framework::Tensor &cur_scale, + const framework::Tensor &last_scale, + const framework::Tensor &iter, + const int window_size, + framework::Tensor *scales_arr, + framework::Tensor *out_scale); }; template struct FindChannelAbsMaxFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in_tensor, - const int quant_axis, T* out_abs_max); + void operator()(const DeviceContext &ctx, + const framework::Tensor &in_tensor, + const int quant_axis, + T *out_abs_max); }; template struct ChannelClipAndFakeQuantFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& scale, const int bin_cnt, - const int round_type, const int quant_axis, - framework::Tensor* out); + void operator()(const DeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + const int round_type, + const int quant_axis, + framework::Tensor *out); }; template struct ChannelClipFakeQuantDequantFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& scale, const int bin_cnt, - int round_type, const int quant_axis, framework::Tensor* out); + void operator()(const DeviceContext &ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + const int bin_cnt, + int round_type, + const int quant_axis, + framework::Tensor *out); }; template struct FindMovingAverageAbsMaxFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum, - const framework::Tensor& in_state, - const framework::Tensor& cur_scale, - framework::Tensor* out_state, framework::Tensor* out_accum, - framework::Tensor* out_scale); + void operator()(const DeviceContext &ctx, + const framework::Tensor &in_accum, + const framework::Tensor &in_state, + const framework::Tensor &cur_scale, + framework::Tensor *out_state, + framework::Tensor *out_accum, + framework::Tensor *out_scale); }; template class FakeAbsMaxKernelBase : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in 
= context.Input("X"); - auto* out = context.Output("Out"); - auto* out_scale = context.Output("OutScale"); - T* out_s = out_scale->mutable_data(context.GetPlace()); + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); + auto *out = context.Output("Out"); + auto *out_scale = context.Output("OutScale"); + T *out_s = out_scale->mutable_data(context.GetPlace()); int bit_length = context.Attr("bit_length"); int round_type = context.Attr("round_type"); int bin_cnt = std::pow(2, bit_length - 1) - 1; - auto& dev_ctx = context.template device_context(); - const T* in_data = in->data(); + auto &dev_ctx = context.template device_context(); + const T *in_data = in->data(); FindAbsMaxFunctor()(dev_ctx, in_data, in->numel(), out_s); RunClipFunctor(dev_ctx, *in, *out_scale, bin_cnt, round_type, out); } @@ -153,20 +167,25 @@ class FakeAbsMaxKernelBase : public framework::OpKernel { virtual ~FakeAbsMaxKernelBase() = default; protected: - virtual void RunClipFunctor(const DeviceContext& dev_ctx, - const framework::Tensor& in, - const framework::Tensor& scale, int bin_cnt, - int round_type, framework::Tensor* out) const = 0; + virtual void RunClipFunctor(const DeviceContext &dev_ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + int bin_cnt, + int round_type, + framework::Tensor *out) const = 0; }; template class FakeQuantizeAbsMaxKernel : public FakeAbsMaxKernelBase { protected: - void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in, - const framework::Tensor& scale, int bin_cnt, - int round_type, framework::Tensor* out) const override { - ClipAndFakeQuantFunctor()(dev_ctx, in, scale, bin_cnt, - round_type, out); + void RunClipFunctor(const DeviceContext &dev_ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + int bin_cnt, + int round_type, + framework::Tensor *out) const override { + ClipAndFakeQuantFunctor()( + dev_ctx, in, scale, bin_cnt, round_type, out); } }; @@ -174,9 +193,12 @@ template class FakeQuantizeDequantizeAbsMaxKernel : public FakeAbsMaxKernelBase { protected: - void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in, - const framework::Tensor& scale, int bin_cnt, - int round_type, framework::Tensor* out) const override { + void RunClipFunctor(const DeviceContext &dev_ctx, + const framework::Tensor &in, + const framework::Tensor &scale, + int bin_cnt, + int round_type, + framework::Tensor *out) const override { ClipAndFakeQuantDequantFunctor()( dev_ctx, in, scale, bin_cnt, round_type, out); } @@ -185,11 +207,11 @@ class FakeQuantizeDequantizeAbsMaxKernel template class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); - auto* out = context.Output("Out"); - auto* out_scale = context.Output("OutScale"); + auto *out = context.Output("Out"); + auto *out_scale = context.Output("OutScale"); out->mutable_data(context.GetPlace()); int bit_length = context.Attr("bit_length"); @@ -198,11 +220,11 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { int quant_axis = context.Attr("quant_axis"); bool is_test = context.Attr("is_test"); - auto& dev_ctx = context.template device_context(); + auto &dev_ctx = context.template device_context(); if (!is_test) { - T* out_scale_data = 
out_scale->mutable_data(context.GetPlace()); - FindChannelAbsMaxFunctor()(dev_ctx, *in, quant_axis, - out_scale_data); + T *out_scale_data = out_scale->mutable_data(context.GetPlace()); + FindChannelAbsMaxFunctor()( + dev_ctx, *in, quant_axis, out_scale_data); } ChannelClipAndFakeQuantFunctor()( dev_ctx, *in, *out_scale, bin_cnt, round_type, quant_axis, out); @@ -213,12 +235,12 @@ template class FakeChannelWiseQuantizeDequantizeAbsMaxKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - auto* out_scale = context.Output("OutScale"); - T* out_scale_data = out_scale->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); + auto *out = context.Output("Out"); + auto *out_scale = context.Output("OutScale"); + T *out_scale_data = out_scale->mutable_data(context.GetPlace()); + auto &dev_ctx = context.template device_context(); out->mutable_data(dev_ctx.GetPlace()); int bit_length = context.Attr("bit_length"); @@ -226,8 +248,8 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxKernel int bin_cnt = std::pow(2, bit_length - 1) - 1; int quant_axis = context.Attr("quant_axis"); - FindChannelAbsMaxFunctor()(dev_ctx, *in, quant_axis, - out_scale_data); + FindChannelAbsMaxFunctor()( + dev_ctx, *in, quant_axis, out_scale_data); ChannelClipFakeQuantDequantFunctor()( dev_ctx, *in, *out_scale, bin_cnt, round_type, quant_axis, out); @@ -237,60 +259,64 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxKernel template class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* in_scale = context.Input("InScale"); + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); + auto *in_scale = context.Input("InScale"); - auto* out = context.Output("Out"); + auto *out = context.Output("Out"); out->mutable_data(context.GetPlace()); bool is_test = context.Attr("is_test"); int bit_length = context.Attr("bit_length"); int round_type = context.Attr("round_type"); int bin_cnt = std::pow(2, bit_length - 1) - 1; - auto& dev_ctx = context.template device_context(); + auto &dev_ctx = context.template device_context(); // testing if (is_test) { - ClipAndFakeQuantFunctor()(dev_ctx, *in, *in_scale, - bin_cnt, round_type, out); + ClipAndFakeQuantFunctor()( + dev_ctx, *in, *in_scale, bin_cnt, round_type, out); return; } // training - auto* out_scale = context.Output("OutScale"); - auto* out_scales = context.Output("OutScales"); - auto* iter = context.Input("Iter"); + auto *out_scale = context.Output("OutScale"); + auto *out_scales = context.Output("OutScales"); + auto *iter = context.Input("Iter"); int window_size = context.Attr("window_size"); out_scale->mutable_data(context.GetPlace()); framework::Tensor cur_scale; - T* cur_scale_data = cur_scale.mutable_data({1}, context.GetPlace()); - FindAbsMaxFunctor()(dev_ctx, in->data(), in->numel(), - cur_scale_data); - FindRangeAbsMaxFunctor()(dev_ctx, cur_scale, *in_scale, - *iter, window_size, out_scales, + T *cur_scale_data = cur_scale.mutable_data({1}, context.GetPlace()); + FindAbsMaxFunctor()( + dev_ctx, in->data(), in->numel(), cur_scale_data); + FindRangeAbsMaxFunctor()(dev_ctx, + cur_scale, + *in_scale, + *iter, + window_size, + 
out_scales, out_scale); - ClipAndFakeQuantFunctor()(dev_ctx, *in, *out_scale, - bin_cnt, round_type, out); + ClipAndFakeQuantFunctor()( + dev_ctx, *in, *out_scale, bin_cnt, round_type, out); } }; template class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* in_scale = context.Input("InScale"); - auto* out = context.Output("Out"); + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); + auto *in_scale = context.Input("InScale"); + auto *out = context.Output("Out"); out->mutable_data(context.GetPlace()); bool is_test = context.Attr("is_test"); int bit_length = context.Attr("bit_length"); int round_type = context.Attr("round_type"); int bin_cnt = std::pow(2, bit_length - 1) - 1; - auto& dev_ctx = context.template device_context(); + auto &dev_ctx = context.template device_context(); // testing if (is_test) { @@ -299,25 +325,30 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { } // training - auto* in_accum = context.Input("InAccum"); - auto* in_state = context.Input("InState"); + auto *in_accum = context.Input("InAccum"); + auto *in_state = context.Input("InState"); auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); - T* cur_scale_data = static_cast(cur_scale->ptr()); + T *cur_scale_data = static_cast(cur_scale->ptr()); - FindAbsMaxFunctor()(dev_ctx, in->data(), in->numel(), - cur_scale_data); + FindAbsMaxFunctor()( + dev_ctx, in->data(), in->numel(), cur_scale_data); - auto* out_state = context.Output("OutState"); - auto* out_accum = context.Output("OutAccum"); - auto* out_scale = context.Output("OutScale"); + auto *out_state = context.Output("OutState"); + auto *out_accum = context.Output("OutAccum"); + auto *out_scale = context.Output("OutScale"); out_state->mutable_data(context.GetPlace()); out_accum->mutable_data(context.GetPlace()); out_scale->mutable_data(context.GetPlace()); float moving_rate = context.Attr("moving_rate"); - FindMovingAverageAbsMaxFunctor()( - dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state, - out_accum, out_scale); + FindMovingAverageAbsMaxFunctor()(dev_ctx, + *in_accum, + *in_state, + cur_scale_data, + moving_rate, + out_state, + out_accum, + out_scale); RunClipFunctor(dev_ctx, *in, *out_scale, bin_cnt, round_type, out); } @@ -325,21 +356,26 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { virtual ~FakeMovingAverageAbsMaxKernelBase() = default; protected: - virtual void RunClipFunctor(const DeviceContext& dev_ctx, - const framework::Tensor& in, - const framework::Tensor& in_scale, int bin_cnt, - int round_type, framework::Tensor* out) const = 0; + virtual void RunClipFunctor(const DeviceContext &dev_ctx, + const framework::Tensor &in, + const framework::Tensor &in_scale, + int bin_cnt, + int round_type, + framework::Tensor *out) const = 0; }; template class FakeQuantizeMovingAverageAbsMaxKernel : public FakeMovingAverageAbsMaxKernelBase { protected: - void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in, - const framework::Tensor& in_scale, int bin_cnt, - int round_type, framework::Tensor* out) const override { - ClipAndFakeQuantFunctor()(dev_ctx, in, in_scale, bin_cnt, - round_type, out); + void RunClipFunctor(const DeviceContext &dev_ctx, + const framework::Tensor &in, + const framework::Tensor &in_scale, + int bin_cnt, + int round_type, + framework::Tensor *out) const 
override { + ClipAndFakeQuantFunctor()( + dev_ctx, in, in_scale, bin_cnt, round_type, out); } }; @@ -347,9 +383,12 @@ template class FakeQuantizeDequantizeMovingAverageAbsMaxKernel : public FakeMovingAverageAbsMaxKernelBase { protected: - void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in, - const framework::Tensor& in_scale, int bin_cnt, - int round_type, framework::Tensor* out) const override { + void RunClipFunctor(const DeviceContext &dev_ctx, + const framework::Tensor &in, + const framework::Tensor &in_scale, + int bin_cnt, + int round_type, + framework::Tensor *out) const override { ClipAndFakeQuantDequantFunctor()( dev_ctx, in, in_scale, bin_cnt, round_type, out); } @@ -358,12 +397,12 @@ class FakeQuantizeDequantizeMovingAverageAbsMaxKernel template class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto& dev_ctx = context.template device_context(); + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); + auto &dev_ctx = context.template device_context(); if (context.HasOutput("Out")) { - auto* out = context.Output("Out"); + auto *out = context.Output("Out"); out->mutable_data(context.GetPlace()); framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out); } @@ -375,40 +414,46 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { } // training - auto* in_accum = context.Input("InAccum"); - auto* in_state = context.Input("InState"); + auto *in_accum = context.Input("InAccum"); + auto *in_state = context.Input("InState"); auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); - T* cur_scale_data = static_cast(cur_scale->ptr()); + T *cur_scale_data = static_cast(cur_scale->ptr()); - FindAbsMaxFunctor()(dev_ctx, in->data(), in->numel(), - cur_scale_data); + FindAbsMaxFunctor()( + dev_ctx, in->data(), in->numel(), cur_scale_data); - auto* out_state = context.Output("OutState"); - auto* out_accum = context.Output("OutAccum"); - auto* out_scale = context.Output("OutScale"); + auto *out_state = context.Output("OutState"); + auto *out_accum = context.Output("OutAccum"); + auto *out_scale = context.Output("OutScale"); out_state->mutable_data(context.GetPlace()); out_accum->mutable_data(context.GetPlace()); out_scale->mutable_data(context.GetPlace()); float moving_rate = context.Attr("moving_rate"); - FindMovingAverageAbsMaxFunctor()( - dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state, - out_accum, out_scale); + FindMovingAverageAbsMaxFunctor()(dev_ctx, + *in_accum, + *in_state, + cur_scale_data, + moving_rate, + out_state, + out_accum, + out_scale); } }; template class StrightThroughEstimatorGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = + void Compute(const framework::ExecutionContext &context) const override { + auto *d_out = context.Input(framework::GradVarName("Out")); auto x_grad_name = framework::GradVarName("X"); - auto* d_x = context.Output(x_grad_name); - PADDLE_ENFORCE_NOT_NULL(d_x, platform::errors::PreconditionNotMet( - "StrightThroughEstimatorGradKernel " - "doesn't have the output named %s.", - x_grad_name)); + auto *d_x = context.Output(x_grad_name); + PADDLE_ENFORCE_NOT_NULL(d_x, + platform::errors::PreconditionNotMet( + "StrightThroughEstimatorGradKernel " + "doesn't have the output named %s.", + x_grad_name)); // Initialize dx as 
same as d_out d_x->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index 7aaebb8f929..4580acbe3fc 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -26,14 +26,17 @@ namespace operators { template struct ChannelDequantizeFunctorV2 { - void operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor* in, const framework::Tensor* scale, - T max_range, const int quant_axis, framework::Tensor* out) { + void operator()(const platform::CPUDeviceContext &dev_ctx, + const framework::Tensor *in, + const framework::Tensor *scale, + T max_range, + const int quant_axis, + framework::Tensor *out) { // Dequant op is before quantized op // Dequantize the weight of quantized op auto in_dims = in->dims(); const int64_t channel = in_dims[quant_axis]; - const T* scale_factor = scale->data(); + const T *scale_factor = scale->data(); if (quant_axis == 0) { for (int64_t i = 0; i < channel; i++) { T s = scale_factor[i]; @@ -41,7 +44,7 @@ struct ChannelDequantizeFunctorV2 { framework::Tensor one_channel_out = out->Slice(i, i + 1); auto in_e = framework::EigenVector::Flatten(one_channel_in); auto out_e = framework::EigenVector::Flatten(one_channel_out); - auto& dev = *dev_ctx.eigen_device(); + auto &dev = *dev_ctx.eigen_device(); out_e.device(dev) = in_e * s / max_range; } } else if (quant_axis == 1) { @@ -51,12 +54,12 @@ struct ChannelDequantizeFunctorV2 { } int64_t step_i = in->numel() / out_iter; int64_t step_j = in->numel() / (out_iter * channel); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(dev_ctx.GetPlace()); + auto *in_data = in->data(); + auto *out_data = out->mutable_data(dev_ctx.GetPlace()); for (int64_t i = 0; i < out_iter; i++) { for (int64_t j = 0; j < channel; j++) { - auto* cur_in = in_data + i * step_i + j * step_j; - auto* cur_out = out_data + i * step_i + j * step_j; + auto *cur_in = in_data + i * step_i + j * step_j; + auto *cur_out = out_data + i * step_i + j * step_j; T s = scale_factor[j]; for (int64_t k = 0; k < step_j; k++) { *cur_out = (*cur_in) * s / max_range; @@ -75,11 +78,11 @@ template struct ChannelDequantizeFunctorV2; class QuantizeLinearOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "QuantizeLinear"); OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "QuantizeLinear"); - OP_INOUT_CHECK(ctx->HasInput("ZeroPoint"), "Input", "ZeroPoint", - "QuantizeLinear"); + OP_INOUT_CHECK( + ctx->HasInput("ZeroPoint"), "Input", "ZeroPoint", "QuantizeLinear"); OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "QuantizeLinear"); ctx->SetOutputDim("Y", ctx->GetInputDim("X")); int quant_axis = ctx->Attrs().Get("quant_axis"); @@ -95,7 +98,7 @@ class QuantizeLinearOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } @@ -116,9 +119,10 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker { "For conv2d, depthwise_conv2d, conv2d_transpose " "and mul, the quant_axis is equal to 
the cout axis.") .SetDefault(0) - .AddCustomChecker([](const int& quant_axis) { + .AddCustomChecker([](const int &quant_axis) { PADDLE_ENFORCE_EQ( - quant_axis == 0 || quant_axis == 1 || quant_axis == -1, true, + quant_axis == 0 || quant_axis == 1 || quant_axis == -1, + true, platform::errors::InvalidArgument( "'quant_axis' should be 0 or 1, but " "the received is %d", @@ -126,8 +130,9 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("bit_length", "(int, default 8)") .SetDefault(8) - .AddCustomChecker([](const int& bit_length) { - PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, + .AddCustomChecker([](const int &bit_length) { + PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, + true, platform::errors::InvalidArgument( "'bit_length' should be between 1 and 16, but " "the received is %d", @@ -140,13 +145,17 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker { "1: rounding to nearest ties away from zero. Eg: round(1.5)=2, " "round(2.5)=3") .SetDefault(0) - .AddCustomChecker([](const int& round_type) { - PADDLE_ENFORCE_EQ(round_type >= 0 && round_type <= 1, true, - platform::errors::InvalidArgument( - "'round_type' should be between 0 and 1, but " - "the received is %d", - round_type)); - }); + .AddCustomChecker([](const int &round_type) { + PADDLE_ENFORCE_EQ( + round_type == 0 || round_type == 1, + true, + platform::errors::InvalidArgument( + "'round_type' should be 0 or 1, 0 rounding to " + "nearest ties to even and 1 is rounding to nearest " + "ties away from zero.but the received is %d", + round_type)); + }) + .AsExtra(); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") @@ -170,14 +179,18 @@ namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR( - quantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker, + quantize_linear, + ops::QuantizeLinearOp, + ops::QuantizeLinearOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(quantize_linear, ops::QuantizeLinearKernel); REGISTER_OPERATOR( - dequantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker, + dequantize_linear, + ops::QuantizeLinearOp, + ops::QuantizeLinearOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 9bcf3af1340..3926ee95036 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -121,8 +121,7 @@ class PostTrainingQuantization(object): algo="KL", hist_percent=0.99999, quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"], - weight_round_algo='round', - round_type='TiesToEven', + round_type='round', learning_rate=0.001, is_full_quantize=False, bias_correction=False, @@ -181,14 +180,10 @@ class PostTrainingQuantization(object): quantizable_op_type(list[str], optional): List the type of ops that will be quantized. Default is ["conv2d", "depthwise_conv2d", "mul"]. - weight_round_algo(str, optional): The method of converting the quantized weights + round_type(str, optional): The method of converting the quantized weights value float->int. Currently supports ['round', 'adaround'] methods. 
Default is `round`, which is rounding nearest to the integer. 'adaround' is refer to https://arxiv.org/abs/2004.10568. - round_type(str, optional): The method of converting the tensor value float->int. - Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods. - Default is `TiesToEven`, which is rounding to nearest ties to even. - 'TiesAwayFromZero' is rounding to nearest ties away from zero. learning_rate(float, optional): The learning rate of adaround method. is_full_quantized(bool, optional): If set is_full_quantized as True, apply quantization to all supported quantizable op type. If set @@ -269,10 +264,8 @@ class PostTrainingQuantization(object): self._support_algo_type = [ 'KL', 'hist', 'avg', 'mse', 'emd', 'abs_max', 'min_max' ] - assert round_type in ['TiesToEven', 'TiesAwayFromZero'] + assert round_type in ['adaround', 'round'] self._round_type = round_type - assert weight_round_algo in ['adaround', 'round'] - self._weight_round_algo = weight_round_algo self._learning_rate = learning_rate self._dynamic_quantize_op_type = ['lstm'] self._support_quantize_op_type = \ @@ -414,7 +407,7 @@ class PostTrainingQuantization(object): if self._algo in ["KL", "hist"]: self._calculate_kl_hist_threshold() - if self._weight_round_algo == 'adaround': + if self._round_type == 'adaround': self._adaround_apply() self._reset_activation_persistable() @@ -651,7 +644,6 @@ class PostTrainingQuantization(object): float(np.max(np.abs(var_tensor[i])))) self._quantized_threshold[var_name] = abs_max_value _logger.info("MSE searching stage ...") - distribution = np.round if self._round_type == 'TiesToEven' else utils.round_c for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) var_tensor = var_tensor.flatten() @@ -664,9 +656,14 @@ class PostTrainingQuantization(object): scale = s * abs_max_value s += 0.02 bins = 2**(self._activation_bits - 1) - 1 - quant_var = np.clip(distribution(var_tensor / scale * bins), - -bins - 1, bins) - quant_dequant_var = quant_var / bins * scale + if self._onnx_format: + quant_var = np.clip(distribution(var_tensor / scale * bins), + -bins - 1, bins) + quant_dequant_var = quant_var / bins * scale + else: + quant_dequant_var = np.round( + np.clip(var_tensor, 0.0, scale) / scale * + bins) / bins * scale mse_loss = ((var_tensor - quant_dequant_var)**2).mean() if mse_loss <= self._best_calibration_loss[var_name]: self._best_calibration_loss[var_name] = mse_loss @@ -691,7 +688,6 @@ class PostTrainingQuantization(object): float(np.max(np.abs(var_tensor[i])))) self._quantized_threshold[var_name] = abs_max_value _logger.info("EMD searching stage ...") - distribution = np.round if self._round_type == 'TiesToEven' else utils.round_c for var_name in self._quantized_act_var_name: var_tensor = utils.load_variable_data(self._scope, var_name) var_tensor = var_tensor.flatten() @@ -704,9 +700,14 @@ class PostTrainingQuantization(object): scale = s * abs_max_value s += 0.02 bins = 2**(self._activation_bits - 1) - 1 - quant_var = np.clip(distribution(var_tensor / scale * bins), - -bins - 1, bins) - quant_dequant_var = quant_var / bins * scale + if self._onnx_format: + quant_var = np.clip(distribution(var_tensor / scale * bins), + -bins - 1, bins) + quant_dequant_var = quant_var / bins * scale + else: + quant_dequant_var = np.round( + np.clip(var_tensor, 0.0, scale) / scale * + bins) / bins * scale emd_loss = np.abs( np.mean(var_tensor) - np.mean(quant_dequant_var)) + np.abs( np.std(var_tensor) - np.std(quant_dequant_var)) @@ -918,8 +919,7 @@ 
class PostTrainingQuantization(object): activation_bits=self._activation_bits, activation_quantize_type=self._activation_quantize_type, weight_quantize_type=self._weight_quantize_type, - quantizable_op_type=major_quantizable_op_types, - round_type=self._round_type) + quantizable_op_type=major_quantizable_op_types) else: transform_pass = QuantizationTransformPassV2( scope=self._scope, @@ -928,8 +928,7 @@ class PostTrainingQuantization(object): activation_bits=self._activation_bits, activation_quantize_type=self._activation_quantize_type, weight_quantize_type=self._weight_quantize_type, - quantizable_op_type=major_quantizable_op_types, - round_type=self._round_type) + quantizable_op_type=major_quantizable_op_types) for sub_graph in graph.all_sub_graphs(): # Insert fake_quant/fake_dequantize op must in test graph, so @@ -946,15 +945,13 @@ class PostTrainingQuantization(object): add_quant_dequant_pass = AddQuantDequantPass( scope=self._scope, place=self._place, - quantizable_op_type=minor_quantizable_op_types, - round_type=self._round_type) + quantizable_op_type=minor_quantizable_op_types) else: add_quant_dequant_pass = AddQuantDequantPassV2( scope=self._scope, place=self._place, quantizable_op_type=minor_quantizable_op_types, - is_full_quantized=self._is_full_quantize, - round_type=self._round_type) + is_full_quantized=self._is_full_quantize) for sub_graph in graph.all_sub_graphs(): sub_graph._for_test = True @@ -979,7 +976,6 @@ class PostTrainingQuantization(object): place=self._place, bias_correction=self._bias_correction, weight_bits=self._weight_bits, - weight_round_algo=self._weight_round_algo, round_type=self._round_type, activation_bits=self._activation_bits, weight_quantize_type=self._weight_quantize_type, diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 08d507284e4..3a316e9192e 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -119,7 +119,6 @@ class QuantizationTransformPass(object): moving_rate=0.9, skip_pattern=['skip_quant'], quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'], - round_type='TiesToEven', weight_quantize_func=None, act_quantize_func=None, weight_preprocess_func=None, @@ -157,10 +156,6 @@ class QuantizationTransformPass(object): quantizable_op_type(list[str]): List the type of ops that will be quantized. Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in QuantizationFreezePass and ConvertToInt8Pass must be the same as this. - round_type(str, optional): The method of converting the tensor value float->int. - Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods. - Default is `TiesToEven`, which is rounding to nearest ties to even. - 'TiesAwayFromZero' is rounding to nearest ties away from zero. weight_quantize_func(function): Function that defines how to quantize weight. Using this can quickly test if user's quantization method works or not. 
In this function, user should both define quantization function and @@ -211,7 +206,6 @@ class QuantizationTransformPass(object): self._weight_bits = weight_bits self._activation_bits = activation_bits self._skip_pattern = skip_pattern - self._round_type = round_type self._weight_quantize_func = weight_quantize_func self._act_quantize_func = act_quantize_func self._weight_preprocess_func = weight_preprocess_func @@ -465,12 +459,10 @@ class QuantizationTransformPass(object): _init_var_node(scale_var_node, np.zeros(scale_var_node.shape(), dtype=data_type), self._scope, self._place) - round_type = 0 if self._round_type == 'TiesToEven' else 1 quant_op_node = graph.create_op_node( op_type='fake_quantize_abs_max', attrs={ 'bit_length': quant_bits, - 'round_type': round_type, 'op_role': core.op_proto_and_checker_maker.OpRole.Forward }, inputs={'X': var_node}, @@ -525,11 +517,9 @@ class QuantizationTransformPass(object): inputs['Iter'] = self._global_step outputs['OutScales'] = scales_node - round_type = 0 if self._round_type == 'TiesToEven' else 1 attrs = { 'window_size': self._window_size, 'bit_length': quant_bits, - 'round_type': round_type, 'is_test': self._is_test, 'op_role': core.op_proto_and_checker_maker.OpRole.Forward } @@ -600,10 +590,8 @@ class QuantizationTransformPass(object): outs['OutState'] = state_out_node outs['OutAccum'] = accum_out_node - round_type = 0 if self._round_type == 'TiesToEven' else 1 attrs = { 'bit_length': quant_bits, - 'round_type': round_type, 'moving_rate': self._moving_rate, 'is_test': self._is_test, 'op_role': core.op_proto_and_checker_maker.OpRole.Forward @@ -650,12 +638,10 @@ class QuantizationTransformPass(object): _init_var_node(scale_var_node, np.zeros(scale_var_node.shape(), dtype=data_type), self._scope, self._place) - round_type = 0 if self._round_type == 'TiesToEven' else 1 quant_op_node = graph.create_op_node( op_type='fake_channel_wise_quantize_abs_max', attrs={ 'bit_length': quant_bits, - 'round_type': round_type, 'quant_axis': quant_axis, 'is_test': self._is_test, 'op_role': core.op_proto_and_checker_maker.OpRole.Forward @@ -949,8 +935,7 @@ class QuantizationFreezePass(object): bias_correction=False, weight_bits=8, activation_bits=8, - weight_round_algo='round', - round_type='TiesToEven', + round_type='round', weight_quantize_type='abs_max', quantizable_op_type=None): """ @@ -968,14 +953,10 @@ class QuantizationFreezePass(object): https://arxiv.org/abs/1810.05723. weight_bits(int): quantization bit number for weights. activation_bits(int): quantization bit number for activation. - weight_round_algo(str, optional): The method of converting the quantized weights + round_type(str, optional): The method of converting the quantized weights value float->int. Currently supports ['round', 'adaround'] methods. Default is `round`, which is rounding nearest to the integer. 'adaround' is refer to https://arxiv.org/abs/2004.10568. - round_type(str, optional): The method of converting the tensor value float->int. - Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods. - Default is `TiesToEven`, which is rounding to nearest ties to even. - 'TiesAwayFromZero' is rounding to nearest ties away from zero. weight_quantize_type(str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max' usually is not used for weight, since weights are fixed once the model is well trained. 
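For reference, the weight path described above for round_type='round' amounts to the clip-then-rescale branch of utils.quant_tensor followed by the explicit np.round that the freeze pass now applies. Below is a minimal NumPy sketch of that per-tensor flow; the helper name freeze_weight_round and the sample values are illustrative only, not from the patch:

    import numpy as np

    def freeze_weight_round(weight, scale, weight_bits=8):
        # Mirrors the per-tensor branch of the updated utils.quant_tensor
        # (clip to [-scale, scale], rescale by bnt/scale) plus the explicit
        # np.round the freeze pass applies afterwards.
        bnt = (1 << (weight_bits - 1)) - 1       # 127 for 8-bit weights
        scale = 1e-8 if scale == 0.0 else scale  # guard against a zero scale
        clipped = np.clip(weight, -scale, scale)
        return np.round(clipped / scale * bnt)   # integer-valued weights

    w = np.array([-1.2, -0.5, 0.26, 0.9])
    print(freeze_weight_round(w, scale=np.abs(w).max()))  # -> [-127. -53. 28. 95.]

Rounding is deliberately left out of quant_tensor's default branch, so callers such as this pass (and the bias-correction step) decide when the final np.round happens.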
@@ -991,7 +972,6 @@ class QuantizationFreezePass(object): self._place = _get_paddle_place(place) self._weight_bits = weight_bits self._activation_bits = activation_bits - self._weight_round_algo = weight_round_algo self._round_type = round_type self._weight_quantize_type = weight_quantize_type self._fake_quant_op_names = _fake_quant_op_list @@ -1039,7 +1019,7 @@ class QuantizationFreezePass(object): scale_v = scale_v.tolist() self._quant_var_scale_map[input_arg_name] = scale_v # Quantize weight and restore - if self._weight_round_algo == 'round': + if self._round_type == 'round': param_v = self._load_var(input_arg_name) if any( _check_grandchild_op_node(op_node, op) @@ -1049,7 +1029,8 @@ class QuantizationFreezePass(object): quant_axis = 0 quantized_param_v = utils.quant_tensor( param_v.copy(), scale_v, quant_axis, - self._weight_bits, self._round_type) + self._weight_bits) + quantized_param_v = np.round(quantized_param_v) # Weight bias correction if self._bias_correction == True: quantized_param_v = utils.bias_correction_w( @@ -1058,6 +1039,7 @@ class QuantizationFreezePass(object): scale_v, quant_axis, weight_bits=self._weight_bits) + quantized_param_v = np.round(quantized_param_v) self._restore_var(input_arg_name, quantized_param_v) self._remove_fake_quant_and_dequant_op(graph, op_node) @@ -1600,8 +1582,7 @@ class AddQuantDequantPass(object): quant_bits=8, skip_pattern=["skip_quant"], quantizable_op_type=["elementwise_add", "pool2d"], - is_full_quantized=False, - round_type='TiesToEven'): + is_full_quantized=False): """ Constructor. @@ -1623,10 +1604,6 @@ class AddQuantDequantPass(object): quantization to all supported quantizable op type. If set is_full_quantized as False, only apply quantization to the op type according to the input quantizable_op_type. - round_type(str, optional): The method of converting the tensor value float->int. - Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods. - Default is `TiesToEven`, which is rounding to nearest ties to even. - 'TiesAwayFromZero' is rounding to nearest ties away from zero. """ self._scope = scope self._place = _get_paddle_place(place) @@ -1634,7 +1611,6 @@ class AddQuantDequantPass(object): self._quant_bits = quant_bits self._is_test = None self._skip_pattern = skip_pattern - self._round_type = round_type if is_full_quantized: self._quantizable_op_type = utils._act_supported_quantizable_op_type @@ -1769,10 +1745,8 @@ class AddQuantDequantPass(object): outs['OutState'] = state_out_node outs['OutAccum'] = accum_out_node - round_type = 0 if self._round_type == 'TiesToEven' else 1 attrs = { 'bit_length': quant_bits, - 'round_type': round_type, 'moving_rate': self._moving_rate, 'is_test': self._is_test, 'op_role': core.op_proto_and_checker_maker.OpRole.Forward @@ -1812,10 +1786,6 @@ class InsertQuantizeLinear(object): Default is -1. channel_wise(bool, optional): Whether quantization with per channel or not. Default is False. is_test(bool, optional): Whether quantization with training or not. Default is True. - round_type(str, optional): The method of converting the tensor value float->int. - Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods. - Default is `TiesToEven`, which is rounding to nearest ties to even. - 'TiesAwayFromZero' is rounding to nearest ties away from zero. 
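The removed lines above are the last place the TiesToEven / TiesAwayFromZero naming appears in these passes; in the C++ kernels earlier in the patch the same distinction is carried by the integer round_type attribute (0 rounds ties to even and clips in the integer domain, 1 clips to [-s, s] first and rounds ties away from zero). A minimal NumPy sketch of the two behaviours follows; the helper names are illustrative only:

    import numpy as np

    def round_ties_to_even(x):
        return np.round(x)  # numpy's default rounding breaks ties to even

    def round_ties_away_from_zero(x):
        return np.floor(np.abs(x) + 0.5) * np.sign(x)

    halves = np.array([-2.5, -1.5, 0.5, 1.5, 2.5])
    print(round_ties_to_even(halves))         # [-2. -2.  0.  2.  2.]
    print(round_ties_away_from_zero(halves))  # [-3. -2.  1.  2.  3.]

    def fake_quant_dequant(x, s, bit_length=8, round_type=0):
        # Sketch of the two kernel paths in this patch: round_type=0 scales,
        # rounds ties to even and clips to [-bnt-1, bnt]; round_type=1 clips
        # to [-s, s] first, then scales and rounds ties away from zero.
        bnt = 2 ** (bit_length - 1) - 1
        if round_type == 0:
            q = np.clip(round_ties_to_even(x / s * bnt), -bnt - 1, bnt)
        else:
            q = round_ties_away_from_zero(np.clip(x, -s, s) / s * bnt)
        return q * s / bnt

Note that np.round, like roundWithTiesToEven in the patch, maps 2.5 to 2 rather than 3; that is the observable difference between the two round_type settings on values that fall exactly halfway between quantization levels.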
""" def __init__(self, @@ -1824,15 +1794,13 @@ class InsertQuantizeLinear(object): quant_bits=8, quant_axis=-1, channel_wise=False, - is_test=True, - round_type='TiesToEven'): + is_test=True): self._place = place self._scope = scope self.quant_bits = quant_bits self.quant_axis = quant_axis self.channel_wise = channel_wise self._is_test = is_test - self._round_type = round_type def insert_quant_op(self, graph, var_node): assert var_node.is_var(), '{} is not a var'.format(var_node.name()) @@ -1875,12 +1843,7 @@ class InsertQuantizeLinear(object): if zero_point_node is not None: inputs["ZeroPoint"] = zero_point_node - round_type = 0 if self._round_type == 'TiesToEven' else 1 - attrs = { - "quant_axis": self.quant_axis, - "bit_length": self.quant_bits, - "round_type": round_type - } + attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits} outputs = {"Y": quant_var_node} if not self._is_test: attrs["is_test"] = self._is_test @@ -1985,7 +1948,6 @@ class QuantizationTransformPassV2(object): moving_rate=0.9, skip_pattern=['skip_quant'], quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'], - round_type='TiesToEven', weight_quantize_func=None, act_quantize_func=None, weight_preprocess_func=None, @@ -2021,10 +1983,6 @@ class QuantizationTransformPassV2(object): quantizable_op_type(list[str]): List the type of ops that will be quantized. Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in QuantizationFreezePass and ConvertToInt8Pass must be the same as this. - round_type(str, optional): The method of converting the tensor value float->int. - Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods. - Default is `TiesToEven`, which is rounding to nearest ties to even. - 'TiesAwayFromZero' is rounding to nearest ties away from zero. weight_quantize_func(function): Function that defines how to quantize weight. Using this can quickly test if user's quantization method works or not. In this function, user should both define quantization function and @@ -2074,7 +2032,6 @@ class QuantizationTransformPassV2(object): self._weight_bits = weight_bits self._activation_bits = activation_bits self._skip_pattern = skip_pattern - self._round_type = round_type self._weight_quantize_func = weight_quantize_func self._act_quantize_func = act_quantize_func self._weight_preprocess_func = weight_preprocess_func @@ -2198,8 +2155,7 @@ class QuantizationTransformPassV2(object): quant_bits=quant_bits, quant_axis=quant_axis, channel_wise=channel_wise, - is_test=self._is_test, - round_type=self._round_type) + is_test=self._is_test) quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op( graph, var_node) dequant_var_node = insert_quant_pass.insert_dequant_op( @@ -2307,8 +2263,7 @@ class AddQuantDequantPassV2(object): quant_bits=8, skip_pattern=["skip_quant"], quantizable_op_type=["elementwise_add", "pool2d"], - is_full_quantized=False, - round_type='TiesToEven'): + is_full_quantized=False): """ Args: scope(paddle.Scope): The scope is used to initialize these new parameters. @@ -2328,10 +2283,6 @@ class AddQuantDequantPassV2(object): quantization to all supported quantizable op type. If set is_full_quantized as False, only apply quantization to the op type according to the input quantizable_op_type. - round_type(str, optional): The method of converting the tensor value float->int. - Currently supports ['TiesToEven', 'TiesAwayFromZero'] methods. - Default is `TiesToEven`, which is rounding to nearest ties to even. 
- 'TiesAwayFromZero' is rounding to nearest ties away from zero. Examples: .. code-block:: python @@ -2354,7 +2305,6 @@ class AddQuantDequantPassV2(object): self._quant_bits = quant_bits self._is_test = None self._skip_pattern = skip_pattern - self._round_type = round_type if is_full_quantized: self._quantizable_op_type = utils._act_supported_quantizable_op_type @@ -2427,8 +2377,7 @@ class AddQuantDequantPassV2(object): quant_bits=self._quant_bits, quant_axis=-1, channel_wise=False, - is_test=self._is_test, - round_type=self._round_type) + is_test=self._is_test) quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op( graph, in_node) dequant_var_node = insert_quant_pass.insert_dequant_op( @@ -2511,8 +2460,6 @@ class ReplaceFakeQuantDequantPass(object): "quant_axis") else -1 bit_length = op.op().attr("bit_length") if op.op().has_attr( "bit_length") else 8 - round_type = op.op().attr("round_type") if op.op().has_attr( - "round_type") else 0 zero_point_node = None quanted_node = x_node @@ -2534,8 +2481,7 @@ class ReplaceFakeQuantDequantPass(object): quant_op_node = graph.create_op_node(op_type="quantize_linear", attrs={ "quant_axis": quant_axis, - "bit_length": bit_length, - "round_type": round_type + "bit_length": bit_length }, inputs={ "X": x_node, @@ -2654,11 +2600,11 @@ class QuantWeightPass(object): param_v = self._load_var(x_node.name()) quant_axis = _op.op().attr("quant_axis") bits_length = _op.op().attr("bit_length") - round_type = _op.op().attr("round_type") if _op.op().has_attr( - "round_type") else 0 - quantized_param_v = utils.quant_tensor(param_v.copy(), scale_v, - quant_axis, bits_length, - round_type) + quantized_param_v = utils.quant_tensor(param_v.copy(), + scale_v, + quant_axis, + bits_length, + onnx_format=True) if self._bias_correction == True: quantized_param_v = utils.bias_correction_w( param_v, diff --git a/python/paddle/fluid/contrib/slim/quantization/utils.py b/python/paddle/fluid/contrib/slim/quantization/utils.py index e396ce9dee2..28efcd2d591 100644 --- a/python/paddle/fluid/contrib/slim/quantization/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/utils.py @@ -321,39 +321,41 @@ def set_variable_data(scope, place, var_name, np_value): tensor.set(np_value, place) -def round_c_single_element(val): - dtype = type(val) - if val >= 0: - return dtype(np.floor(val + 0.5)) - return dtype(np.ceil(val - 0.5)) +def quant_tensor(x, scale, quant_axis=0, weight_bits=8, onnx_format=False): + # symmetry quant + def _clip(x, scale): + x[x > scale] = scale + x[x < -scale] = -scale + return x - -# rounding to nearest ties away from zero -round_c = np.vectorize(round_c_single_element) - - -def quant_tensor(x, - scale, - quant_axis=0, - weight_bits=8, - round_type='TiesToEven'): assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.' 
- distribution = np.round if round_type == 'TiesToEven' else round_c bnt = (1 << (weight_bits - 1)) - 1 if isinstance(scale, list): for i, s in enumerate(scale): if s == 0.0: s = 1e-8 if quant_axis == 0: - x[i] = distribution(x[i] / s * bnt) - x[i] = np.clip(x[i], -bnt - 1, bnt) + if onnx_format: + x[i] = np.round(x[i] / s * bnt) + x[i] = np.clip(x[i], -bnt - 1, bnt) + else: + x[i] = _clip(x[i], s) + x[i] = x[i] / s * bnt else: - x[:, i] = distribution(x[:, i] / s * bnt) - x[:, i] = np.clip(x[:, i], -bnt - 1, bnt) + if onnx_format: + x[:, i] = np.round(x[:, i] / s * bnt) + x[:, i] = np.clip(x[:, i], -bnt - 1, bnt) + else: + x[:, i] = _clip(x[:, i], s) + x[:, i] = x[:, i] / s * bnt else: scale = 1e-8 if scale == 0.0 else scale - x = distribution(x / scale * bnt) - x = np.clip(x, -bnt - 1, bnt) + if onnx_format: + x = np.round(x / scale * bnt) + x = np.clip(x, -bnt - 1, bnt) + else: + x = _clip(x, scale) + x = x / scale * bnt return x diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py index befc76c0275..6100ed4f82a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py @@ -165,7 +165,7 @@ class TestPostTrainingQuantization(unittest.TestCase): model_path, data_path, algo="KL", - weight_round_algo="round", + round_type="round", quantizable_op_type=["conv2d"], is_full_quantize=False, is_use_cache_file=False, @@ -185,7 +185,7 @@ class TestPostTrainingQuantization(unittest.TestCase): batch_nums=batch_nums, algo=algo, quantizable_op_type=quantizable_op_type, - weight_round_algo=weight_round_algo, + round_type=round_type, is_full_quantize=is_full_quantize, optimize_model=is_optimize_model, onnx_format=onnx_format, @@ -201,7 +201,7 @@ class TestPostTrainingQuantization(unittest.TestCase): data_url, data_md5, algo, - weight_round_algo, + round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, @@ -224,7 +224,7 @@ class TestPostTrainingQuantization(unittest.TestCase): print("Start post training quantization for {0} on {1} samples ...". 
format(model_name, quant_iterations)) self.generate_quantized_model(fp32_model_path, data_path, algo, - weight_round_algo, quantizable_op_type, + round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, quant_iterations, onnx_format) @@ -255,7 +255,7 @@ class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization): data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" data_md5 = "add84c754e9b792fea1fbd728d134ab7" algo = "avg" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["mul", "lstm"] is_full_quantize = False is_use_cache_file = False @@ -264,7 +264,7 @@ class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization): infer_iterations = 100 quant_iterations = 10 self.run_test(model_name, model_url, model_md5, data_name, data_url, - data_md5, algo, weight_round_algo, quantizable_op_type, + data_md5, algo, round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, infer_iterations, quant_iterations) @@ -279,7 +279,7 @@ class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization): data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" data_md5 = "add84c754e9b792fea1fbd728d134ab7" algo = "avg" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["mul", "lstm"] is_full_quantize = False is_use_cache_file = False @@ -295,7 +295,7 @@ class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization): data_url, data_md5, algo, - weight_round_algo, + round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py index 71e974f8981..807bdbf8a9a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py @@ -108,7 +108,7 @@ class TestPostTrainingQuantization(unittest.TestCase): def generate_quantized_model(self, model_path, algo="KL", - weight_round_algo="round", + round_type="round", quantizable_op_type=["conv2d"], is_full_quantize=False, is_use_cache_file=False, @@ -130,7 +130,7 @@ class TestPostTrainingQuantization(unittest.TestCase): batch_nums=batch_nums, algo=algo, quantizable_op_type=quantizable_op_type, - weight_round_algo=weight_round_algo, + round_type=round_type, is_full_quantize=is_full_quantize, optimize_model=is_optimize_model, bias_correction=bias_correction, @@ -145,7 +145,7 @@ class TestPostTrainingQuantization(unittest.TestCase): data_url, data_md5, algo, - weight_round_algo, + round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, @@ -169,11 +169,10 @@ class TestPostTrainingQuantization(unittest.TestCase): print("Start INT8 post training quantization for {0} on {1} images ...". 
format(model_name, quant_iterations * batch_size)) - self.generate_quantized_model(origin_model_path, algo, - weight_round_algo, quantizable_op_type, - is_full_quantize, is_use_cache_file, - is_optimize_model, batch_size, - quant_iterations, onnx_format, + self.generate_quantized_model(origin_model_path, algo, round_type, + quantizable_op_type, is_full_quantize, + is_use_cache_file, is_optimize_model, + batch_size, quant_iterations, onnx_format, skip_tensor_list, bias_correction) print("Start INT8 inference for {0} on {1} images ...".format( @@ -204,7 +203,7 @@ class TestPostTrainingKLForMnist(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "KL" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -213,7 +212,7 @@ class TestPostTrainingKLForMnist(TestPostTrainingQuantization): batch_size = 10 infer_iterations = 50 quant_iterations = 5 - self.run_test(model_name, data_url, data_md5, algo, weight_round_algo, + self.run_test(model_name, data_url, data_md5, algo, round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, batch_size, infer_iterations, quant_iterations) @@ -226,7 +225,7 @@ class TestPostTraininghistForMnist(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "hist" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -235,7 +234,7 @@ class TestPostTraininghistForMnist(TestPostTrainingQuantization): batch_size = 10 infer_iterations = 50 quant_iterations = 5 - self.run_test(model_name, data_url, data_md5, algo, weight_round_algo, + self.run_test(model_name, data_url, data_md5, algo, round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, batch_size, infer_iterations, quant_iterations) @@ -248,7 +247,7 @@ class TestPostTrainingmseForMnist(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "mse" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -257,7 +256,7 @@ class TestPostTrainingmseForMnist(TestPostTrainingQuantization): batch_size = 10 infer_iterations = 50 quant_iterations = 5 - self.run_test(model_name, data_url, data_md5, algo, weight_round_algo, + self.run_test(model_name, data_url, data_md5, algo, round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, batch_size, infer_iterations, quant_iterations) @@ -270,7 +269,7 @@ class TestPostTrainingemdForMnist(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "emd" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -279,7 +278,7 @@ class TestPostTrainingemdForMnist(TestPostTrainingQuantization): batch_size = 10 infer_iterations = 50 quant_iterations = 5 - self.run_test(model_name, data_url, 
data_md5, algo, weight_round_algo, + self.run_test(model_name, data_url, data_md5, algo, round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, batch_size, infer_iterations, quant_iterations) @@ -292,7 +291,7 @@ class TestPostTrainingavgForMnist(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "avg" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -301,7 +300,7 @@ class TestPostTrainingavgForMnist(TestPostTrainingQuantization): batch_size = 10 infer_iterations = 50 quant_iterations = 5 - self.run_test(model_name, data_url, data_md5, algo, weight_round_algo, + self.run_test(model_name, data_url, data_md5, algo, round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, batch_size, infer_iterations, quant_iterations) @@ -314,7 +313,7 @@ class TestPostTrainingAbsMaxForMnist(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "abs_max" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["conv2d", "mul"] is_full_quantize = True is_use_cache_file = False @@ -323,7 +322,7 @@ class TestPostTrainingAbsMaxForMnist(TestPostTrainingQuantization): batch_size = 10 infer_iterations = 50 quant_iterations = 10 - self.run_test(model_name, data_url, data_md5, algo, weight_round_algo, + self.run_test(model_name, data_url, data_md5, algo, round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, batch_size, infer_iterations, quant_iterations) @@ -336,7 +335,7 @@ class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "mse" - weight_round_algo = "adaround" + round_type = "adaround" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -350,7 +349,7 @@ class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization): data_url, data_md5, algo, - weight_round_algo, + round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, @@ -369,7 +368,7 @@ class TestPostTrainingKLAdaroundForMnist(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "KL" - weight_round_algo = "adaround" + round_type = "adaround" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -378,7 +377,7 @@ class TestPostTrainingKLAdaroundForMnist(TestPostTrainingQuantization): batch_size = 10 infer_iterations = 50 quant_iterations = 5 - self.run_test(model_name, data_url, data_md5, algo, weight_round_algo, + self.run_test(model_name, data_url, data_md5, algo, round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, batch_size, infer_iterations, quant_iterations) @@ -391,7 +390,7 @@ class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "mse" - weight_round_algo 
= "round" + round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -405,7 +404,7 @@ class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization): data_url, data_md5, algo, - weight_round_algo, + round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, @@ -425,7 +424,7 @@ class TestPostTrainingmseForMnistONNXFormatFullQuant( data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "mse" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = True is_use_cache_file = False @@ -439,7 +438,7 @@ class TestPostTrainingmseForMnistONNXFormatFullQuant( data_url, data_md5, algo, - weight_round_algo, + round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, @@ -458,7 +457,7 @@ class TestPostTrainingavgForMnistSkipOP(TestPostTrainingQuantization): data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" algo = "avg" - weight_round_algo = "round" + round_type = "round" quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] is_full_quantize = False is_use_cache_file = False @@ -472,7 +471,7 @@ class TestPostTrainingavgForMnistSkipOP(TestPostTrainingQuantization): data_url, data_md5, algo, - weight_round_algo, + round_type, quantizable_op_type, is_full_quantize, is_use_cache_file, diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index fac0dcc3413..9c076d85fd2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -242,7 +242,7 @@ class TestPostTrainingQuantization(unittest.TestCase): model_path, quantizable_op_type, algo="KL", - weight_round_algo="round", + round_type="round", is_full_quantize=False, is_use_cache_file=False, is_optimize_model=False, @@ -264,7 +264,7 @@ class TestPostTrainingQuantization(unittest.TestCase): model_dir=model_path, algo=algo, quantizable_op_type=quantizable_op_type, - weight_round_algo=weight_round_algo, + round_type=round_type, is_full_quantize=is_full_quantize, optimize_model=is_optimize_model, onnx_format=onnx_format, @@ -275,7 +275,7 @@ class TestPostTrainingQuantization(unittest.TestCase): def run_test(self, model, algo, - weight_round_algo, + round_type, data_urls, data_md5s, quantizable_op_type, @@ -299,10 +299,9 @@ class TestPostTrainingQuantization(unittest.TestCase): print("Start INT8 post training quantization for {0} on {1} images ...". 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
index fac0dcc3413..9c076d85fd2 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py
@@ -242,7 +242,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
                                  model_path,
                                  quantizable_op_type,
                                  algo="KL",
-                                 weight_round_algo="round",
+                                 round_type="round",
                                  is_full_quantize=False,
                                  is_use_cache_file=False,
                                  is_optimize_model=False,
@@ -264,7 +264,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
                                        model_dir=model_path,
                                        algo=algo,
                                        quantizable_op_type=quantizable_op_type,
-                                       weight_round_algo=weight_round_algo,
+                                       round_type=round_type,
                                        is_full_quantize=is_full_quantize,
                                        optimize_model=is_optimize_model,
                                        onnx_format=onnx_format,
@@ -275,7 +275,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
     def run_test(self,
                  model,
                  algo,
-                 weight_round_algo,
+                 round_type,
                  data_urls,
                  data_md5s,
                  quantizable_op_type,
@@ -299,10 +299,9 @@ class TestPostTrainingQuantization(unittest.TestCase):
         print("Start INT8 post training quantization for {0} on {1} images ...".
               format(model, sample_iterations * batch_size))
         self.generate_quantized_model(model_cache_folder + "/model",
-                                      quantizable_op_type, algo,
-                                      weight_round_algo, is_full_quantize,
-                                      is_use_cache_file, is_optimize_model,
-                                      onnx_format)
+                                      quantizable_op_type, algo, round_type,
+                                      is_full_quantize, is_use_cache_file,
+                                      is_optimize_model, onnx_format)

         print("Start INT8 inference for {0} on {1} images ...".format(
             model, infer_iterations * batch_size))
@@ -330,7 +329,7 @@ class TestPostTrainingKLForMobilenetv1(TestPostTrainingQuantization):
     def test_post_training_kl_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "KL"
-        weight_round_algo = "round"
+        round_type = "round"
         data_urls = [
             'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
         ]
@@ -345,7 +344,7 @@ class TestPostTrainingKLForMobilenetv1(TestPostTrainingQuantization):
         is_use_cache_file = False
         is_optimize_model = True
         diff_threshold = 0.025
-        self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
+        self.run_test(model, algo, round_type, data_urls, data_md5s,
                       quantizable_op_type, is_full_quantize, is_use_cache_file,
                       is_optimize_model, diff_threshold)

@@ -355,7 +354,7 @@ class TestPostTrainingavgForMobilenetv1(TestPostTrainingQuantization):
     def test_post_training_avg_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "avg"
-        weight_round_algo = "round"
+        round_type = "round"
         data_urls = [
             'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
         ]
@@ -369,7 +368,7 @@ class TestPostTrainingavgForMobilenetv1(TestPostTrainingQuantization):
         is_use_cache_file = False
         is_optimize_model = True
         diff_threshold = 0.025
-        self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
+        self.run_test(model, algo, round_type, data_urls, data_md5s,
                       quantizable_op_type, is_full_quantize, is_use_cache_file,
                       is_optimize_model, diff_threshold)

@@ -379,7 +378,7 @@ class TestPostTraininghistForMobilenetv1(TestPostTrainingQuantization):
     def test_post_training_hist_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "hist"
-        weight_round_algo = "round"
+        round_type = "round"
         data_urls = [
             'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
         ]
@@ -393,7 +392,7 @@ class TestPostTraininghistForMobilenetv1(TestPostTrainingQuantization):
         is_use_cache_file = False
         is_optimize_model = True
         diff_threshold = 0.03
-        self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
+        self.run_test(model, algo, round_type, data_urls, data_md5s,
                       quantizable_op_type, is_full_quantize, is_use_cache_file,
                       is_optimize_model, diff_threshold)

@@ -403,7 +402,7 @@ class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization):
     def test_post_training_abs_max_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "abs_max"
-        weight_round_algo = "round"
+        round_type = "round"
         data_urls = [
             'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
         ]
@@ -417,7 +416,7 @@ class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization):
         is_optimize_model = False
         # The accuracy diff of post-training quantization (abs_max) maybe bigger
         diff_threshold = 0.05
-        self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
+        self.run_test(model, algo, round_type, data_urls, data_md5s,
                       quantizable_op_type, is_full_quantize, is_use_cache_file,
                       is_optimize_model, diff_threshold)

@@ -427,7 +426,7 @@ class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization):
     def test_post_training_onnx_format_mobilenetv1(self):
         model = "MobileNet-V1"
         algo = "avg"
-        weight_round_algo = "round"
+        round_type = "round"
         data_urls = [
             'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
         ]
@@ -444,7 +443,7 @@ class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization):
         diff_threshold = 0.05
         self.run_test(model,
                       algo,
-                      weight_round_algo,
+                      round_type,
                       data_urls,
                       data_md5s,
                       quantizable_op_type,
diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
index 78c5153b742..c79499100ce 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py
@@ -25,7 +25,7 @@ class TestPostTrainingForResnet50(TestPostTrainingQuantization):
     def test_post_training_resnet50(self):
         model = "ResNet-50"
         algo = "min_max"
-        weight_round_algo = "round"
+        round_type = "round"
         data_urls = [
             'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
         ]
@@ -35,7 +35,7 @@ class TestPostTrainingForResnet50(TestPostTrainingQuantization):
         is_use_cache_file = False
         is_optimize_model = False
         diff_threshold = 0.025
-        self.run_test(model, algo, weight_round_algo, data_urls, data_md5s,
+        self.run_test(model, algo, round_type, data_urls, data_md5s,
                       quantizable_op_type, is_full_quantize, is_use_cache_file,
                       is_optimize_model, diff_threshold)

@@ -45,7 +45,7 @@ class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingQuantization):
     def test_post_training_resnet50(self):
         model = "ResNet-50"
         algo = "min_max"
-        weight_round_algo = "round"
+        round_type = "round"
         data_urls = [
             'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
         ]
@@ -58,7 +58,7 @@ class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingQuantization):
         onnx_format = True
         self.run_test(model,
                       algo,
-                      weight_round_algo,
+                      round_type,
                       data_urls,
                       data_md5s,
                       quantizable_op_type,
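Note on the unit-test hunks that follow: they pin down the two rounding modes selected by the integer round_type attribute. round_type=0 rounds ties to the nearest even value (numpy's default np.round), while round_type=1 rounds ties away from zero via the round_c helper defined near the top of test_fake_quantize_op.py (outside this excerpt). A minimal, self-contained sketch of the difference; round_half_away below is an assumed stand-in for round_c, not the actual helper:

import numpy as np

def round_half_away(x):
    # Round halves away from zero: 0.5 -> 1, -0.5 -> -1 (assumed equivalent of round_c).
    return np.sign(x) * np.floor(np.abs(x) + 0.5)

ties = np.array([-2.5, -1.5, -0.5, 0.5, 1.5, 2.5])
print(np.round(ties))         # ties to even:        [-2. -2. -0.  0.  2.  2.]
print(round_half_away(ties))  # ties away from zero: [-3. -2. -1.  1.  2.  3.]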
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index e76d5c49d9e..02fff35fec7 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -49,7 +49,7 @@ class TestFakeQuantizeAbsMaxOp(OpTest):
                                dtype,
                                input_shape,
                                distribution,
-                               round_type='TiesToEven'):
+                               round_type='TiesAwayFromZero'):
         input_data = distribution(input_shape).astype(dtype)
         compute_type = get_compute_type(dtype)
         scale = np.max(np.abs(input_data))
@@ -58,12 +58,12 @@ class TestFakeQuantizeAbsMaxOp(OpTest):
         if round_type == 'TiesToEven':
             round_out = np.round(
                 input_data.astype(compute_type) * inv_scale * bnt)
+            output_data = np.clip(round_out, -bnt - 1, bnt)
             self.attrs['round_type'] = 0
         else:
-            round_out = round_c(
+            output_data = round_c(
                 input_data.astype(compute_type) * inv_scale * bnt)
             self.attrs['round_type'] = 1
-        output_data = np.clip(round_out, -bnt - 1, bnt)
         self.inputs = {'X': input_data}
         self.outputs = {'Out': output_data, 'OutScale': scale}
         self.dtype = dtype
@@ -75,7 +75,7 @@ class TestFakeQuantizeAbsMaxOp(OpTest):
     def test_fake_quantize_abs_max_round1(self):
         self._fake_quantize_abs_max(np.float32, (124, 240),
                                     np.random.random,
-                                    round_type='TiesAwayFromZero')
+                                    round_type='TiesToEven')

     def test_fake_quantize_abs_max_float16(self):
         self._fake_quantize_abs_max(np.float16, (124, 240), np.random.random)
@@ -110,12 +110,12 @@ class TestFakeChannelWiseQuantizeAbsMaxOp(OpTest):
         if round_type == 'TiesToEven':
             round_out = np.round(
                 input_data.astype(compute_type) / scale_broadcast * bnt)
+            output_data = np.clip(round_out, -bnt - 1, bnt)
             self.attrs['round_type'] = 0
         else:
-            round_out = round_c(
-                input_data.astype(compute_type) / scale_broadcast * bnt)
+            output_data = round_c(bnt * input_data.astype(compute_type) /
+                                  scale_broadcast)
             self.attrs['round_type'] = 1
-        output_data = np.clip(round_out, -bnt - 1, bnt)
         if quant_axis == 1:
             scale_broadcast = np.transpose(scale_broadcast,
                                            (1, ) + compute_axis)
@@ -169,11 +169,15 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest):
             round_out = np.round(
                 input_data.astype(compute_type) / out_scale[0] * bnt)
             self.attrs['round_type'] = 0
+            output_data = np.clip(round_out, -bnt - 1, bnt)
         else:
-            round_out = round_c(
-                input_data.astype(compute_type) / out_scale[0] * bnt)
+            if is_test:
+                clip_data = np.clip(input_data, -in_scale, in_scale)
+            else:
+                clip_data = input_data
+            output_data = round_c(
+                clip_data.astype(compute_type) / out_scale[0] * bnt)
             self.attrs['round_type'] = 1
-        output_data = np.clip(round_out, -bnt - 1, bnt)
         self.inputs = {
             'X': input_data,
             'Iter': np.zeros(1).astype(np.int64),
@@ -250,7 +254,7 @@ class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest):
                                               distribution,
                                               dequantize=False,
                                               with_gradient=False,
-                                              round_type='TiesToEven'):
+                                              round_type='TiesAwayFromZero'):
         input_data = distribution(input_shape).astype(dtype)
         compute_type = get_compute_type(dtype)
         bnt = (1 << (self.attrs['bit_length'] - 1)) - 1
@@ -267,12 +271,12 @@ class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest):
         if round_type == 'TiesToEven':
             round_out = np.round(
                 input_data.astype(compute_type) / out_scale * bnt)
+            quant_data = np.clip(round_out, -bnt - 1, bnt)
             self.attrs['round_type'] = 0
         else:
-            round_out = round_c(
+            quant_data = round_c(
                 input_data.astype(compute_type) / out_scale * bnt)
             self.attrs['round_type'] = 1
-        quant_data = np.clip(round_out, -bnt - 1, bnt)
         if dequantize:
             output_data = (quant_data * out_scale / bnt).astype(dtype)
             self.op_type = 'fake_quantize_dequantize_moving_average_abs_max'
@@ -307,10 +311,9 @@ class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest):
                                                    np.random.random)

     def test_fake_quantize_moving_average_abs_max_round1(self):
-        self._fake_quantize_moving_average_abs_max(
-            np.float32, (8, 16, 7, 7),
-            np.random.random,
-            round_type='TiesAwayFromZero')
+        self._fake_quantize_moving_average_abs_max(np.float32, (8, 16, 7, 7),
+                                                   np.random.random,
+                                                   round_type='TiesToEven')

     def test_fake_quantize_dequantize_moving_average_abs_max(self):
         self._fake_quantize_moving_average_abs_max(np.float32, (8, 16, 7, 7),
@@ -329,17 +332,17 @@ class TestFakeQuantizeDequantizeAbsMaxOp(OpTest):
                                           dtype,
                                           input_shape,
                                           distribution,
-                                          round_type='TiesToEven'):
+                                          round_type='TiesAwayFromZero'):
         input_data = distribution(input_shape).astype(dtype)
         scale = np.max(np.abs(input_data)).astype(dtype)
         bnt = (1 << (self.attrs['bit_length'] - 1)) - 1
         if round_type == 'TiesToEven':
             round_out = np.round(input_data / scale * bnt)
+            output_data = np.clip(round_out, -bnt - 1, bnt) * scale / bnt
             self.attrs['round_type'] = 0
         else:
-            round_out = round_c(input_data / scale * bnt)
+            output_data = round_c(input_data / scale * bnt) * scale / bnt
             self.attrs['round_type'] = 1
-        output_data = np.clip(round_out, -bnt - 1, bnt) * scale / bnt
         self.inputs = {'X': input_data}
         self.outputs = {
             'Out': output_data,
@@ -357,7 +360,7 @@ class TestFakeQuantizeDequantizeAbsMaxOp(OpTest):
     def test_fake_quantize_dequantize_abs_max_round1(self):
         self._fake_quantize_dequantize_abs_max(np.float32, (124, 240),
                                                np.random.random,
-                                               round_type='TiesAwayFromZero')
+                                               round_type='TiesToEven')


 class TestChannelWiseFakeQuantizeDequantizeAbsMaxOp(OpTest):
@@ -382,11 +385,13 @@ class TestChannelWiseFakeQuantizeDequantizeAbsMaxOp(OpTest):
         scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True)
         if round_type == 'TiesToEven':
             round_out = np.round(bnt * output_data / scale_broadcast)
+            output_data = np.clip(round_out, -bnt - 1,
+                                  bnt) * scale_broadcast / bnt
             self.attrs['round_type'] = 0
         else:
-            round_out = round_c(bnt * output_data / scale_broadcast)
+            output_data = round_c(
+                bnt * output_data / scale_broadcast) * scale_broadcast / bnt
             self.attrs['round_type'] = 1
-        output_data = np.clip(round_out, -bnt - 1, bnt) * scale_broadcast / bnt
         if quant_axis == 1:
             scale_broadcast = np.transpose(scale_broadcast,
                                            (1, ) + compute_axis)
-- 
GitLab
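Editorial note on the behavior these test hunks pin down: for round_type=0 the reference output is computed as round-then-clip (np.round followed by np.clip to [-bnt-1, bnt]), while for round_type=1 the tests either clip the input to the scale range first (the range_abs_max test when is_test) or rely on the abs_max scale already bounding the input, and then round away from zero with no further clip of the integer result. A small numpy sketch of both reference paths, mirroring the abs_max tests above; fake_quant_abs_max and round_half_away are illustrative names introduced here, not functions from the patch:

import numpy as np

def round_half_away(x):
    # Assumed equivalent of the tests' round_c helper (ties away from zero).
    return np.sign(x) * np.floor(np.abs(x) + 0.5)

def fake_quant_abs_max(x, bit_length=8, round_type=0):
    bnt = (1 << (bit_length - 1)) - 1          # 127 for 8-bit quantization
    scale = np.max(np.abs(x))
    if round_type == 0:
        # TiesToEven: round first, then clip the integer result.
        return np.clip(np.round(x / scale * bnt), -bnt - 1, bnt), scale
    # TiesAwayFromZero: clip the input to [-scale, scale], then round.
    # (The clip is a no-op for abs_max scaling, where scale = max|x|.)
    return round_half_away(np.clip(x, -scale, scale) / scale * bnt), scale

x = np.random.uniform(-1, 1, (4, 4)).astype(np.float32)
q0, _ = fake_quant_abs_max(x, round_type=0)   # matches the round_type=0 references
q1, _ = fake_quant_abs_max(x, round_type=1)   # matches the round_type=1 references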