Unverified commit b72a7ebb authored by Guanghua Yu and committed by GitHub

add new format of quantization (#41041)

Parent b9ee846e
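For orientation, the new format replaces the fake_quantize_* / fake_dequantize_* op pairs with explicit quantize_linear and dequantize_linear operators. The sketch below is a minimal NumPy illustration of the symmetric linear quant/dequant round trip those ops implement for the per-tensor case (bit_length = 8); the function names are illustrative, not the operator API.

import numpy as np

def quantize_linear(x, scale, bit_length=8):
    # bin_cnt = 2^(bit_length - 1) - 1, i.e. 127 for int8
    bin_cnt = 2 ** (bit_length - 1) - 1
    clipped = np.clip(x, -scale, scale)
    return np.round(clipped * bin_cnt / scale)

def dequantize_linear(q, scale, bit_length=8):
    max_range = 2 ** (bit_length - 1) - 1
    return q * scale / max_range

x = np.array([0.05, -1.2, 0.7], dtype=np.float32)
scale = np.abs(x).max()              # abs-max scale, as FindAbsMaxFunctor computes
q = quantize_linear(x, scale)        # integer levels in [-127, 127]
x_rec = dequantize_linear(q, scale)  # approximate reconstruction of x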
@@ -102,10 +102,11 @@ endif()
set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel)
register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
op_library(quantize_linear_op DEPS cast_kernel)
op_library(save_combine_op DEPS string_array)
op_library(load_combine_op DEPS string_array)
......
@@ -12,142 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.cu.h"
#include "paddle/fluid/operators/fake_dequantize_op.h" #include "paddle/fluid/operators/fake_dequantize_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num,
T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) {
out[idx] = in[idx] * scale[0] / max_range;
}
}
template <typename T>
struct DequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
const T* in_data = in->data<T>();
const T* scale_factor = scale->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = in->numel();
int block = 512;
int grid = (num + block - 1) / block;
KeDequantize<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, out_data);
}
};
template <typename T>
__global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale,
T max_range, int num, int channel,
T* out) {
int tid = threadIdx.x;
int channel_size = num / channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale[blockIdx.x] / max_range;
}
}
template <typename T>
__global__ void DequantizeOneScaleQuantAxisN(const T* in, const T* scale,
const T max_range,
const int64_t num,
const int n_scales,
const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % n_scales];
out[i] = in[i] * s / max_range;
}
}
template <typename T>
__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
const T* scale_two, T max_range, int num,
int iter_size, int channel, T* out) {
int tid = threadIdx.x;
int channel_size = num / (iter_size * channel);
int scale_index = blockIdx.x % channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range;
}
}
template <typename T>
struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor** scales,
const int scale_num, T max_range, const int quant_axis,
const int x_num_col_dims, framework::Tensor* out) {
auto in_dims = in->dims();
const T* in_data = in->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
if (scale_num == 1) {
int64_t num = in->numel();
const T* scale_factor = scales[0]->data<T>();
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
DequantizeOneScaleQuantAxis0<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[0], out_data);
} else {
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(
((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
DequantizeOneScaleQuantAxisN<
T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[quant_axis],
quant_stride, out_data);
}
} else if (scale_num == 2) {
// Not need to consider quant_axis
int num = in->numel();
int iter_size = 1;
for (int i = 0; i < x_num_col_dims; i++) {
iter_size *= in->dims()[i];
}
int channel = in->dims()[x_num_col_dims];
const T* scale_one = scales[0]->data<T>();
const T* scale_two = scales[1]->data<T>();
int block = 1024;
int grid = iter_size * channel;
DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_one, scale_two, max_range, num, iter_size, channel,
out_data);
}
}
};
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_FAKE_DEQUANTIZE_OP_CU_H_
#define PADDLE_FLUID_OPERATORS_FAKE_DEQUANTIZE_OP_CU_H_
#include "paddle/fluid/operators/fake_dequantize_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void KeDequantize(const T* in, const T* scale, T max_range,
int64_t num, T* out) {
int64_t idx = threadIdx.x + blockIdx.x * blockDim.x;
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
out[i] = in[i] * scale[0] / max_range;
}
}
template <typename T>
struct DequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
const T* in_data = in->data<T>();
const T* scale_factor = scale->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int64_t num = in->numel();
int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks =
std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
KeDequantize<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, out_data);
}
};
template <typename T>
__global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale,
T max_range, int num, int channel,
T* out) {
int tid = threadIdx.x;
int channel_size = num / channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale[blockIdx.x] / max_range;
}
}
template <typename T>
__global__ void DequantizeOneScaleQuantAxisN(const T* in, const T* scale,
const T max_range,
const int64_t num,
const int n_scales,
const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % n_scales];
out[i] = in[i] * s / max_range;
}
}
template <typename T>
__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
const T* scale_two, T max_range, int num,
int iter_size, int channel, T* out) {
int tid = threadIdx.x;
int channel_size = num / (iter_size * channel);
int scale_index = blockIdx.x % channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range;
}
}
template <typename T>
struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor** scales,
const int scale_num, T max_range, const int quant_axis,
const int x_num_col_dims, framework::Tensor* out) {
auto in_dims = in->dims();
const T* in_data = in->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
if (scale_num == 1) {
int64_t num = in->numel();
const T* scale_factor = scales[0]->data<T>();
int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
DequantizeOneScaleQuantAxisN<
T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[quant_axis],
quant_stride, out_data);
} else if (scale_num == 2) {
// No need to consider quant_axis
int num = in->numel();
int iter_size = 1;
for (int i = 0; i < x_num_col_dims; i++) {
iter_size *= in->dims()[i];
}
int channel = in->dims()[x_num_col_dims];
const T* scale_one = scales[0]->data<T>();
const T* scale_two = scales[1]->data<T>();
int block = 1024;
int grid = iter_size * channel;
DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_one, scale_two, max_range, num, iter_size, channel,
out_data);
}
}
};
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
#endif // PADDLE_FLUID_OPERATORS_FAKE_DEQUANTIZE_OP_CU_H_
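For reference, the indexing in DequantizeOneScaleQuantAxisN above maps each flat element index i to its per-channel scale via (i / quant_stride) % n_scales, where quant_stride is the product of the dimensions after quant_axis. A small NumPy sketch of the same arithmetic (illustrative only, not Paddle code):

import numpy as np

def channel_dequantize(q, scales, quant_axis, max_range):
    # quant_stride: product of the dims after quant_axis, as in the CUDA kernel
    quant_stride = int(np.prod(q.shape[quant_axis + 1:], dtype=np.int64))
    n_scales = q.shape[quant_axis]
    flat = q.reshape(-1).astype(np.float32)
    idx = np.arange(flat.size)
    s = scales[(idx // quant_stride) % n_scales]
    return (flat * s / max_range).reshape(q.shape)

q = np.random.randint(-127, 128, size=(4, 3, 2, 2))
scales = np.array([0.5, 1.0, 0.25, 2.0], dtype=np.float32)  # one scale per channel on axis 0
w = channel_dequantize(q, scales, quant_axis=0, max_range=127.0)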
@@ -12,531 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/fake_quantize_op.cu.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/fake_quantize_op.h" #include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
extern __shared__ char* shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
if (gridDim.x > 1) {
T local_max_data = T(0);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T tmp = abs(in[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
} else {
if (bid < n) {
shared_max_data[tid] = abs(in[bid]);
} else {
shared_max_data[tid] = T(0);
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
template <typename T>
struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx, const T* in,
const int num, T* out) {
int block = 1024;
int grid = (block - 1 + num) / block;
grid = (grid > block) ? block : grid;
framework::Tensor max;
T* max_data = max.mutable_data<T>(phi::make_ddim({grid}), ctx.GetPlace());
FindAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
in, num, max_data);
FindAbsMaxKernel<T><<<1, block, 1024 * sizeof(T), ctx.stream()>>>(
max_data, grid, out);
}
};
template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
template struct FindAbsMaxFunctor<platform::CUDADeviceContext,
paddle::platform::float16>;
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
const int c, T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
extern __shared__ T shared_max_data[];
T local_max_data = T(0);
for (int i = tid; i < channel_size; i += blockDim.x) {
T tmp = fabs(in_c[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
const int cin, const int cout,
T* out) {
extern __shared__ T shared_max_data[];
int cout_wh_size = n / cin;
int wh_size = n / (cin * cout);
int tid = threadIdx.x;
int bid = blockIdx.x;
const T* in_current = in + tid * cout_wh_size + bid * wh_size;
T local_max_data = T(0);
for (int i = 0; i < wh_size; i++) {
T tmp = fabs(in_current[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
int len = blockDim.x;
for (int i = (len + 1) / 2; i > 0; len = i, i = (i + 1) / 2) {
if (tid < i && tid + i < len &&
shared_max_data[tid] < shared_max_data[tid + i]) {
shared_max_data[tid] = shared_max_data[tid + i];
}
if (i == 1) {
i = 0; // break the loop
}
__syncthreads();
}
if (tid == 0 && shared_max_data[0] > out[bid]) {
out[bid] = shared_max_data[0];
}
}
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_tensor, const int quant_axis,
T* out_abs_max) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
const int num = in_tensor.numel();
auto in_dims = in_tensor.dims();
const T* in_data = in_tensor.data<T>();
if (quant_axis == 0) {
int cout = in_dims[0];
int grid = cout;
int block = 1024;
FindChannelAbsMaxKernelQuantAxis0<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, cout, out_abs_max);
} else if (quant_axis == 1) {
int cin = in_dims[0];
int cout = in_dims[1];
int grid = cout;
int max_threads = 1024;
#ifdef PADDLE_WITH_HIP
hipMemset(out_abs_max, 0, sizeof(T) * cout);
#else
cudaMemset(out_abs_max, 0, sizeof(T) * cout);
#endif
for (int i = 0; i < cin / max_threads; i++) {
int block = max_threads;
FindChannelAbsMaxKernelQuantAxis1<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, cin, cout, out_abs_max);
in_data += num / cin;
}
int block = cin % max_threads;
if (block > 0) {
FindChannelAbsMaxKernelQuantAxis1<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, in_dims[0], in_dims[1], out_abs_max);
}
}
}
};
template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
__global__ void ClipAndQuantKernel(const T* in, const T* scale,
const int bin_cnt, const int n, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
T s = scale[0];
T inv_s = inverse(s);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T x = in[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out[i] = round(v);
}
}
template <typename T>
__global__ void ClipAndQuantDequantKernel(const T* in, const T* scale,
const int bin_cnt, const int n,
T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
T s = scale[0];
T inv_s = inverse(s);
T bin_cnt_t = static_cast<T>(bin_cnt);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T x = in[i];
x = x > s ? s : x;
x = x < -s ? -s : x;
x = bin_cnt_t * inv_s * x;
x = static_cast<T>(round(static_cast<float>(x)));
out[i] = (x * s) / bin_cnt_t;
}
}
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantDequantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
// ChannelClipAndQuantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
const int bin_cnt,
const int64_t n,
const int c, T* out) {
int tid = threadIdx.x;
int64_t channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int64_t i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v);
}
}
// ChannelClipAndQuantKernel for quant_axis is N
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxisN(
const T* in, const T* scale, const int bin_cnt, const int64_t n,
const int nScale, const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % nScale];
T inv_s = 1.0 / s;
T x = in[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out[i] = round(v);
}
}
template <typename T>
struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int64_t num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], out_data);
} else {
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
int64_t block_size =
std::min(num, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
ChannelClipAndQuantKernelQuantAxisN<T><<<grid_size, block_size>>>(
in_data, scale_data, bin_cnt, num, in_dims[quant_axis], quant_stride,
out_data);
}
}
};
template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
float>;
template <typename T>
__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
const T* last_scale,
const int64_t* iter,
const int window_size, T* scale_arr,
T* out_scale, int* need_find_max,
int* out_size) {
int it = iter[0];
int idx = it % window_size;
T removed = scale_arr[idx];
T cur = cur_scale[0];
scale_arr[idx] = cur;
T max = last_scale[0];
out_scale[0] = max < cur ? cur : max;
if (fabs(removed - max) < 1e-6) {
need_find_max[0] = 1;
out_size[0] = it > window_size ? window_size : it;
} else {
need_find_max[0] = 0;
}
}
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& cur_scale,
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
framework::Tensor need_find_max, out_size;
int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
window_size, scale_arr, out_scale_data, find_max, out_size_data);
int g_find_max;
memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
sizeof(int), ctx.stream());
ctx.Wait();
if (g_find_max) {
int len;
memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
sizeof(int), ctx.stream());
ctx.Wait();
FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
out_scale_data);
}
}
};
template <typename T>
__global__ void FindMovingAverageAbsMaxKernel(const T* in_state,
const T* in_accum,
const T* cur_scale, const T rate,
T* out_state, T* out_accum,
T* out_scale) {
T state = rate * (*in_state) + T(1.0f);
T accum = rate * (*in_accum) + (*cur_scale);
*out_state = state;
*out_accum = accum;
*out_scale = accum / state;
}
template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_accum,
const framework::Tensor& in_state, const T* cur_scale,
const float rate, framework::Tensor* out_state,
framework::Tensor* out_accum, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T rate_t = static_cast<T>(rate);
T* out_state_data = out_state->mutable_data<T>(gpu_place);
T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
out_state_data, out_accum_data, out_scale_data);
}
};
// ChannelClipAndQuantDequantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
const T* in, const T* scale, const int bin_cnt, const int n, const int c,
T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
// ChannelClipAndQuantDequantKernel for quant_axis is 1
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
const int cout, T* out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis0<
T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
num, in_dims[0], out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis1<
T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
float>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_
#define PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
extern __shared__ char* shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
if (gridDim.x > 1) {
T local_max_data = T(0);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T tmp = abs(in[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
} else {
if (bid < n) {
shared_max_data[tid] = abs(in[bid]);
} else {
shared_max_data[tid] = T(0);
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
template <typename T>
struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx, const T* in,
const int num, T* out) {
int block = 1024;
int grid = (block - 1 + num) / block;
grid = (grid > block) ? block : grid;
framework::Tensor max;
T* max_data = max.mutable_data<T>(phi::make_ddim({grid}), ctx.GetPlace());
FindAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
in, num, max_data);
FindAbsMaxKernel<T><<<1, block, 1024 * sizeof(T), ctx.stream()>>>(
max_data, grid, out);
}
};
template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
template struct FindAbsMaxFunctor<platform::CUDADeviceContext,
paddle::platform::float16>;
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
const int c, T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
extern __shared__ T shared_max_data[];
T local_max_data = T(0);
for (int i = tid; i < channel_size; i += blockDim.x) {
T tmp = fabs(in_c[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
const int cin, const int cout,
T* out) {
extern __shared__ T shared_max_data[];
int cout_wh_size = n / cin;
int wh_size = n / (cin * cout);
int tid = threadIdx.x;
int bid = blockIdx.x;
const T* in_current = in + tid * cout_wh_size + bid * wh_size;
T local_max_data = T(0);
for (int i = 0; i < wh_size; i++) {
T tmp = fabs(in_current[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
int len = blockDim.x;
for (int i = (len + 1) / 2; i > 0; len = i, i = (i + 1) / 2) {
if (tid < i && tid + i < len &&
shared_max_data[tid] < shared_max_data[tid + i]) {
shared_max_data[tid] = shared_max_data[tid + i];
}
if (i == 1) {
i = 0; // break the loop
}
__syncthreads();
}
if (tid == 0 && shared_max_data[0] > out[bid]) {
out[bid] = shared_max_data[0];
}
}
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_tensor, const int quant_axis,
T* out_abs_max) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
const int num = in_tensor.numel();
auto in_dims = in_tensor.dims();
const T* in_data = in_tensor.data<T>();
if (quant_axis == 0) {
int cout = in_dims[0];
int grid = cout;
int block = 1024;
FindChannelAbsMaxKernelQuantAxis0<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, cout, out_abs_max);
} else if (quant_axis == 1) {
int cin = in_dims[0];
int cout = in_dims[1];
int grid = cout;
int max_threads = 1024;
#ifdef PADDLE_WITH_HIP
hipMemset(out_abs_max, 0, sizeof(T) * cout);
#else
cudaMemset(out_abs_max, 0, sizeof(T) * cout);
#endif
for (int i = 0; i < cin / max_threads; i++) {
int block = max_threads;
FindChannelAbsMaxKernelQuantAxis1<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, cin, cout, out_abs_max);
in_data += num / cin;
}
int block = cin % max_threads;
if (block > 0) {
FindChannelAbsMaxKernelQuantAxis1<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, in_dims[0], in_dims[1], out_abs_max);
}
}
}
};
template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
__global__ void ClipAndQuantKernel(const T* in, const T* scale,
const int bin_cnt, const int n, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
T s = scale[0];
T inv_s = inverse(s);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T x = in[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out[i] = round(v);
}
}
template <typename T>
__global__ void ClipAndQuantDequantKernel(const T* in, const T* scale,
const int bin_cnt, const int n,
T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
T s = scale[0];
T inv_s = inverse(s);
T bin_cnt_t = static_cast<T>(bin_cnt);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T x = in[i];
x = x > s ? s : x;
x = x < -s ? -s : x;
x = bin_cnt_t * inv_s * x;
x = static_cast<T>(round(static_cast<float>(x)));
out[i] = (x * s) / bin_cnt_t;
}
}
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantDequantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
// ChannelClipAndQuantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
const int bin_cnt,
const int64_t n,
const int c, T* out) {
int tid = threadIdx.x;
int64_t channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int64_t i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v);
}
}
// ChannelClipAndQuantKernel for quant_axis is N
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxisN(
const T* in, const T* scale, const int bin_cnt, const int64_t n,
const int nScale, const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % nScale];
T inv_s = 1.0 / s;
T x = in[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out[i] = round(v);
}
}
template <typename T>
struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int64_t num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], out_data);
} else {
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
int64_t block_size =
std::min(num, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
ChannelClipAndQuantKernelQuantAxisN<T><<<grid_size, block_size, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[quant_axis], quant_stride,
out_data);
}
}
};
template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
float>;
template <typename T>
__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
const T* last_scale,
const int64_t* iter,
const int window_size, T* scale_arr,
T* out_scale, int* need_find_max,
int* out_size) {
int it = iter[0];
int idx = it % window_size;
T removed = scale_arr[idx];
T cur = cur_scale[0];
scale_arr[idx] = cur;
T max = last_scale[0];
out_scale[0] = max < cur ? cur : max;
if (fabs(removed - max) < 1e-6) {
need_find_max[0] = 1;
out_size[0] = it > window_size ? window_size : it;
} else {
need_find_max[0] = 0;
}
}
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& cur_scale,
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
framework::Tensor need_find_max, out_size;
int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
window_size, scale_arr, out_scale_data, find_max, out_size_data);
int g_find_max;
memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
sizeof(int), ctx.stream());
ctx.Wait();
if (g_find_max) {
int len;
memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
sizeof(int), ctx.stream());
ctx.Wait();
FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
out_scale_data);
}
}
};
template <typename T>
__global__ void FindMovingAverageAbsMaxKernel(const T* in_state,
const T* in_accum,
const T* cur_scale, const T rate,
T* out_state, T* out_accum,
T* out_scale) {
T state = rate * (*in_state) + T(1.0f);
T accum = rate * (*in_accum) + (*cur_scale);
*out_state = state;
*out_accum = accum;
*out_scale = accum / state;
}
template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_accum,
const framework::Tensor& in_state, const T* cur_scale,
const float rate, framework::Tensor* out_state,
framework::Tensor* out_accum, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T rate_t = static_cast<T>(rate);
T* out_state_data = out_state->mutable_data<T>(gpu_place);
T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
out_state_data, out_accum_data, out_scale_data);
}
};
// ChannelClipAndQuantDequantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
const T* in, const T* scale, const int bin_cnt, const int n, const int c,
T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
// ChannelClipAndQuantDequantKernel for quant_axis is 1
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
const int cout, T* out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d,
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis0<
T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
num, in_dims[0], out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis1<
T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
float>;
} // namespace operators
} // namespace paddle
#endif // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_
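As a side note, FindMovingAverageAbsMaxKernel above maintains a decayed running estimate of the abs-max scale: state counts decayed updates, accum sums decayed abs-max values, and their ratio is the reported scale. A plain-Python sketch of that recurrence (assuming a decay rate such as 0.9):

def moving_average_abs_max(abs_max_per_batch, rate=0.9):
    # state = rate * state + 1, accum = rate * accum + cur_abs_max,
    # scale = accum / state, mirroring FindMovingAverageAbsMaxKernel.
    state, accum, scale = 0.0, 0.0, 0.0
    for cur in abs_max_per_batch:
        state = rate * state + 1.0
        accum = rate * accum + cur
        scale = accum / state
    return scale

print(moving_average_abs_max([1.0, 0.8, 1.2, 0.9]))  # smoothed abs-max estimate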
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/quantize_linear_op.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/transform.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/impl/clip_kernel_impl.h"
namespace paddle {
namespace operators {
template <typename T>
struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, const int quant_axis, framework::Tensor* out) {
// The dequant op runs before the quantized op;
// here we dequantize the weight of the quantized op.
auto in_dims = in->dims();
const int64_t channel = in_dims[quant_axis];
const T* scale_factor = scale->data<T>();
if (quant_axis == 0) {
for (int64_t i = 0; i < channel; i++) {
T s = scale_factor[i];
framework::Tensor one_channel_in = in->Slice(i, i + 1);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
auto& dev = *dev_ctx.eigen_device();
out_e.device(dev) = in_e * s / max_range;
}
} else if (quant_axis == 1) {
int64_t out_iter = 1;
for (int i = 0; i < quant_axis; i++) {
out_iter *= in_dims[i];
}
int64_t step_i = in->numel() / out_iter;
int64_t step_j = in->numel() / (out_iter * channel);
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
for (int64_t i = 0; i < out_iter; i++) {
for (int64_t j = 0; j < channel; j++) {
auto* cur_in = in_data + i * step_i + j * step_j;
auto* cur_out = out_data + i * step_i + j * step_j;
T s = scale_factor[j];
for (int64_t k = 0; k < step_j; k++) {
*cur_out = (*cur_in) * s / max_range;
++cur_in;
++cur_out;
}
}
}
}
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
template struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, float>;
template struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, double>;
class QuantizeLinearOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasInput("ZeroPoint"), "Input", "ZeroPoint",
"QuantizeLinear");
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "QuantizeLinear");
ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
if (ctx->HasOutput("OutScale")) {
if (quant_axis < 0) {
ctx->SetOutputDim("OutScale", {1});
} else {
ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
}
}
ctx->ShareLoD("X", /*->*/ "Y");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddInput("Scale", "(Tensor) Input is float data type.");
AddInput("ZeroPoint", "(Tensor) Input is float data type.");
AddOutput("Y",
"(Tensor) Output of quantized low level tensor, "
"but also saved as float data type.");
AddOutput("OutScale", "(Tensor) Current scale").AsDispensable().AsExtra();
AddAttr<int>("quant_axis",
"(int, default 0) The axis for quantization. "
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1 || quant_axis == -1, true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
});
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
bit_length));
});
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(true);
AddComment(R"DOC(
The scale of the QuantizeLinear operator is a vector.
In detail, each channel of the input X has a scale value.
$$scale_c = max(abs(X_c))$$
$$range = 2^{bit\_length - 1} - 1$$
$$Out_c = round(\frac{X_c * range} {scale_c})$$
In the above three formulas, the range of c is as follows:
$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
quantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(quantize_linear, ops::QuantizeLinearKernel<CPU, float>);
REGISTER_OPERATOR(
dequantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(dequantize_linear,
ops::DeQuantizeLinearKernel<CPU, float, float>,
ops::DeQuantizeLinearKernel<CPU, int8_t, float>,
ops::DeQuantizeLinearKernel<CPU, double, double>);
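To make the three formulas in the QuantizeLinear comment above concrete, here is a worked per-channel example (bit_length = 8, quant_axis = 0); it is a NumPy sketch, not part of the operator code:

import numpy as np

X = np.array([[0.4, -0.8],
              [2.0, 1.0]], dtype=np.float32)   # two channels along axis 0
scale_c = np.abs(X).max(axis=1)                # [0.8, 2.0]
quant_range = 2 ** (8 - 1) - 1                 # 127
Out = np.round(X * quant_range / scale_c[:, None])
# Out == [[ 64., -127.], [127.,  64.]]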
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/fake_dequantize_op.cu.h"
#include "paddle/fluid/operators/fake_quantize_op.cu.h"
#include "paddle/fluid/operators/quantize_linear_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
struct ChannelDequantizeFunctorV2<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, const int quant_axis, framework::Tensor* out) {
auto in_dims = in->dims();
const T* in_data = in->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int64_t num = in->numel();
const T* scale_factor = scale->data<T>();
int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks =
std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
DequantizeOneScaleQuantAxisN<
T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[quant_axis],
quant_stride, out_data);
}
};
template struct ChannelDequantizeFunctorV2<platform::CUDADeviceContext, float>;
template struct ChannelDequantizeFunctorV2<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(dequantize_linear,
ops::DeQuantizeLinearKernel<CUDA, float, float>,
ops::DeQuantizeLinearKernel<CUDA, int8_t, float>,
ops::DeQuantizeLinearKernel<CUDA, double, double>);
REGISTER_OP_CUDA_KERNEL(quantize_linear,
ops::QuantizeLinearKernel<CUDA, float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/fake_dequantize_op.h"
#include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/transform.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/cast_kernel.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
struct ChannelDequantizeFunctorV2 {
void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
const framework::Tensor* scale, T max_range, const int quant_axis,
framework::Tensor* out);
};
template <typename DeviceContext, typename T>
class QuantizeLinearKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* in_scale = context.Input<framework::Tensor>("Scale");
auto* out = context.Output<framework::Tensor>("Y");
out->mutable_data<T>(context.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
int quant_axis = context.Attr<int>("quant_axis");
bool is_test = context.Attr<bool>("is_test");
auto& dev_ctx = context.template device_context<DeviceContext>();
if (quant_axis < 0) {
if (!is_test) {
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_s = out_scale->mutable_data<T>(context.GetPlace());
FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(),
in->numel(), out_s);
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
bin_cnt, out);
} else {
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
bin_cnt, out);
}
} else {
if (!is_test) {
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
out_scale_data);
ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
} else {
ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *in_scale, bin_cnt, quant_axis, out);
}
}
}
};
template <typename DeviceContext, typename T, typename D>
class DeQuantizeLinearKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto& dev_ctx = context.template device_context<DeviceContext>();
auto* in = context.Input<framework::Tensor>("X");
auto in_tmp = phi::Cast<T>(
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*in, experimental::CppTypeToDataType<D>::Type());
auto* scale = context.Input<framework::Tensor>("Scale");
auto* out = context.Output<framework::Tensor>("Y");
int bit_length = context.Attr<int>("bit_length");
auto quant_axis = context.Attr<int>("quant_axis");
out->mutable_data<D>(dev_ctx.GetPlace());
if (quant_axis < 0) {
float max_range = (std::pow(2, bit_length - 1) - 1);
DequantizeFunctor<DeviceContext, D>()(dev_ctx, &in_tmp, scale,
static_cast<D>(max_range), out);
} else {
PADDLE_ENFORCE_EQ(
scale->numel(), in_tmp.dims()[quant_axis],
platform::errors::PreconditionNotMet(
"The number of first scale values must be the same with "
"quant_axis dimension value of Input(X) when the `scale` has "
"only one element, but %ld != %ld here.",
scale->numel(), in_tmp.dims()[quant_axis]));
int max_range = (std::pow(2, bit_length - 1) - 1);
ChannelDequantizeFunctorV2<DeviceContext, D>()(
dev_ctx, &in_tmp, scale, static_cast<D>(max_range), quant_axis, out);
}
}
};
} // namespace operators
} // namespace paddle
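A compact sketch of the control flow in QuantizeLinearKernel above for the per-tensor path (quant_axis < 0): during training (is_test == False) the scale is recomputed from the current tensor by abs-max and also returned as OutScale, while inference reuses the provided Scale input. NumPy pseudocode under those assumptions, not the Paddle API:

import numpy as np

def quantize_linear_kernel(x, in_scale, bin_cnt, is_test):
    # Training recomputes the scale from the tensor; inference reuses Scale.
    scale = in_scale if is_test else np.abs(x).max()
    q = np.round(np.clip(x, -scale, scale) * bin_cnt / scale)
    return q, scale

x = np.random.randn(8).astype(np.float32)
q_train, out_scale = quantize_linear_kernel(x, None, bin_cnt=127, is_test=False)
q_infer, _ = quantize_linear_kernel(x, out_scale, bin_cnt=127, is_test=True)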
@@ -41,6 +41,7 @@ PD_REGISTER_KERNEL(cast,
int64_t,
int16_t,
bool,
int8_t,
uint8_t,
phi::dtype::float16,
phi::dtype::bfloat16,
......
@@ -41,6 +41,7 @@ void CastKernel(const Context& dev_ctx,
int64_t, \
int16_t, \
bool, \
int8_t, \
uint8_t, \ uint8_t, \
phi::dtype::float16, \ phi::dtype::float16, \
phi::dtype::complex<float>, \ phi::dtype::complex<float>, \
......
...@@ -28,6 +28,7 @@ from paddle.fluid.param_attr import ParamAttr ...@@ -28,6 +28,7 @@ from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Constant from paddle.fluid.initializer import Constant
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
from paddle.fluid.io import load_inference_model, save_inference_model from paddle.fluid.io import load_inference_model, save_inference_model
from ..quantization_pass import ReplaceFakeQuantDequantPass, QuantWeightPass
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from .. import quantization_pass from .. import quantization_pass
from . import utils from . import utils
...@@ -431,7 +432,12 @@ class ImperativeQuantizeOutputs(object): ...@@ -431,7 +432,12 @@ class ImperativeQuantizeOutputs(object):
setattr(parent_layer, sub_name, cur_quant_layer) setattr(parent_layer, sub_name, cur_quant_layer)
def save_quantized_model(self, model, path, input_spec=None, **config): def save_quantized_model(self,
model,
path,
input_spec=None,
onnx_format=False,
**config):
""" """
Save the quantized model for the inference. Save the quantized model for the inference.
...@@ -444,6 +450,8 @@ class ImperativeQuantizeOutputs(object): ...@@ -444,6 +450,8 @@ class ImperativeQuantizeOutputs(object):
InputSpec or example Tensor. If None, all input variables of InputSpec or example Tensor. If None, all input variables of
the original Layer's forward method would be the inputs of the original Layer's forward method would be the inputs of
the saved model. Default None. the saved model. Default None.
onnx_format (bool, optional): Whether to export the quantized model
using the ONNX-style quantize_linear/dequantize_linear format.
Default is False.
**configs (dict, optional): Other save configuration options for **configs (dict, optional): Other save configuration options for
compatibility. We do not recommend using these configurations, compatibility. We do not recommend using these configurations,
they may be removed in the future. If not necessary, DO NOT use they may be removed in the future. If not necessary, DO NOT use
...@@ -498,6 +506,18 @@ class ImperativeQuantizeOutputs(object): ...@@ -498,6 +506,18 @@ class ImperativeQuantizeOutputs(object):
self._set_skip_quant_attr(infer_program) self._set_skip_quant_attr(infer_program)
clip_extra = False
if onnx_format:
graph = IrGraph(core.Graph(infer_program.desc), for_test=False)
transform_pass = ReplaceFakeQuantDequantPass(scope, place)
transform_pass.apply(graph)
quant_weight_pass = QuantWeightPass(scope, place)
quant_weight_pass.apply(graph)
infer_program = graph.to_program()
clip_extra = True
save_inference_model( save_inference_model(
dirname=dirname, dirname=dirname,
feeded_var_names=feed_target_names, feeded_var_names=feed_target_names,
...@@ -506,7 +526,7 @@ class ImperativeQuantizeOutputs(object): ...@@ -506,7 +526,7 @@ class ImperativeQuantizeOutputs(object):
main_program=infer_program.clone(), main_program=infer_program.clone(),
model_filename=model_filename, model_filename=model_filename,
params_filename=params_filename, params_filename=params_filename,
clip_extra=False) clip_extra=clip_extra)
if is_dynamic_mode: if is_dynamic_mode:
paddle.disable_static() paddle.disable_static()
......
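
A hypothetical usage sketch of the new flag (not part of this patch): it assumes the public ImperativeQuantAware wrapper forwards onnx_format through to save_quantized_model, and the model, paths and input spec below are made up for illustration.

import paddle
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware

class MyNet(paddle.nn.Layer):          # stand-in model for illustration
    def __init__(self):
        super().__init__()
        self.conv = paddle.nn.Conv2D(3, 8, 3)
    def forward(self, x):
        return self.conv(x)

quanter = ImperativeQuantAware()
model = MyNet()
quanter.quantize(model)
# ... quantization-aware training loop ...
quanter.save_quantized_model(
    model,
    path='./quant_model/int8',         # hypothetical output path
    input_spec=[paddle.static.InputSpec(shape=[None, 3, 224, 224], dtype='float32')],
    onnx_format=True)                  # new in this patch: export quantize_linear/dequantize_linear ops
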
...@@ -18,10 +18,7 @@ import numpy as np ...@@ -18,10 +18,7 @@ import numpy as np
import paddle import paddle
import paddle.nn.quant.quant_layers as quant_layers import paddle.nn.quant.quant_layers as quant_layers
from ..quantization_pass import _get_op_input_var_names from ..utils import _get_op_input_var_names, _get_op_output_var_names, _get_output_name_index, _get_input_name_index
from ..quantization_pass import _get_op_output_var_names
from ..quantization_pass import _get_output_name_index
from ..quantization_pass import _get_input_name_index
layer_name_map = { layer_name_map = {
'Conv2DTranspose': paddle.nn.Conv2DTranspose, 'Conv2DTranspose': paddle.nn.Conv2DTranspose,
......
...@@ -25,18 +25,10 @@ from .... import unique_name ...@@ -25,18 +25,10 @@ from .... import unique_name
from ....executor import global_scope, Executor from ....executor import global_scope, Executor
from ....framework import IrGraph from ....framework import IrGraph
from ....log_helper import get_logger from ....log_helper import get_logger
from .quantization_pass import QuantizationTransformPass from .quantization_pass import QuantizationTransformPass, QuantizationTransformPassV2, QuantizationFreezePass, QuantWeightPass, AddQuantDequantPass, AddQuantDequantPassV2
from .quantization_pass import QuantizationFreezePass
from .quantization_pass import AddQuantDequantPass
from .quantization_pass import _out_scale_op_list
from .quantization_pass import _get_op_input_var_names
from .quantization_pass import _get_op_output_var_names
from .quantization_pass import _get_output_name_index
from .quantization_pass import _get_input_name_index
from .quantization_pass import _channelwise_quant_axis1_ops
from .cal_kl_threshold import cal_kl_threshold from .cal_kl_threshold import cal_kl_threshold
from .adaround import run_adaround from .adaround import run_adaround
from .utils import load_variable_data, set_variable_data from . import utils
__all__ = ['PostTrainingQuantization', 'WeightQuantization'] __all__ = ['PostTrainingQuantization', 'WeightQuantization']
...@@ -131,6 +123,7 @@ class PostTrainingQuantization(object): ...@@ -131,6 +123,7 @@ class PostTrainingQuantization(object):
weight_bits=8, weight_bits=8,
activation_quantize_type='range_abs_max', activation_quantize_type='range_abs_max',
weight_quantize_type='channel_wise_abs_max', weight_quantize_type='channel_wise_abs_max',
onnx_format=False,
optimize_model=False, optimize_model=False,
is_use_cache_file=False, is_use_cache_file=False,
cache_dir=None): cache_dir=None):
...@@ -203,6 +196,8 @@ class PostTrainingQuantization(object): ...@@ -203,6 +196,8 @@ class PostTrainingQuantization(object):
the fake ops in saving quantized model, and we save the scale obtained the fake ops in saving quantized model, and we save the scale obtained
by post training quantization in fake ops. Compared to 'abs_max', by post training quantization in fake ops. Compared to 'abs_max',
the model accuracy is usually higher when it is 'channel_wise_abs_max'. the model accuracy is usually higher when it is 'channel_wise_abs_max'.
onnx_format(bool): Whether to export the quantized model using the
ONNX-style quantize_linear/dequantize_linear format. Default is False.
optimize_model(bool, optional): If set optimize_model as True, it applies optimize_model(bool, optional): If set optimize_model as True, it applies
some passes to the model before quantization, and it supports some passes to the model before quantization, and it supports
`conv2d/depthwise_conv2d + bn` pass so far. Some targets require the `conv2d/depthwise_conv2d + bn` pass so far. Some targets require the
...@@ -265,8 +260,8 @@ class PostTrainingQuantization(object): ...@@ -265,8 +260,8 @@ class PostTrainingQuantization(object):
self._learning_rate = learning_rate self._learning_rate = learning_rate
self._dynamic_quantize_op_type = ['lstm'] self._dynamic_quantize_op_type = ['lstm']
self._support_quantize_op_type = \ self._support_quantize_op_type = \
list(set(QuantizationTransformPass._supported_quantizable_op_type + list(set(utils._weight_supported_quantizable_op_type +
AddQuantDequantPass._supported_quantizable_op_type + utils._act_supported_quantizable_op_type +
self._dynamic_quantize_op_type)) self._dynamic_quantize_op_type))
# Check inputs # Check inputs
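# Check inputs # Check inputs

A hypothetical post-training quantization call using the new flag (illustration only; the executor, model directory and calibration reader are assumptions, not part of this patch).

import numpy as np
import paddle
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

def calib_reader():                       # assumption: yields single calibration samples
    for _ in range(32):
        yield [np.random.random((3, 224, 224)).astype('float32')]

ptq = PostTrainingQuantization(
    executor=exe,
    model_dir='./fp32_inference_model',   # assumption: a previously saved float inference model
    sample_generator=calib_reader,
    batch_nums=10,
    algo='KL',
    weight_quantize_type='channel_wise_abs_max',
    onnx_format=True)                     # new in this patch
ptq.quantize()
ptq.save_quantized_model('./int8_inference_model')   # saved with clip_extra=True when onnx_format is on
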
...@@ -305,6 +300,7 @@ class PostTrainingQuantization(object): ...@@ -305,6 +300,7 @@ class PostTrainingQuantization(object):
self._weight_bits = weight_bits self._weight_bits = weight_bits
self._activation_quantize_type = activation_quantize_type self._activation_quantize_type = activation_quantize_type
self._weight_quantize_type = weight_quantize_type self._weight_quantize_type = weight_quantize_type
self._onnx_format = onnx_format
self._is_full_quantize = is_full_quantize self._is_full_quantize = is_full_quantize
if is_full_quantize: if is_full_quantize:
self._quantizable_op_type = self._support_quantize_op_type self._quantizable_op_type = self._support_quantize_op_type
...@@ -322,7 +318,7 @@ class PostTrainingQuantization(object): ...@@ -322,7 +318,7 @@ class PostTrainingQuantization(object):
self._fetch_list = None self._fetch_list = None
self._data_loader = data_loader self._data_loader = data_loader
self._out_scale_op_list = _out_scale_op_list self._out_scale_op_list = utils._out_scale_op_list
self._quantized_weight_var_name = set() self._quantized_weight_var_name = set()
self._quantized_act_var_name = set() self._quantized_act_var_name = set()
self._weight_op_pairs = {} self._weight_op_pairs = {}
...@@ -391,22 +387,27 @@ class PostTrainingQuantization(object): ...@@ -391,22 +387,27 @@ class PostTrainingQuantization(object):
break break
_logger.info("Finish sampling stage, all batch: " + str(batch_id)) _logger.info("Finish sampling stage, all batch: " + str(batch_id))
if self._round_type == 'adaround':
self._adaround_apply()
self._reset_activation_persistable()
if self._algo == 'avg': if self._algo == 'avg':
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
self._quantized_threshold[var_name] = \ self._quantized_threshold[var_name] = \
np.array(self._quantized_var_avg[var_name]).mean() np.array(self._quantized_var_avg[var_name]).mean()
if self._algo in ["KL", "hist"]: if self._algo in ["KL", "hist"]:
self._calculate_kl_hist_threshold() self._calculate_kl_hist_threshold()
if self._algo in ["KL", "abs_max", "hist", "avg", "mse", "emd"]:
self._update_program() if self._round_type == 'adaround':
else: self._adaround_apply()
self._reset_activation_persistable()
if self._algo == 'min_max':
self._save_input_threhold() self._save_input_threhold()
else:
self._update_program()
# save out_threshold for quantized ops.
if not self._onnx_format:
self._save_output_threshold()
self._save_output_threshold()
if any(op_type in self._quantizable_op_type if any(op_type in self._quantizable_op_type
for op_type in self._dynamic_quantize_op_type): for op_type in self._dynamic_quantize_op_type):
self._collect_dynamic_quantize_op_threshold( self._collect_dynamic_quantize_op_threshold(
...@@ -431,6 +432,7 @@ class PostTrainingQuantization(object): ...@@ -431,6 +432,7 @@ class PostTrainingQuantization(object):
return self._program return self._program
def _adaround_apply(self): def _adaround_apply(self):
assert self._algo != "min_max", "The algo should not be min_max."
if self._algo in ["KL", "hist"]: if self._algo in ["KL", "hist"]:
scale_dict = self._quantized_var_threshold scale_dict = self._quantized_var_threshold
else: else:
...@@ -466,6 +468,7 @@ class PostTrainingQuantization(object): ...@@ -466,6 +468,7 @@ class PostTrainingQuantization(object):
Returns: Returns:
None None
''' '''
clip_extra = True if self._onnx_format else False
io.save_inference_model( io.save_inference_model(
dirname=save_model_path, dirname=save_model_path,
model_filename=model_filename, model_filename=model_filename,
...@@ -473,7 +476,8 @@ class PostTrainingQuantization(object): ...@@ -473,7 +476,8 @@ class PostTrainingQuantization(object):
feeded_var_names=self._feed_list, feeded_var_names=self._feed_list,
target_vars=self._fetch_list, target_vars=self._fetch_list,
executor=self._executor, executor=self._executor,
main_program=self._program) main_program=self._program,
clip_extra=clip_extra)
_logger.info("The quantized model is saved in " + save_model_path) _logger.info("The quantized model is saved in " + save_model_path)
def _load_model_data(self): def _load_model_data(self):
...@@ -551,22 +555,22 @@ class PostTrainingQuantization(object): ...@@ -551,22 +555,22 @@ class PostTrainingQuantization(object):
# For quantized ops, sample inputs and outputs # For quantized ops, sample inputs and outputs
if op_type in self._quantizable_op_type: if op_type in self._quantizable_op_type:
collect_var_name( collect_var_name(
_get_op_input_var_names(op), persistable_var_names, utils._get_op_input_var_names(op),
op_type) persistable_var_names, op_type)
collect_var_name( collect_var_name(
_get_op_output_var_names(op), persistable_var_names, utils._get_op_output_var_names(op),
op_type) persistable_var_names, op_type)
# collect quanted op output var name # collect quanted op output var name
for out_var_name in _get_op_output_var_names(op): for out_var_name in utils._get_op_output_var_names(op):
for in_var_name in _get_op_input_var_names(op): for in_var_name in utils._get_op_input_var_names(op):
if in_var_name in persistable_var_names: if in_var_name in persistable_var_names:
self._quantized_op_pairs[ self._quantized_op_pairs[
in_var_name] = out_var_name in_var_name] = out_var_name
# For other op, only sample output scale # For other op, only sample output scale
elif op_type in self._out_scale_op_list: elif op_type in self._out_scale_op_list:
collect_var_name( collect_var_name(
_get_op_output_var_names(op), persistable_var_names, utils._get_op_output_var_names(op),
op_type) persistable_var_names, op_type)
def _set_activation_persistable(self): def _set_activation_persistable(self):
''' '''
...@@ -608,13 +612,13 @@ class PostTrainingQuantization(object): ...@@ -608,13 +612,13 @@ class PostTrainingQuantization(object):
def _sample_mse(self): def _sample_mse(self):
if self._quantized_threshold == {}: if self._quantized_threshold == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
abs_max_value = [] abs_max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
abs_max_value.append( abs_max_value.append(
float(np.max(np.abs(var_tensor[:, i])))) float(np.max(np.abs(var_tensor[:, i]))))
...@@ -625,7 +629,7 @@ class PostTrainingQuantization(object): ...@@ -625,7 +629,7 @@ class PostTrainingQuantization(object):
self._quantized_threshold[var_name] = abs_max_value self._quantized_threshold[var_name] = abs_max_value
_logger.info("MSE searching stage ...") _logger.info("MSE searching stage ...")
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor = var_tensor.flatten() var_tensor = var_tensor.flatten()
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value
...@@ -647,13 +651,13 @@ class PostTrainingQuantization(object): ...@@ -647,13 +651,13 @@ class PostTrainingQuantization(object):
def _sample_emd(self): def _sample_emd(self):
if self._quantized_threshold == {}: if self._quantized_threshold == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
abs_max_value = [] abs_max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
abs_max_value.append( abs_max_value.append(
float(np.max(np.abs(var_tensor[:, i])))) float(np.max(np.abs(var_tensor[:, i]))))
...@@ -664,7 +668,7 @@ class PostTrainingQuantization(object): ...@@ -664,7 +668,7 @@ class PostTrainingQuantization(object):
self._quantized_threshold[var_name] = abs_max_value self._quantized_threshold[var_name] = abs_max_value
_logger.info("EMD searching stage ...") _logger.info("EMD searching stage ...")
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor = var_tensor.flatten() var_tensor = var_tensor.flatten()
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value
...@@ -688,13 +692,13 @@ class PostTrainingQuantization(object): ...@@ -688,13 +692,13 @@ class PostTrainingQuantization(object):
def _sample_avg(self): def _sample_avg(self):
if self._quantized_threshold == {}: if self._quantized_threshold == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
abs_max_value = [] abs_max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
abs_max_value.append( abs_max_value.append(
float(np.max(np.abs(var_tensor[:, i])))) float(np.max(np.abs(var_tensor[:, i]))))
...@@ -705,7 +709,7 @@ class PostTrainingQuantization(object): ...@@ -705,7 +709,7 @@ class PostTrainingQuantization(object):
self._quantized_threshold[var_name] = abs_max_value self._quantized_threshold[var_name] = abs_max_value
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
if (var_name not in self._quantized_var_avg): if (var_name not in self._quantized_var_avg):
self._quantized_var_avg[var_name] = [] self._quantized_var_avg[var_name] = []
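
The abs_max thresholds sampled above reduce to a per-tensor or per-channel maximum of the absolute values; a small NumPy illustration with a made-up tensor (not part of this patch):

import numpy as np

weight = np.random.randn(32, 16, 3, 3).astype(np.float32)   # e.g. a conv2d filter

per_tensor = float(np.max(np.abs(weight)))                  # 'abs_max'
per_channel_axis0 = [float(np.max(np.abs(weight[i]))) for i in range(weight.shape[0])]
# ops in utils._channelwise_quant_axis1_ops (e.g. mul/matmul) quantize along axis 1:
per_channel_axis1 = [float(np.max(np.abs(weight[:, i]))) for i in range(weight.shape[1])]
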
...@@ -717,13 +721,13 @@ class PostTrainingQuantization(object): ...@@ -717,13 +721,13 @@ class PostTrainingQuantization(object):
def _sample_abs_max(self): def _sample_abs_max(self):
if self._quantized_threshold == {}: if self._quantized_threshold == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
abs_max_value = [] abs_max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
abs_max_value.append( abs_max_value.append(
float(np.max(np.abs(var_tensor[:, i])))) float(np.max(np.abs(var_tensor[:, i]))))
...@@ -734,7 +738,7 @@ class PostTrainingQuantization(object): ...@@ -734,7 +738,7 @@ class PostTrainingQuantization(object):
self._quantized_threshold[var_name] = abs_max_value self._quantized_threshold[var_name] = abs_max_value
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
if (var_name not in self._quantized_threshold) or \ if (var_name not in self._quantized_threshold) or \
(abs_max_value > self._quantized_threshold[var_name]): (abs_max_value > self._quantized_threshold[var_name]):
...@@ -743,7 +747,7 @@ class PostTrainingQuantization(object): ...@@ -743,7 +747,7 @@ class PostTrainingQuantization(object):
def _sample_min_max(self): def _sample_min_max(self):
if self._quantized_var_min == {} and self._quantized_var_max == {}: if self._quantized_var_min == {} and self._quantized_var_max == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
min_value = float(np.min(var_tensor)) min_value = float(np.min(var_tensor))
max_value = float(np.max(var_tensor)) max_value = float(np.max(var_tensor))
...@@ -751,7 +755,7 @@ class PostTrainingQuantization(object): ...@@ -751,7 +755,7 @@ class PostTrainingQuantization(object):
min_value = [] min_value = []
max_value = [] max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
min_value.append(float(np.min(var_tensor[:, i]))) min_value.append(float(np.min(var_tensor[:, i])))
max_value.append(float(np.max(var_tensor[:, i]))) max_value.append(float(np.max(var_tensor[:, i])))
...@@ -763,7 +767,7 @@ class PostTrainingQuantization(object): ...@@ -763,7 +767,7 @@ class PostTrainingQuantization(object):
self._quantized_var_max[var_name] = max_value self._quantized_var_max[var_name] = max_value
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
min_value = float(np.min(var_tensor)) min_value = float(np.min(var_tensor))
max_value = float(np.max(var_tensor)) max_value = float(np.max(var_tensor))
if (var_name not in self._quantized_var_min) or \ if (var_name not in self._quantized_var_min) or \
...@@ -775,7 +779,7 @@ class PostTrainingQuantization(object): ...@@ -775,7 +779,7 @@ class PostTrainingQuantization(object):
def _sample_histogram(self): def _sample_histogram(self):
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor_abs = np.abs(var_tensor) var_tensor_abs = np.abs(var_tensor)
bins = self._sampling_act_histogram[var_name][1] bins = self._sampling_act_histogram[var_name][1]
hist, _ = np.histogram(var_tensor_abs, bins=bins) hist, _ = np.histogram(var_tensor_abs, bins=bins)
...@@ -790,7 +794,7 @@ class PostTrainingQuantization(object): ...@@ -790,7 +794,7 @@ class PostTrainingQuantization(object):
for block_id in range(len(self._program.blocks)): for block_id in range(len(self._program.blocks)):
for op in self._program.blocks[block_id].ops: for op in self._program.blocks[block_id].ops:
if op.type in self._quantizable_op_type: if op.type in self._quantizable_op_type:
for var_name in _get_op_input_var_names(op): for var_name in utils._get_op_input_var_names(op):
assert var_name in self._quantized_var_min assert var_name in self._quantized_var_min
assert var_name in self._quantized_var_max assert var_name in self._quantized_var_max
op._set_attr(var_name + ".min", op._set_attr(var_name + ".min",
...@@ -805,7 +809,7 @@ class PostTrainingQuantization(object): ...@@ -805,7 +809,7 @@ class PostTrainingQuantization(object):
get the min and max value, and then calculate the threshold. get the min and max value, and then calculate the threshold.
''' '''
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor = np.abs(var_tensor) var_tensor = np.abs(var_tensor)
min_value = float(np.min(var_tensor)) min_value = float(np.min(var_tensor))
max_value = float(np.max(var_tensor)) max_value = float(np.max(var_tensor))
...@@ -839,13 +843,13 @@ class PostTrainingQuantization(object): ...@@ -839,13 +843,13 @@ class PostTrainingQuantization(object):
# Abs_max threshold for weights # Abs_max threshold for weights
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
weight_data = load_variable_data(self._scope, var_name) weight_data = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
weight_threshold = float(np.max(np.abs(weight_data))) weight_threshold = float(np.max(np.abs(weight_data)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
weight_threshold = [] weight_threshold = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(weight_data.shape[1]): for i in range(weight_data.shape[1]):
weight_threshold.append( weight_threshold.append(
float(np.max(np.abs(weight_data[:, i])))) float(np.max(np.abs(weight_data[:, i]))))
...@@ -876,17 +880,27 @@ class PostTrainingQuantization(object): ...@@ -876,17 +880,27 @@ class PostTrainingQuantization(object):
# use QuantizationTransformPass to insert fake_quant/fake_dequantize op # use QuantizationTransformPass to insert fake_quant/fake_dequantize op
major_quantizable_op_types = [] major_quantizable_op_types = []
for op_type in QuantizationTransformPass._supported_quantizable_op_type: for op_type in utils._weight_supported_quantizable_op_type:
if op_type in self._quantizable_op_type: if op_type in self._quantizable_op_type:
major_quantizable_op_types.append(op_type) major_quantizable_op_types.append(op_type)
transform_pass = QuantizationTransformPass( if not self._onnx_format:
scope=self._scope, transform_pass = QuantizationTransformPass(
place=self._place, scope=self._scope,
weight_bits=self._weight_bits, place=self._place,
activation_bits=self._activation_bits, weight_bits=self._weight_bits,
activation_quantize_type=self._activation_quantize_type, activation_bits=self._activation_bits,
weight_quantize_type=self._weight_quantize_type, activation_quantize_type=self._activation_quantize_type,
quantizable_op_type=major_quantizable_op_types) weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
else:
transform_pass = QuantizationTransformPassV2(
scope=self._scope,
place=self._place,
weight_bits=self._weight_bits,
activation_bits=self._activation_bits,
activation_quantize_type=self._activation_quantize_type,
weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
for sub_graph in graph.all_sub_graphs(): for sub_graph in graph.all_sub_graphs():
# Insert fake_quant/fake_dequantize op must in test graph, so # Insert fake_quant/fake_dequantize op must in test graph, so
...@@ -896,13 +910,20 @@ class PostTrainingQuantization(object): ...@@ -896,13 +910,20 @@ class PostTrainingQuantization(object):
# use AddQuantDequantPass to insert fake_quant_dequant op # use AddQuantDequantPass to insert fake_quant_dequant op
minor_quantizable_op_types = [] minor_quantizable_op_types = []
for op_type in AddQuantDequantPass._supported_quantizable_op_type: for op_type in utils._act_supported_quantizable_op_type:
if op_type in self._quantizable_op_type: if op_type in self._quantizable_op_type:
minor_quantizable_op_types.append(op_type) minor_quantizable_op_types.append(op_type)
add_quant_dequant_pass = AddQuantDequantPass( if not self._onnx_format:
scope=self._scope, add_quant_dequant_pass = AddQuantDequantPass(
place=self._place, scope=self._scope,
quantizable_op_type=minor_quantizable_op_types) place=self._place,
quantizable_op_type=minor_quantizable_op_types)
else:
add_quant_dequant_pass = AddQuantDequantPassV2(
scope=self._scope,
place=self._place,
quantizable_op_type=minor_quantizable_op_types,
is_full_quantized=self._is_full_quantize)
for sub_graph in graph.all_sub_graphs(): for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True sub_graph._for_test = True
...@@ -914,33 +935,39 @@ class PostTrainingQuantization(object): ...@@ -914,33 +935,39 @@ class PostTrainingQuantization(object):
else: else:
scale_dict = self._quantized_threshold scale_dict = self._quantized_threshold
for key, val in scale_dict.items(): for key, val in scale_dict.items():
set_variable_data( utils.set_variable_data(
self._scope, self._scope,
self._place, self._place,
key + ".scale", key + ".scale",
np.array( np.array(
[val], dtype=np.float32)) [val], dtype=np.float32))
set_variable_data( utils.set_variable_data(
self._scope, self._scope,
self._place, self._place,
key + ".quant_dequant.scale", key + ".quant_dequant.scale",
np.array( np.array(
[val], dtype=np.float32)) [val], dtype=np.float32))
# apply QuantizationFreezePass, and obtain the final quant model if not self._onnx_format:
freeze_pass = QuantizationFreezePass( # apply QuantizationFreezePass, and obtain the final quant model
scope=self._scope, freeze_pass = QuantizationFreezePass(
place=self._place, scope=self._scope,
bias_correction=self._bias_correction, place=self._place,
weight_bits=self._weight_bits, bias_correction=self._bias_correction,
round_type=self._round_type, weight_bits=self._weight_bits,
activation_bits=self._activation_bits, round_type=self._round_type,
weight_quantize_type=self._weight_quantize_type, activation_bits=self._activation_bits,
quantizable_op_type=major_quantizable_op_types) weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True for sub_graph in graph.all_sub_graphs():
freeze_pass.apply(sub_graph) sub_graph._for_test = True
freeze_pass.apply(sub_graph)
else:
quant_weight_pass = QuantWeightPass(self._scope, self._place)
for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True
quant_weight_pass.apply(sub_graph)
self._program = graph.to_program() self._program = graph.to_program()
...@@ -960,7 +987,7 @@ class PostTrainingQuantization(object): ...@@ -960,7 +987,7 @@ class PostTrainingQuantization(object):
op._set_attr("quantization_type", quantized_type) op._set_attr("quantization_type", quantized_type)
def analysis_and_save_info(op_node, out_var_name): def analysis_and_save_info(op_node, out_var_name):
argname_index = _get_output_name_index(op_node, out_var_name) argname_index = utils._get_output_name_index(op_node, out_var_name)
assert argname_index is not None, \ assert argname_index is not None, \
out_var_name + " is not the output of the op" out_var_name + " is not the output of the op"
if self._algo == "KL": if self._algo == "KL":
...@@ -997,7 +1024,7 @@ class PostTrainingQuantization(object): ...@@ -997,7 +1024,7 @@ class PostTrainingQuantization(object):
for op in self._program.blocks[block_id].ops: for op in self._program.blocks[block_id].ops:
if op.type in ( if op.type in (
self._quantizable_op_type + self._out_scale_op_list): self._quantizable_op_type + self._out_scale_op_list):
out_var_names = _get_op_output_var_names(op) out_var_names = utils._get_op_output_var_names(op)
for var_name in out_var_names: for var_name in out_var_names:
analysis_and_save_info(op, var_name) analysis_and_save_info(op, var_name)
...@@ -1020,11 +1047,11 @@ class PostTrainingQuantization(object): ...@@ -1020,11 +1047,11 @@ class PostTrainingQuantization(object):
quantization_type = str("post_" + self._algo).lower() quantization_type = str("post_" + self._algo).lower()
persistable_var_names = _all_persistable_var_names(self._program) persistable_var_names = _all_persistable_var_names(self._program)
for op in target_ops: for op in target_ops:
for var_name in _get_op_input_var_names(op): for var_name in utils._get_op_input_var_names(op):
if var_name in persistable_var_names: if var_name in persistable_var_names:
var_data = load_variable_data(self._scope, var_name) var_data = utils.load_variable_data(self._scope, var_name)
threshold = float(np.max(np.abs(var_data))) threshold = float(np.max(np.abs(var_data)))
argname, index = _get_input_name_index(op, var_name) argname, index = utils._get_input_name_index(op, var_name)
op._set_attr(argname + str(index) + "_threshold", threshold) op._set_attr(argname + str(index) + "_threshold", threshold)
op._set_attr("quantization_type", quantization_type) op._set_attr("quantization_type", quantization_type)
op._set_attr("bit_length", self._weight_bits) op._set_attr("bit_length", self._weight_bits)
...@@ -1268,7 +1295,7 @@ class WeightQuantization(object): ...@@ -1268,7 +1295,7 @@ class WeightQuantization(object):
save_weight_dtype = np.int8 if weight_bits == 8 else np.int16 save_weight_dtype = np.int8 if weight_bits == 8 else np.int16
# Get quantized scale and weight data # Get quantized scale and weight data
weight_data = load_variable_data(scope, var_name) weight_data = utils.load_variable_data(scope, var_name)
if abs(threshold_rate) < 1e-10: if abs(threshold_rate) < 1e-10:
threshold_value = np.max(np.abs(weight_data)) threshold_value = np.max(np.abs(weight_data))
else: else:
...@@ -1282,11 +1309,13 @@ class WeightQuantization(object): ...@@ -1282,11 +1309,13 @@ class WeightQuantization(object):
# Set weight data # Set weight data
if not for_test: if not for_test:
set_variable_data(scope, place, var_name, quantized_weight_data) utils.set_variable_data(scope, place, var_name,
quantized_weight_data)
else: else:
dequantized_weight_data = \ dequantized_weight_data = \
(quantized_weight_data * scale).astype(np.float32) (quantized_weight_data * scale).astype(np.float32)
set_variable_data(scope, place, var_name, dequantized_weight_data) utils.set_variable_data(scope, place, var_name,
dequantized_weight_data)
# Save info # Save info
op._set_attr('quantization_type', 'post_weight_abs_max') op._set_attr('quantization_type', 'post_weight_abs_max')
...@@ -1303,7 +1332,7 @@ class WeightQuantization(object): ...@@ -1303,7 +1332,7 @@ class WeightQuantization(object):
save_weight_dtype = np.int8 if weight_bits == 8 else np.int16 save_weight_dtype = np.int8 if weight_bits == 8 else np.int16
# Get quantized scale and weight data # Get quantized scale and weight data
weight_data = load_variable_data(scope, var_name) weight_data = utils.load_variable_data(scope, var_name)
if op.type == "mul": if op.type == "mul":
scales, quantized_weight_data = \ scales, quantized_weight_data = \
self._mul_channel_wise_quantization(weight_data, self._mul_channel_wise_quantization(weight_data,
...@@ -1317,7 +1346,8 @@ class WeightQuantization(object): ...@@ -1317,7 +1346,8 @@ class WeightQuantization(object):
# Set weight data # Set weight data
if not for_test: if not for_test:
set_variable_data(scope, place, var_name, quantized_weight_data) utils.set_variable_data(scope, place, var_name,
quantized_weight_data)
else: else:
if op.type == "mul": if op.type == "mul":
dequantized_weight_data = \ dequantized_weight_data = \
...@@ -1328,7 +1358,8 @@ class WeightQuantization(object): ...@@ -1328,7 +1358,8 @@ class WeightQuantization(object):
else: else:
_logger.error(op.type + _logger.error(op.type +
" is not supported by weight quantization") " is not supported by weight quantization")
set_variable_data(scope, place, var_name, dequantized_weight_data) utils.set_variable_data(scope, place, var_name,
dequantized_weight_data)
# Save info # Save info
op._set_attr('quantization_type', 'post_weight_channel_wise_abs_max') op._set_attr('quantization_type', 'post_weight_channel_wise_abs_max')
......
...@@ -26,12 +26,20 @@ from ....data import data ...@@ -26,12 +26,20 @@ from ....data import data
from ....layers import mean from ....layers import mean
from ....executor import scope_guard from ....executor import scope_guard
from ....framework import _get_paddle_place from ....framework import _get_paddle_place
from .utils import _channelwise_quant_axis1_ops, quant_tensor from . import utils
__all__ = [ __all__ = [
'QuantizationTransformPass', 'QuantizationFreezePass', 'ConvertToInt8Pass', 'QuantizationTransformPass',
'TransformForMobilePass', 'OutScaleForTrainingPass', 'QuantizationFreezePass',
'OutScaleForInferencePass', 'AddQuantDequantPass' 'ConvertToInt8Pass',
'TransformForMobilePass',
'OutScaleForTrainingPass',
'OutScaleForInferencePass',
'AddQuantDequantPass',
'QuantizationTransformPassV2',
'AddQuantDequantPassV2',
'ReplaceFakeQuantDequantPass',
'QuantWeightPass',
] ]
_fake_quant_op_list = [ _fake_quant_op_list = [
...@@ -44,278 +52,13 @@ _fake_dequant_op_list = [ ...@@ -44,278 +52,13 @@ _fake_dequant_op_list = [
] ]
_fake_quant_dequant_op_list = [ _fake_quant_dequant_op_list = [
'fake_quantize_dequantize_moving_average_abs_max' 'fake_quantize_dequantize_moving_average_abs_max',
"fake_channel_wise_quantize_dequantize_abs_max",
] ]
_out_scale_op_list = [
"conv2d",
"depthwise_conv2d",
"mul",
"matmul",
"matmul_v2",
"relu",
"leaky_relu",
"relu6",
"sigmoid",
"tanh",
"prelu",
"swish",
"dropout",
"softmax",
"batch_norm",
"layer_norm",
"elementwise_add",
"pool2d",
"reshape2",
"transpose2",
"concat",
"elementwise_mul",
"elementwise_pow",
"elementwise_sub",
"scale",
"slice",
"hard_swish",
"hard_sigmoid",
"conv2d_transpose",
"gru",
"bilinear_interp",
"nearest_interp",
"trilinear_interp",
"flatten",
"flatten2",
"transpose",
"pad2d",
"pad3d",
"reshape",
"split",
"flatten_contiguous_range",
"squeeze",
"squeeze2",
"nearest_interp_v2",
"fill_constant_batch_size_like",
"bilinear_interp",
"bilinear_interp_v2",
"arg_max",
"abs",
"assign",
"cast",
"clip",
"box_coder",
"crop",
"cumsum",
"equal",
"expand_v2",
"fill_any_like",
"fill_constant",
"gelu",
"instance_norm",
"lookup_table",
"lookup_table_v2",
"norm",
"p_norm",
"pow",
"reduce_mean",
"stack",
"top_k_v2",
"unsqueeze",
"unsqueeze2",
"logical_and",
"logical_not",
"meshgrid",
"roi_align",
"strided_slice",
"where",
"grid_sampler",
"tile",
"group_norm",
"reduce_sum",
"square",
"softplus",
"gather",
"shuffle_channel",
]
# list op real input and output names, to avoid processing input such as AxisTensor.
_op_real_in_out_name = {
"conv2d": [["Input", "Filter"], ["Output"]],
"depthwise_conv2d": [["Input", "Filter"], ["Output"]],
"conv2d_transpose": [["Input", "Filter"], ["Output"]],
"mul": [["X", "Y"], ["Out"]],
"matmul": [["X", "Y"], ["Out"]],
"matmul_v2": [["X", "Y"], ["Out"]],
"pool2d": [["X"], ["Out"]],
"elementwise_add": [["X", "Y"], ["Out"]],
"concat": [["X"], ["Out"]],
"softmax": [["X"], ["Out"]],
"argmax": [["X"], ["Out"]],
"transpose": [["X"], ["Out"]],
"equal": [["X", "Y"], ["Out"]],
"gather": [["X"], ["Out"]],
"greater_equal": [["X", "Y"], ["Out"]],
"greater_than": [["X", "Y"], ["Out"]],
"less_equal": [["X", "Y"], ["Out"]],
"less_than": [["X", "Y"], ["Out"]],
"mean": [["X"], ["Out"]],
"not_equal": [["X", "Y"], ["Out"]],
"reshape": [["X"], ["Out"]],
"reshape2": [["X"], ["Out"]],
"transpose2": [["X"], ["Out"]],
"bilinear_interp": [["X"], ["Out"]],
"nearest_interp": [["X"], ["Out"]],
"trilinear_interp": [["X"], ["Out"]],
"slice": [["Input"], ["Out"]],
"squeeze": [["X"], ["Out"]],
"elementwise_sub": [["X", "Y"], ["Out"]],
"relu": [["X"], ["Out"]],
"relu6": [["X"], ["Out"]],
"leaky_relu": [["X"], ["Out"]],
"prelu": [["X", "Alpha"], ["Out"]],
"tanh": [["X"], ["Out"]],
"swish": [["X"], ["Out"]],
"dropout": [["X"], ["Out"]],
"batch_norm": [["X"], ["Y"]],
"layer_norm": [["X"], ["Y"]],
"sigmoid": [["X"], ["Out"]],
"elementwise_mul": [["X", "Y"], ["Out"]],
"elementwise_pow": [["X", "Y"], ["Out"]],
"scale": [["X"], ["Out"]],
"hard_swish": [["X"], ["Out"]],
"hard_sigmoid": [["X"], ["Out"]],
"gru": [["Input", "Weight"], ["Hidden"]],
"lstm": [["Input", "Weight"], ["Hidden"]],
"pad2d": [["X"], ["Out"]],
"pad3d": [["X"], ["Out"]],
"flatten": [["X"], ["Out"]],
"flatten2": [["X"], ["Out"]],
"unsqueeze2": [["X"], ["Out"]],
"unsqueeze2": [["X"], ["Out"]],
"flatten_contiguous_range": [["X"], ["Out"]],
"split": [["X"], ["Out"]],
"squeeze2": [["X"], ["Out"]],
"nearest_interp_v2": [["X"], ["Out"]],
"bilinear_interp": [["X"], ["Out"]],
"bilinear_interp_v2": [["X"], ["Out"]],
"fill_constant_batch_size_like": [["Input"], ["Out"]],
"arg_max": [["X"], ["Out"]],
"abs": [["X"], ["Out"]],
"assign": [["X"], ["Out"]],
"cast": [["X"], ["Out"]],
"clip": [["X"], ["Out"]],
"box_coder": [["PriorBox"], ["OutputBox"]],
"crop": [["X"], ["Out"]],
"cumsum": [["X"], ["Out"]],
"expand_v2": [["X"], ["Out"]],
"fill_any_like": [["X"], ["Out"]],
"fill_constant": [[], ["Out"]],
"gelu": [["X"], ["Out"]],
"instance_norm": [["X"], ["Out"]],
"lookup_table": [["W", "Ids"], ["Out"]],
"lookup_table_v2": [["W", "Ids"], ["Out"]],
"norm": [["X"], ["Norm"]],
"p_norm": [["X"], ["Out"]],
"pow": [["X"], ["Out"]],
"reduce_mean": [["X"], ["Out"]],
"stack": [["X"], ["Y"]],
"top_k_v2": [["X"], ["Out", "Indices"]],
"logical_and": [["X", "Y"], ["Out"]],
"logical_not": [["X"], ["Out"]],
"meshgrid": [["X"], ["Out"]],
"roi_align": [["X", "ROIs"], ["Out"]],
"strided_slice": [["Input"], ["Out"]],
"where": [["Condition", "X", "Y"], ["Out"]],
"grid_sampler": [["X", "Grid"], ["Output"]],
"tile": [["X"], ["Out"]],
"group_norm": [["X"], ["Y", "Mean", "Variance"]],
"reduce_sum": [["X"], ["Out"]],
"square": [["X"], ["Out"]],
"softplus": [["X"], ["Out"]],
"shuffle_channel": [["X"], ["Out"]],
}
_conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose']
_SCALE_DEFAULT_VALUE = 0.001
def _get_op_input_var_names(op):
"""
Get the input var names of the op.
Args:
op(IrNode, Operator): the input op.
Returns:
input_var_names or None.
"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
var_names = []
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return []
name_list = _op_real_in_out_name[op_name][0]
for name in name_list:
var_name = op.input(name)
if isinstance(var_name, list):
var_names.extend(var_name)
else:
var_names.append(var_name)
return var_names
def _get_input_name_index(op, input_var_name):
"""Get the input name and index of the var_name in the op"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return None
res = None
for argname in _op_real_in_out_name[op_name][0]:
var_names = op.input(argname)
for index, name in enumerate(var_names):
if name == input_var_name:
res = (argname, index)
return res
def _get_op_output_var_names(op):
""" """
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
var_names = []
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return []
name_list = _op_real_in_out_name[op_name][1]
for name in name_list:
var_name = op.output(name)
if isinstance(var_name, list):
var_names.extend(var_name)
else:
var_names.append(var_name)
return var_names
def _get_output_name_index(op, output_var_name):
"""Get the output name and index of the var_name in the op"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return None
name_list = _op_real_in_out_name[op_name][1]
res = None
for name in name_list:
var_name = op.output(name)
for index, val in enumerate(var_name):
if val == output_var_name:
res = (name, index)
return res
def _init_var_node(var_node, value, scope, place): def _init_var_node(var_node, value, scope, place):
...@@ -334,7 +77,7 @@ def _is_input_all_not_persistable(graph, op_node): ...@@ -334,7 +77,7 @@ def _is_input_all_not_persistable(graph, op_node):
Analyse the real inputs of the op node are all not persistable. Analyse the real inputs of the op node are all not persistable.
''' '''
is_input_all_not_persistable = True is_input_all_not_persistable = True
for var_name in _get_op_input_var_names(op_node): for var_name in utils._get_op_input_var_names(op_node):
in_node = graph._find_node_by_name(op_node.inputs, var_name) in_node = graph._find_node_by_name(op_node.inputs, var_name)
is_input_all_not_persistable = (is_input_all_not_persistable and \ is_input_all_not_persistable = (is_input_all_not_persistable and \
(not in_node.persistable())) (not in_node.persistable()))
...@@ -360,10 +103,6 @@ class QuantizationTransformPass(object): ...@@ -360,10 +103,6 @@ class QuantizationTransformPass(object):
Quantize the ops that have weights. Add quant and dequant ops for Quantize the ops that have weights. Add quant and dequant ops for
the quantized ops's inputs. the quantized ops's inputs.
""" """
_supported_quantizable_op_type = [
'conv2d', 'depthwise_conv2d', 'conv2d_transpose', 'mul', 'matmul',
'matmul_v2'
]
def __init__(self, def __init__(self,
scope=None, scope=None,
...@@ -493,7 +232,7 @@ class QuantizationTransformPass(object): ...@@ -493,7 +232,7 @@ class QuantizationTransformPass(object):
self._quantizable_ops = quantizable_op_type self._quantizable_ops = quantizable_op_type
for op in self._quantizable_ops: for op in self._quantizable_ops:
assert op in QuantizationTransformPass._supported_quantizable_op_type, \ assert op in utils._weight_supported_quantizable_op_type, \
op + " is not supported for quantization." op + " is not supported for quantization."
self._quantizable_grad_ops = [ self._quantizable_grad_ops = [
'%s_grad' % (op) for op in self._quantizable_ops '%s_grad' % (op) for op in self._quantizable_ops
...@@ -588,7 +327,7 @@ class QuantizationTransformPass(object): ...@@ -588,7 +327,7 @@ class QuantizationTransformPass(object):
else self._activation_quantize_type else self._activation_quantize_type
if quant_type == 'channel_wise_abs_max': # Weight quantization if quant_type == 'channel_wise_abs_max': # Weight quantization
quant_axis = 1 if op.name() in \ quant_axis = 1 if op.name() in \
_channelwise_quant_axis1_ops else 0 utils._channelwise_quant_axis1_ops else 0
quant_var_node, scale_var_node = self._insert_channel_quant_op( quant_var_node, scale_var_node = self._insert_channel_quant_op(
graph, var_node, name, quant_bits, quant_axis) graph, var_node, name, quant_bits, quant_axis)
dequant_var_node = self._insert_channel_dequant_op( dequant_var_node = self._insert_channel_dequant_op(
...@@ -753,7 +492,7 @@ class QuantizationTransformPass(object): ...@@ -753,7 +492,7 @@ class QuantizationTransformPass(object):
_init_var_node( _init_var_node(
scale_in_node, scale_in_node,
np.array( np.array(
[0.001], dtype=data_type), [_SCALE_DEFAULT_VALUE], dtype=data_type),
self._scope, self._scope,
self._place) self._place)
...@@ -821,7 +560,7 @@ class QuantizationTransformPass(object): ...@@ -821,7 +560,7 @@ class QuantizationTransformPass(object):
_init_var_node( _init_var_node(
scale_in_node, scale_in_node,
np.array( np.array(
[0.001], dtype=data_type), [_SCALE_DEFAULT_VALUE], dtype=data_type),
self._scope, self._scope,
self._place) self._place)
...@@ -1289,17 +1028,21 @@ class QuantizationFreezePass(object): ...@@ -1289,17 +1028,21 @@ class QuantizationFreezePass(object):
if self._round_type == 'round': if self._round_type == 'round':
if any( if any(
_check_grandchild_op_node(op_node, op) _check_grandchild_op_node(op_node, op)
for op in _channelwise_quant_axis1_ops): for op in utils._channelwise_quant_axis1_ops):
quant_axis = 1 quant_axis = 1
else: else:
quant_axis = 0 quant_axis = 0
quantized_param_v = quant_tensor(param_v.copy(), quantized_param_v = utils.quant_tensor(
scale_v, quant_axis, param_v.copy(), scale_v, quant_axis,
self._weight_bits) self._weight_bits)
quantized_param_v = np.round(quantized_param_v) quantized_param_v = np.round(quantized_param_v)
if self._bias_correction == True: if self._bias_correction == True:
quantized_param_v = self._bias_correction_w( quantized_param_v = utils.bias_correction_w(
param_v, quantized_param_v, scale_v, quant_axis) param_v,
quantized_param_v,
scale_v,
quant_axis,
weight_bits=self._weight_bits)
quantized_param_v = np.round(quantized_param_v) quantized_param_v = np.round(quantized_param_v)
self._restore_var(input_arg_name, quantized_param_v) self._restore_var(input_arg_name, quantized_param_v)
self._remove_fake_quant_and_dequant_op(graph, op_node) self._remove_fake_quant_and_dequant_op(graph, op_node)
...@@ -1319,7 +1062,7 @@ class QuantizationFreezePass(object): ...@@ -1319,7 +1062,7 @@ class QuantizationFreezePass(object):
op_node_desc.attr("quantization_type") == "qat_with_weight": op_node_desc.attr("quantization_type") == "qat_with_weight":
if self._weight_quantize_type == 'channel_wise_abs_max': if self._weight_quantize_type == 'channel_wise_abs_max':
quant_axis = 1 if op_node.name() in \ quant_axis = 1 if op_node.name() in \
_channelwise_quant_axis1_ops else 0 utils._channelwise_quant_axis1_ops else 0
self._insert_post_channel_dequant_op(graph, op_node, self._insert_post_channel_dequant_op(graph, op_node,
quant_axis) quant_axis)
else: else:
...@@ -1519,46 +1262,6 @@ class QuantizationFreezePass(object): ...@@ -1519,46 +1262,6 @@ class QuantizationFreezePass(object):
return isinstance(v, float) or isinstance(v, np.float32) \ return isinstance(v, float) or isinstance(v, np.float32) \
or isinstance(v, np.float64) or isinstance(v, np.float64)
def _bias_correction_w(self, x, x_quant, scale_v, quant_axis):
'''
Bias correction for weight
'''
eps = 1e-8
bnt = (1 << (self._weight_bits - 1)) - 1
x_dequant = x_quant.copy()
if isinstance(scale_v, list):
if quant_axis == 0:
for i, s in enumerate(scale_v):
x_dequant[i] = x_dequant[i] * s / bnt
quant_bias = x - x_dequant
mean_bias = quant_bias.reshape(quant_bias.shape[0], -1).mean(-1)
std_orig = x.reshape(x.shape[0], -1).std(-1)
std_quant = x_dequant.reshape(x_dequant.shape[0], -1).std(-1)
std_bias = std_orig / (std_quant + eps)
else:
for i, s in enumerate(scale_v):
x_dequant[:, i] = x_quant[:, i] * s / bnt
quant_bias = x - x_dequant
mean_bias = np.array([
quant_bias[:, i].mean() for i in range(quant_bias.shape[1])
])
std_orig = np.array([x[:, i].std() for i in range(x.shape[1])])
std_quant = np.array(
[x_dequant[:, i].std() for i in range(x_dequant.shape[1])])
std_bias = std_orig / (std_quant + eps)
else:
x_dequant = x_quant * scale_v / bnt
mean_bias = (x - x_dequant).mean()
std_bias = x.std() / (x_dequant.std() + eps)
if mean_bias.ndim == 1:
std_bias = np.resize(std_bias, x.shape)
mean_bias = np.resize(mean_bias, x.shape)
x_dequant = (mean_bias + x_dequant) * std_bias
quantized_param_v = quant_tensor(x_dequant, scale_v, quant_axis,
self._weight_bits)
return quantized_param_v
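
For context on the helper that replaces the removed method (utils.bias_correction_w), the per-tensor case amounts to re-matching the mean and standard deviation of the dequantized weights against the originals; a sketch with illustrative names, not the actual utils implementation:

import numpy as np

def bias_correct_per_tensor(w, w_quant, scale, weight_bits=8, eps=1e-8):
    bnt = (1 << (weight_bits - 1)) - 1
    w_dequant = w_quant * scale / bnt              # reconstruct float weights
    mean_bias = (w - w_dequant).mean()             # mean shift introduced by rounding
    std_bias = w.std() / (w_dequant.std() + eps)   # std shrinkage introduced by quantization
    # corrected float weights; the caller re-quantizes them with the same scale
    return (w_dequant + mean_bias) * std_bias
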
class ConvertToInt8Pass(object): class ConvertToInt8Pass(object):
def __init__(self, scope, place, quantizable_op_type=None): def __init__(self, scope, place, quantizable_op_type=None):
...@@ -1707,7 +1410,7 @@ class OutScaleForTrainingPass(object): ...@@ -1707,7 +1410,7 @@ class OutScaleForTrainingPass(object):
self._place = _get_paddle_place(place) self._place = _get_paddle_place(place)
self._moving_rate = moving_rate self._moving_rate = moving_rate
self._is_test = None self._is_test = None
self._teller_set = _out_scale_op_list self._teller_set = utils._out_scale_op_list
def apply(self, graph): def apply(self, graph):
""" """
...@@ -1725,7 +1428,7 @@ class OutScaleForTrainingPass(object): ...@@ -1725,7 +1428,7 @@ class OutScaleForTrainingPass(object):
if op.name() in self._teller_set: if op.name() in self._teller_set:
target_ops.append(op) target_ops.append(op)
for op in target_ops: for op in target_ops:
for output_var_name in _get_op_output_var_names(op): for output_var_name in utils._get_op_output_var_names(op):
in_node = graph._find_node_by_name(op.outputs, output_var_name) in_node = graph._find_node_by_name(op.outputs, output_var_name)
if in_node.dtype() not in \ if in_node.dtype() not in \
[core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]: [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
...@@ -1796,14 +1499,13 @@ class OutScaleForTrainingPass(object): ...@@ -1796,14 +1499,13 @@ class OutScaleForTrainingPass(object):
graph.link_to(accum_in_node, scale_op_node) graph.link_to(accum_in_node, scale_op_node)
graph.link_to(scale_op_node, state_out_node) graph.link_to(scale_op_node, state_out_node)
graph.link_to(scale_op_node, accum_out_node) graph.link_to(scale_op_node, accum_out_node)
graph.resolve_hazard()
return graph return graph
def _scale_name(self, var_name): def _scale_name(self, var_name):
""" """
Return the scale name for the var named `var_name`. Return the scale name for the var named `var_name`.
""" """
return "%s@scale" % (var_name) return "%s.scale" % (var_name)
class OutScaleForInferencePass(object): class OutScaleForInferencePass(object):
...@@ -1816,7 +1518,7 @@ class OutScaleForInferencePass(object): ...@@ -1816,7 +1518,7 @@ class OutScaleForInferencePass(object):
scope(fluid.Scope): The scope is used to initialize these new parameters. scope(fluid.Scope): The scope is used to initialize these new parameters.
""" """
self._scope = scope self._scope = scope
self._teller_set = _out_scale_op_list self._teller_set = utils._out_scale_op_list
def apply(self, graph): def apply(self, graph):
""" """
...@@ -1831,7 +1533,7 @@ class OutScaleForInferencePass(object): ...@@ -1831,7 +1533,7 @@ class OutScaleForInferencePass(object):
op_nodes = graph.all_op_nodes() op_nodes = graph.all_op_nodes()
for op_node in op_nodes: for op_node in op_nodes:
if op_node.name() in self._teller_set: if op_node.name() in self._teller_set:
var_names = _get_op_output_var_names(op_node) var_names = utils._get_op_output_var_names(op_node)
for var_name in var_names: for var_name in var_names:
in_node = graph._find_node_by_name(op_node.outputs, in_node = graph._find_node_by_name(op_node.outputs,
var_name) var_name)
...@@ -1848,7 +1550,8 @@ class OutScaleForInferencePass(object): ...@@ -1848,7 +1550,8 @@ class OutScaleForInferencePass(object):
# For compatibility, we save output threshold by two methods. # For compatibility, we save output threshold by two methods.
op_node.op()._set_attr("out_threshold", float(scale_value)) op_node.op()._set_attr("out_threshold", float(scale_value))
argname_index = _get_output_name_index(op_node, var_name) argname_index = utils._get_output_name_index(op_node,
var_name)
assert argname_index is not None, \ assert argname_index is not None, \
var_name + " is not the output of the op" var_name + " is not the output of the op"
op_node.op()._set_attr(argname_index[0] + str(argname_index[1]) \ op_node.op()._set_attr(argname_index[0] + str(argname_index[1]) \
...@@ -1861,7 +1564,7 @@ class OutScaleForInferencePass(object): ...@@ -1861,7 +1564,7 @@ class OutScaleForInferencePass(object):
""" """
Return the scale name for the var named `var_name`. Return the scale name for the var named `var_name`.
""" """
return "%s@scale" % (var_name) return "%s.scale" % (var_name)
class AddQuantDequantPass(object): class AddQuantDequantPass(object):
...@@ -1869,95 +1572,6 @@ class AddQuantDequantPass(object): ...@@ -1869,95 +1572,6 @@ class AddQuantDequantPass(object):
Quantize the ops that do not have weights, and add quant_dequant op for the Quantize the ops that do not have weights, and add quant_dequant op for the
quantized ops's inputs. quantized ops's inputs.
""" """
_supported_quantizable_op_type = [
"pool2d",
"elementwise_add",
"concat",
"softmax",
"argmax",
"transpose",
"equal",
"gather",
"greater_equal",
"greater_than",
"less_equal",
"less_than",
"mean",
"not_equal",
"reshape",
"reshape2",
"dropout",
"bilinear_interp",
"nearest_interp",
"trilinear_interp",
"slice",
"squeeze",
"elementwise_sub",
"mul",
"matmul",
"relu",
"relu6",
"leaky_relu",
"tanh",
"swish",
"scale",
"transpose",
"transpose2",
"sigmoid",
"pad2d",
"flatten",
"flatten2",
"batch_norm",
"layer_norm",
"matmul_v2",
"split",
"flatten_contiguous_range",
"squeeze2",
"nearest_interp_v2",
"bilinear_interp",
"bilinear_interp_v2",
"fill_constant_batch_size_like",
"arg_max",
"abs",
"assign",
"cast",
"clip",
"box_coder",
"crop",
"cumsum",
"elementwise_mul",
"elementwise_pow",
"expand_v2",
"fill_any_like",
"fill_constant",
"gelu",
"hard_sigmoid",
"hard_swish",
"instance_norm",
"lookup_table",
"lookup_table_v2",
"norm",
"p_norm",
"pad3d",
"pow",
"prelu",
"reduce_mean",
"unsqueeze",
"unsqueeze2",
"logical_and",
"logical_not",
"meshgrid",
"roi_align",
"strided_slice",
"where",
"grid_sampler",
"tile",
"group_norm",
"reduce_sum",
"square",
"softplus",
"shuffle_channel",
]
# To be compatible with PaddleSlim, not remove _activation_type for now # To be compatible with PaddleSlim, not remove _activation_type for now
_activation_type = ["relu", "relu6", "leaky_relu", "tanh", "swish"] _activation_type = ["relu", "relu6", "leaky_relu", "tanh", "swish"]
...@@ -2000,12 +1614,11 @@ class AddQuantDequantPass(object): ...@@ -2000,12 +1614,11 @@ class AddQuantDequantPass(object):
self._skip_pattern = skip_pattern self._skip_pattern = skip_pattern
if is_full_quantized: if is_full_quantized:
self._quantizable_op_type = \ self._quantizable_op_type = utils._act_supported_quantizable_op_type
AddQuantDequantPass._supported_quantizable_op_type
else: else:
self._quantizable_op_type = quantizable_op_type self._quantizable_op_type = quantizable_op_type
for op_type in quantizable_op_type: for op_type in quantizable_op_type:
assert op_type in AddQuantDequantPass._supported_quantizable_op_type, \ assert op_type in utils._act_supported_quantizable_op_type, \
op_type + " is not supported for quantization." op_type + " is not supported for quantization."
self._quantizable_grad_op_type = [ self._quantizable_grad_op_type = [
'%s_grad' % (op) for op in self._quantizable_op_type '%s_grad' % (op) for op in self._quantizable_op_type
...@@ -2050,7 +1663,7 @@ class AddQuantDequantPass(object): ...@@ -2050,7 +1663,7 @@ class AddQuantDequantPass(object):
"qat_without_weight") "qat_without_weight")
op_node.op()._set_attr("activation_bits", self._quant_bits) op_node.op()._set_attr("activation_bits", self._quant_bits)
op_node.op()._set_attr("with_quant_attr", True) op_node.op()._set_attr("with_quant_attr", True)
arg_names = _get_op_input_var_names(op_node) arg_names = utils._get_op_input_var_names(op_node)
for arg_name in arg_names: for arg_name in arg_names:
in_node = graph._find_node_by_name(op_node.inputs, arg_name) in_node = graph._find_node_by_name(op_node.inputs, arg_name)
if arg_name in dequantized_vars_map: if arg_name in dequantized_vars_map:
...@@ -2095,7 +1708,7 @@ class AddQuantDequantPass(object): ...@@ -2095,7 +1708,7 @@ class AddQuantDequantPass(object):
_init_var_node( _init_var_node(
scale_in_node, scale_in_node,
np.array( np.array(
[0.001], dtype=data_type), [_SCALE_DEFAULT_VALUE], dtype=data_type),
self._scope, self._scope,
self._place) self._place)
...@@ -2162,3 +1775,870 @@ class AddQuantDequantPass(object): ...@@ -2162,3 +1775,870 @@ class AddQuantDequantPass(object):
graph.link_to(quant_op_node, accum_out_node) graph.link_to(quant_op_node, accum_out_node)
return quant_var_node, scale_out_node return quant_var_node, scale_out_node
class InsertQuantizeLinear(object):
"""
Insert quantize_linear and dequantize_linear ops before the ops to be quantized.
Args:
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to restore the weight tensors.
If it is a string, it can be ``cpu`` or ``gpu:x``, where ``x`` is the index of the GPU.
scope(paddle.Scope): scope is used to get the weight tensor values.
quant_bits(int, optional): quantization bit number for weight. Default is 8.
quant_axis(int, optional): quantization dimension of channels. When it is greater than or
equal to 0, quantization is applied per channel along that axis; otherwise it is
applied per layer. Default is -1.
channel_wise(bool, optional): Whether to quantize per channel or not. Default is False.
is_test(bool, optional): Whether the graph is for inference (True) or training (False). Default is True.
"""
def __init__(self,
place,
scope,
quant_bits=8,
quant_axis=-1,
channel_wise=False,
is_test=True):
self._place = place
self._scope = scope
self.quant_bits = quant_bits
self.quant_axis = quant_axis
self.channel_wise = channel_wise
self._is_test = is_test
def insert_quant_op(self, graph, var_node):
assert var_node.is_var(), '{} is not a var'.format(var_node.name())
quant_var_node = graph.create_var_node(
name=self._quantized_var_name(var_node.name()),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
if self.channel_wise:
scale_var_shape = var_node.shape()[self.quant_axis]
scale_var_type = core.VarDesc.VarType.LOD_TENSOR
init_scale_value = np.zeros(scale_var_shape, dtype=data_type)
else:
scale_var_shape = 1
scale_var_type = var_node.type()
init_scale_value = np.array([_SCALE_DEFAULT_VALUE], dtype=data_type)
scale_var_node = graph.create_persistable_node(
name=self._quantized_scale_name(var_node.name()),
var_type=scale_var_type,
shape=[scale_var_shape],
var_dtype=var_node.dtype())
_init_var_node(scale_var_node, init_scale_value, self._scope,
self._place)
zero_point_node = None
if zero_point_node is None:
zero_point_node = graph.create_persistable_node(
name=self._zero_point_name(quant_var_node.name()),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=scale_var_node.shape(),
var_dtype=core.VarDesc.VarType.INT32)
_init_var_node(
zero_point_node,
np.zeros(
scale_var_node.shape(), dtype="int32"),
self._scope,
self._place)
inputs = {"X": var_node, "Scale": scale_var_node}
if zero_point_node is not None:
inputs["ZeroPoint"] = zero_point_node
attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
outputs = {"Y": quant_var_node}
if not self._is_test:
attrs["is_test"] = self._is_test
attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
scale_out_node = graph.create_var_node_from_desc(scale_var_node.var(
))
outputs["OutScale"] = scale_out_node
quant_op_node = graph.create_op_node(
op_type="quantize_linear",
attrs=attrs,
inputs=inputs,
outputs=outputs)
graph.link_to(var_node, quant_op_node)
graph.link_to(scale_var_node, quant_op_node)
if zero_point_node is not None:
graph.link_to(zero_point_node, quant_op_node)
graph.link_to(quant_op_node, quant_var_node)
if not self._is_test:
graph.link_to(quant_op_node, scale_out_node)
return quant_var_node, scale_var_node
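# Rough sketch of what the inserted quantize_linear op computes at runtime
# (assuming symmetric quantization with the zero point initialized to 0 above):
#   bnt = (1 << (quant_bits - 1)) - 1            # e.g. 127 for 8 bits
#   y   = clip(round(x / scale * bnt), -bnt, bnt)
# applied per layer, or per channel along quant_axis when channel_wise is True.
# The exact arithmetic lives in the quantize_linear kernel; this comment only
# documents the intent of the inserted node.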
def insert_dequant_op(self, graph, var_node, scale_var_node):
assert var_node.is_var(), '{} is not a var'.format(var_node.name())
dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(var_node.name()),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
zero_point_node = None
if zero_point_node is None:
zero_point_node = graph.create_persistable_node(
name=self._zero_point_name(dequant_var_node.name()),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=scale_var_node.shape(),
var_dtype=core.VarDesc.VarType.INT32)
_init_var_node(
zero_point_node,
np.zeros(
scale_var_node.shape(), dtype="int32"),
self._scope,
self._place)
inputs = {"X": var_node, "Scale": scale_var_node}
if zero_point_node is not None:
inputs["ZeroPoint"] = zero_point_node
attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
if not self._is_test:
attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
quant_op_node = graph.create_op_node(
op_type="dequantize_linear",
attrs=attrs,
inputs=inputs,
outputs={"Y": dequant_var_node})
graph.link_to(var_node, quant_op_node)
graph.link_to(scale_var_node, quant_op_node)
if zero_point_node is not None:
graph.link_to(zero_point_node, quant_op_node)
graph.link_to(quant_op_node, dequant_var_node)
return dequant_var_node
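# Rough sketch of the matching dequantize_linear computation (again assuming a
# zero point of 0): x_hat = y * scale / bnt, with bnt = (1 << (quant_bits - 1)) - 1.
# Together with the quantize_linear op inserted above, this forms the usual
# quantize-dequantize ("fake quant") pair used during QAT.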
def _quantized_var_name(self, var_name):
"""
Return quantized variable name for the input `var_name`.
"""
return "%s.quantized" % (var_name)
def _dequantized_var_name(self, var_name):
"""
Return dequantized variable name for the input `var_name`.
"""
return "%s.dequantized" % (var_name)
def _quantized_scale_name(self, var_name):
"""
Return the scale name of quantized variable for the input `var_name`.
"""
return "%s.scale" % (var_name)
def _zero_point_name(self, var_name):
"""
Return the zero-point name for the var named `var_name`.
"""
return "%s@zero_point" % (var_name)
class QuantizationTransformPassV2(object):
"""
Quantize the ops that have weights. Add quant and dequant ops for
the quantized ops' inputs.
"""
def __init__(self,
scope=None,
place=None,
weight_bits=8,
activation_bits=8,
activation_quantize_type='abs_max',
weight_quantize_type='abs_max',
window_size=10000,
moving_rate=0.9,
skip_pattern=['skip_quant'],
quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'],
weight_quantize_func=None,
act_quantize_func=None,
weight_preprocess_func=None,
act_preprocess_func=None,
optimizer_func=None,
executor=None):
r"""
Args:
scope(paddle.Scope): When activation use 'range_abs_max' as the quantize
type, this pass will create some new parameters. The scope is used to
initialize these new parameters.
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new
parameters described above. If it is a string, it can be ``cpu`` or ``gpu:x``,
where ``x`` is the index of the GPU.
weight_bits(int): quantization bit number for weights,
the bias is not quantized.
activation_bits(int): quantization bit number for activation.
activation_quantize_type(str): quantization type for activation,
now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
If use 'abs_max' mode, the quantization scale will be calculated
dynamically each step in both training and testing period. If use
'range_abs_max', a static quantization scale will be calculated
during training and used in inference.
weight_quantize_type(str): quantization type for weights,
support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max'
usually is not used for weight, since weights are fixed once the
model is well trained.
window_size(int): the window size for 'range_abs_max' quantization.
moving_rate(float): the param for 'moving_average_abs_max' quantization.
skip_pattern(str or str list): The user-defined quantization skip pattern, which
will be presented in the name scope of an op. When the skip pattern is
detected in an op's name scope, the corresponding op will not be quantized.
quantizable_op_type(list[str]): List the type of ops that will be quantized.
Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in
QuantizationFreezePass and ConvertToInt8Pass must be the same as this.
weight_quantize_func(function): Function that defines how to quantize the weight.
Using this, users can quickly check whether their own quantization method
works. The function should define both the quantization and the
dequantization process, that is, it takes the non-quantized weight as
input and returns the dequantized weight. If None, the quantization op
defined by 'weight_quantize_type' is used. Default is None.
act_quantize_func(function): Function that defines how to quantize the activation.
Using this, users can quickly check whether their own quantization method
works. The function should define both the quantization and the
dequantization process, that is, it takes the non-quantized activation as
input and returns the dequantized activation. If None, the quantization
op defined by 'activation_quantize_type' is used. Default is None.
weight_preprocess_func(function): Function that defines how to preprocess the
weight before quantization. Using this, users can quickly check whether
their preprocessing method works. The function takes the non-quantized
weight as input and returns the processed weight to be quantized. If None,
the weight is quantized directly. Default is None.
act_preprocess_func(function): Function that defines how to preprocess the
activation before quantization. Using this, users can quickly check whether
their preprocessing method works. The function takes the non-quantized
activation as input and returns the processed activation to be quantized.
If None, the activation is quantized directly. Default is None.
optimizer_func(function): Function that returns an optimizer. When 'is_test'
is False and the user wants to use a self-defined quantization function
and preprocess function, this function must be set. Default is None.
executor(paddle.Executor): If the user wants to use a self-defined quantization
function and preprocess function, the executor must be set for
initialization. Default is None.
Examples:
.. code-block:: python
# The original graph will be rewritten.
import paddle
from paddle.fluid.contrib.slim.quantization \
import QuantizationTransformPassV2
from paddle.fluid.contrib.slim.graph import IrGraph
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
place = paddle.CPUPlace()
scope = paddle.static.global_scope()
transform_pass = QuantizationTransformPassV2(scope, place)
transform_pass.apply(graph)
"""
self._scope = scope
self._place = _get_paddle_place(place)
self._weight_bits = weight_bits
self._activation_bits = activation_bits
self._skip_pattern = skip_pattern
self._weight_quantize_func = weight_quantize_func
self._act_quantize_func = act_quantize_func
self._weight_preprocess_func = weight_preprocess_func
self._act_preprocess_func = act_preprocess_func
self._optimizer = optimizer_func
self._exe = executor
quant_type = [
'abs_max', 'channel_wise_abs_max', 'range_abs_max',
'moving_average_abs_max'
]
assert activation_quantize_type != 'channel_wise_abs_max', \
"The activation quantization type does not support 'channel_wise_abs_max'."
if activation_quantize_type not in quant_type:
raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be "
"'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." %
(str(activation_quantize_type)))
if weight_quantize_type not in quant_type:
raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be "
"'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' "
"or 'moving_average_abs_max'." % (str(weight_quantize_type)))
self._activation_quantize_type = activation_quantize_type
self._weight_quantize_type = weight_quantize_type
self._window_size = window_size
self._moving_rate = moving_rate
self._quantizable_ops = quantizable_op_type
for op in self._quantizable_ops:
assert op in utils._weight_supported_quantizable_op_type, \
op + " is not supported for quantization."
self._quantizable_grad_ops = [
'%s_grad' % (op) for op in self._quantizable_ops
]
self._is_test = None
self._global_step = None
self.create_var_map = {}
self.create_op_map = {}
# Record the variables that have already been dequantized.
self.dequantized_vars = collections.OrderedDict()
self.persistable_vars = []
self.processed_vars = []
def _quant_preprocess(self, op_node):
user_skipped = False
if isinstance(self._skip_pattern, list):
user_skipped = op_node.op().has_attr("op_namescope") and \
any(pattern in op_node.op().attr("op_namescope") \
for pattern in self._skip_pattern)
elif isinstance(self._skip_pattern, str):
user_skipped = op_node.op().has_attr("op_namescope") and \
op_node.op().attr("op_namescope").find(
self._skip_pattern) != -1
if user_skipped:
op_node.op()._set_attr("skip_quant", True)
op_node.op()._set_attr("with_quant_attr", True)
def _transform_forward(self, graph, op):
op.op()._set_attr("quantization_type", "qat_with_weight")
inputs = op.inputs
for var_node in inputs:
if var_node.name() not in op.input_arg_names():
continue
if var_node.name() in self.dequantized_vars:
dequant_var_node = self.dequantized_vars[var_node.name()]
else:
name = var_node.name()
if name in self.processed_vars:
continue
is_weight = True if var_node.name() in self.persistable_vars \
else False
# If the var node is a weight and weight_preprocess_func is not None,
# insert the weight preprocess func to preprocess the weight
# before quantization.
# If the var node is an activation and act_preprocess_func is not None,
# insert the activation preprocess func to preprocess the activation
# before quantization.
if is_weight and self._weight_preprocess_func is not None:
var_node = self._insert_func(
graph, self._weight_preprocess_func, var_node, op)
elif not is_weight and self._act_preprocess_func is not None:
var_node = self._insert_func(
graph, self._act_preprocess_func, var_node, op)
# if var node is weight and weight_quantize_func is not None,
# will insert weight quantize func to quantize and dequantize weight
# if var node is activation and act_quantize_func is not None,
# will insert act quantize func to quantize and dequantize activation
if is_weight and self._weight_quantize_func is not None:
target_out_node = self._insert_func(
graph, self._weight_quantize_func, var_node, op)
self.processed_vars.append(name)
continue
elif not is_weight and self._act_quantize_func is not None:
target_out_node = self._insert_func(
graph, self._act_quantize_func, var_node, op)
self.processed_vars.append(name)
continue
quant_bits = self._weight_bits if var_node.name() in self.persistable_vars \
else self._activation_bits
quant_type = self._weight_quantize_type if is_weight \
else self._activation_quantize_type
quant_axis = -1
channel_wise = False
if quant_type == 'channel_wise_abs_max': # Weight quantization
channel_wise = True
quant_axis = 1 if op.name() in \
utils._channelwise_quant_axis1_ops else 0
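# Axis convention (mirrors utils._channelwise_quant_axis1_ops): ops such as
# conv2d_transpose/mul/matmul/matmul_v2 keep their output channels on axis 1
# of the weight, so they are quantized along axis 1, while conv2d and
# depthwise_conv2d weights are quantized along axis 0.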
insert_quant_pass = InsertQuantizeLinear(
self._place,
self._scope,
quant_bits=quant_bits,
quant_axis=quant_axis,
channel_wise=channel_wise,
is_test=self._is_test)
quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
graph, var_node)
dequant_var_node = insert_quant_pass.insert_dequant_op(
graph, quant_var_node, scale_var_node)
self.dequantized_vars[name] = dequant_var_node
graph.update_input_link(var_node, dequant_var_node, op)
def _transform_backward(self, graph, op):
for var_node in op.inputs:
if var_node.name() not in op.input_arg_names():
continue
if var_node.name() in self.dequantized_vars:
dequant_var_node = self.dequantized_vars[var_node.name()]
graph.update_input_link(var_node, dequant_var_node, op)
def _has_weight(self, op):
has_weight = False
for var_node in op.inputs:
if var_node.name() not in op.input_arg_names():
continue
name = var_node.name()
if var_node.name() in self.persistable_vars:
has_weight = True
return has_weight
def _is_skip_quant(self, graph, op_node):
"""
Analyse whether the op node skips quantization.
"""
is_skip = False
if op_node.op().has_attr("skip_quant") and \
op_node.op().attr("skip_quant"):
is_skip = True
# if the inputs of mul and matmul are not all persistable, use
# AddQuantDequantPassV2 to quantize them.
if op_node.name() in ["mul", "matmul", "matmul_v2"] and \
_is_input_all_not_persistable(graph, op_node):
is_skip = True
if op_node.op().has_attr("quantization_type") and \
op_node.op().attr("quantization_type") == "qat_without_weight":
is_skip = True
return is_skip
def apply(self, graph):
"""
Quantize the graph for training process. According to weight and
activation quantization type, the graph will be added some fake
quantize operators and fake dequantize operators.
Args:
graph(IrGraph): the applied graph.
Returns:
None
"""
assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.'
self._is_test = graph.is_test()
self.persistable_vars = [
p.name() for p in graph.all_persistable_nodes()
]
ops = graph.all_op_nodes()
# Do the preprocessing for quantization, such as marking the ops
# that should be skipped and not quantized.
for op in ops:
if op.name() in self._quantizable_ops or \
op.name() in self._quantizable_grad_ops:
self._quant_preprocess(op)
# Insert mapping table to solve the problem in saving inference model.
graph.out_node_mapping_table = dict()
# _transform_forward and _transform_backward have to run in two separate loops.
# The loop for transforming the forward graph:
for op in ops:
if op.name() in self._quantizable_ops:
if not self._is_skip_quant(graph, op) and self._has_weight(op):
self._transform_forward(graph, op)
# The loop for renaming the inputs of backward op.
for op in ops:
if op.name() in self._quantizable_grad_ops and self._has_weight(op):
self._transform_backward(graph, op)
return graph
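# Illustrative before/after of what apply() does to a weighted op (a sketch,
# not literal output of the pass):
#
#   before:  conv2d(Input=X, Filter=W)
#   after:   X -> quantize_linear -> dequantize_linear -> X.quantized.dequantized
#            W -> quantize_linear -> dequantize_linear -> W.quantized.dequantized
#            conv2d(Input=X.quantized.dequantized, Filter=W.quantized.dequantized)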
class AddQuantDequantPassV2(object):
"""
Quantize the ops that do not have weights, and add quantize_linear and
dequantize_linear ops for the quantized ops' inputs.
"""
# To be compatible with PaddleSlim, not remove _activation_type for now
_activation_type = ["relu", "relu6", "leaky_relu", "tanh", "swish"]
def __init__(self,
scope=None,
place=None,
moving_rate=0.9,
quant_bits=8,
skip_pattern=["skip_quant"],
quantizable_op_type=["elementwise_add", "pool2d"],
is_full_quantized=False):
"""
Args:
scope(paddle.Scope): The scope is used to initialize these new parameters.
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new
parameters described above. If ``place`` is a string, it can be ``cpu``
or ``gpu:x``, where ``x`` is the index of the GPU.
moving_rate(float, optional): the param for 'quant_dequant_moving_average_abs_max'
quantization. Default is 0.9.
quant_bits(int, optional): quantization bit number for activation. Default is 8.
skip_pattern(str, optional): The user-defined quantization skip pattern, which
will be presented in the name scope of an op. When the skip pattern is
detected in an op's name scope, the corresponding op will not be quantized.
Default is 'skip_quant'.
quantizable_op_type(list[str], optional): List the type of ops that will be
quantized. Default is ["elementwise_add", "pool2d"].
is_full_quantized(bool, optional): If set is_full_quantized as True, apply
quantization to all supported quantizable op type. If set is_full_quantized
as False, only apply quantization to the op type according to the input
quantizable_op_type.
Examples:
.. code-block:: python
# The original graph will be rewritten.
import paddle
from paddle.fluid.contrib.slim.quantization \
import AddQuantDequantPassV2
from paddle.fluid.contrib.slim.graph import IrGraph
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
place = paddle.CPUPlace()
scope = paddle.static.global_scope()
add_quant_dequant_pass = AddQuantDequantPassV2(scope, place)
add_quant_dequant_pass.apply(graph)
"""
self._scope = scope
self._place = _get_paddle_place(place)
self._moving_rate = moving_rate
self._quant_bits = quant_bits
self._is_test = None
self._skip_pattern = skip_pattern
if is_full_quantized:
self._quantizable_op_type = utils._act_supported_quantizable_op_type
else:
self._quantizable_op_type = quantizable_op_type
for op_type in quantizable_op_type:
assert op_type in utils._act_supported_quantizable_op_type, \
op_type + " is not supported for quantization."
self._quantizable_grad_op_type = [
'%s_grad' % (op) for op in self._quantizable_op_type
]
assert self._scope != None, "scope must not be None."
assert self._place != None, "place must not be None."
self.persistable_vars = []
def apply(self, graph):
"""
Add quant_dequant before some ops, such as the 'elementwise_add' and
'pool2d' op.
Args:
graph(IrGraph): the target graph.
Returns:
None
"""
assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.'
self._is_test = graph.is_test()
dequantized_vars_map = collections.OrderedDict()
self.persistable_vars = [
p.name() for p in graph.all_persistable_nodes()
]
# Forward stage, insert quant_dequant op
all_op_nodes = graph.all_op_nodes()
for op_node in all_op_nodes:
if op_node.name() in self._quantizable_op_type:
is_skip = False
if isinstance(self._skip_pattern, list):
is_skip = op_node.op().has_attr("op_namescope") and \
any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern)
elif isinstance(self._skip_pattern, str):
is_skip = op_node.op().has_attr("op_namescope") and \
op_node.op().attr("op_namescope").find(self._skip_pattern) != -1
is_quantized = op_node.op().has_attr("quantization_type") and \
op_node.op().attr("quantization_type") == "qat_with_weight"
if is_skip or is_quantized:
continue
op_node.op()._set_attr("quantization_type",
"qat_without_weight")
arg_names = utils._get_op_input_var_names(op_node)
for arg_name in arg_names:
in_node = graph._find_node_by_name(op_node.inputs, arg_name)
if in_node.persistable():
continue
if arg_name in dequantized_vars_map:
dequant_var_node = dequantized_vars_map[arg_name]
else:
insert_quant_pass = InsertQuantizeLinear(
self._place,
self._scope,
quant_bits=self._quant_bits,
quant_axis=-1,
channel_wise=False,
is_test=self._is_test)
quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
graph, in_node)
dequant_var_node = insert_quant_pass.insert_dequant_op(
graph, quant_var_node, scale_var_node)
dequantized_vars_map[arg_name] = dequant_var_node
graph.update_input_link(in_node, dequant_var_node, op_node)
# Backward stage, update input link
for op_node in all_op_nodes:
if op_node.name() in self._quantizable_grad_op_type:
for input_name in op_node.input_arg_names():
if input_name in dequantized_vars_map:
in_node = graph._find_node_by_name(op_node.inputs,
input_name)
dequant_var_node = dequantized_vars_map[input_name]
graph.update_input_link(in_node, dequant_var_node,
op_node)
return graph
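# Illustrative effect on an activation-only op (a sketch under the default
# settings, quant_bits=8 and per-layer scales):
#
#   before:  elementwise_add(X, Y)
#   after:   elementwise_add(X.quantized.dequantized, Y.quantized.dequantized)
#
# where each non-persistable input goes through a quantize_linear /
# dequantize_linear pair; persistable inputs are left untouched (see the
# `in_node.persistable()` check above).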
class ReplaceFakeQuantDequantPass(object):
"""
Replace fake quant-dequant ops with quantize_linear and dequantize_linear ops.
"""
def __init__(self, scope, place):
r"""
Args:
scope(paddle.Scope): The scope is used to initialize these new parameters.
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new
parameters described above. If ``place`` is a string, it can be ``cpu``
or ``gpu:x``, where ``x`` is the index of the GPU.
Examples:
.. code-block:: python
# The original graph will be rewritten.
import paddle
from paddle.fluid.contrib.slim.quantization \
import ReplaceFakeQuantDequantPass
from paddle.fluid.contrib.slim.graph import IrGraph
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
place = paddle.CPUPlace()
scope = paddle.static.global_scope()
replace_pass = ReplaceFakeQuantDequantPass(scope, place)
replace_pass.apply(graph)
"""
self._place = _get_paddle_place(place)
self._scope = scope
assert self._scope != None, "scope must not be None."
assert self._place != None, "place must not be None."
def apply(self, graph):
assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.'
fake_quant_dequant_ops = []
for op in graph.all_op_nodes():
if op.name() in _fake_quant_dequant_op_list:
fake_quant_dequant_ops.append(op)
for _op in fake_quant_dequant_ops:
self._replace_op(graph, _op)
graph.safe_remove_nodes(_op)
graph.resolve_hazard()
return graph
def _replace_op(self, graph, op):
x_node = graph._find_node_by_name(op.inputs, op.input("X")[0])
out_node = graph._find_node_by_name(op.outputs, op.output("Out")[0])
scale_node = graph._find_node_by_name(op.outputs,
op.output("OutScale")[0])
quant_axis = op.op().attr("quant_axis") if op.op().has_attr(
"quant_axis") else -1
bit_length = op.op().attr("bit_length") if op.op().has_attr(
"bit_length") else 8
zero_point_node = None
quanted_node = x_node
if zero_point_node is None:
zero_point_node = graph.create_persistable_node(
name=self._zero_point_name(quanted_node.name()),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=scale_node.shape(),
var_dtype=core.VarDesc.VarType.INT32)
_init_var_node(
zero_point_node,
np.zeros(
scale_node.shape(), dtype="int32"),
self._scope,
self._place)
quant_var_node = graph.create_var_node(
name=self._quantized_var_name(x_node.name()),
var_type=x_node.type(),
shape=x_node.shape(),
var_dtype=x_node.dtype())
quant_op_node = graph.create_op_node(
op_type="quantize_linear",
attrs={"quant_axis": quant_axis,
"bit_length": bit_length},
inputs={
"X": x_node,
"Scale": scale_node,
"ZeroPoint": zero_point_node
},
outputs={"Y": quant_var_node})
graph.link_to(x_node, quant_op_node)
graph.link_to(scale_node, quant_op_node)
if zero_point_node is not None:
graph.link_to(zero_point_node, quant_op_node)
graph.link_to(quant_op_node, quant_var_node)
dequant_op_node = graph.create_op_node(
op_type="dequantize_linear",
attrs={"quant_axis": quant_axis,
"bit_length": bit_length},
inputs={
"X": quant_var_node,
"Scale": scale_node,
"ZeroPoint": zero_point_node
},
outputs={"Y": out_node})
graph.link_to(quant_var_node, dequant_op_node)
graph.link_to(scale_node, dequant_op_node)
if zero_point_node is not None:
graph.link_to(zero_point_node, dequant_op_node)
graph.link_to(dequant_op_node, out_node)
def _quantized_var_name(self, var_name):
"""
Return quantized variable name for the input `var_name`.
"""
return "%s.quantized" % (var_name)
def _zero_point_name(self, var_name):
"""
Return the zero-point name for the var named `var_name`.
"""
return "%s@zero_point" % (var_name)
class QuantWeightPass(object):
"""
Quantize the weights and remove the quantize_linear node on the weight input. For example,
`weight -> quant -> dequant -> conv2d` will be frozen into `weight -> dequant -> conv2d`,
and the weight will be scaled offline.
Args:
scope(paddle.Scope): scope is used to get the weight tensor values.
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to restore the weight tensors.
If it is a string, it can be ``cpu`` or ``gpu:x``, where ``x`` is the index of the GPU.
bias_correction(bool): whether to use bias correction (https://arxiv.org/abs/1810.05723)
for post-training quantization.
quant_bits(int, optional): quantization bit number for weight. Default is 8.
save_int_weight(bool, optional): Whether to save the weight in an integer dtype. Default is True.
Examples:
.. code-block:: python
# The original graph will be rewritten.
import paddle
from paddle.fluid.contrib.slim.quantization \
import QuantWeightPass
from paddle.fluid.contrib.slim.graph import IrGraph
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
place = paddle.CPUPlace()
scope = paddle.static.global_scope()
quant_weight_pass = QuantWeightPass(scope, place)
quant_weight_pass.apply(graph)
"""
def __init__(self,
scope,
place,
bias_correction=False,
quant_bits=8,
save_int_weight=True):
self._place = _get_paddle_place(place)
self._scope = scope
self._bias_correction = bias_correction
self._quant_bits = quant_bits
self._save_int_weight = save_int_weight
assert self._scope != None, "scope must not be None."
assert self._place != None, "place must not be None."
def apply(self, graph):
assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.'
fake_quant_ops_for_weight = []
fake_quant_ops = [
op for op in graph.all_op_nodes() if op.name() == "quantize_linear"
]
for _op in fake_quant_ops:
x_node = graph._find_node_by_name(_op.inputs, _op.input("X")[0])
if x_node.persistable():
scale_node = graph._find_node_by_name(_op.inputs,
_op.input("Scale")[0])
zero_point_node = graph._find_node_by_name(
_op.inputs, _op.input("ZeroPoint")[0])
out_node = graph._find_node_by_name(_op.outputs,
_op.output("Y")[0])
scale_v = self._load_var(scale_node.name())
assert scale_v.ndim in [1, 2
], "the dim of scale_v should be 1 or 2"
if scale_v.ndim == 2:
scale_v = scale_v[0]
if scale_v.size == 1 and _op.name() == 'abs_max':
scale_v = scale_v[0]
else:
scale_v = scale_v.tolist()
param_v = self._load_var(x_node.name())
quant_axis = _op.op().attr("quant_axis")
bits_length = _op.op().attr("bit_length")
quantized_param_v = utils.quant_tensor(param_v.copy(), scale_v,
quant_axis, bits_length)
if self._bias_correction == True:
quantized_param_v = utils.bias_correction_w(
param_v,
quantized_param_v,
scale_v,
quant_axis,
weight_bits=bits_length)
if self._save_int_weight:
# cast weight type to int
if self._quant_bits == 8:
save_weight_dtype = np.int8
quantized_param_v = quantized_param_v.astype(
save_weight_dtype)
self._restore_var(x_node.name(), quantized_param_v)
for next_op_node in out_node.outputs:
graph.update_input_link(out_node, x_node, next_op_node)
graph.safe_remove_nodes(out_node)
self._remove_unused_var_nodes(graph)
def _remove_unused_var_nodes(self, graph):
all_used_vars = set()
ops = graph.all_op_nodes()
for op_node in ops:
for input_node in op_node.inputs:
all_used_vars.add(input_node)
for output_node in op_node.outputs:
all_used_vars.add(output_node)
all_used_vars = {n.node for n in all_used_vars}
all_unused_vars = {
n
for n in filter(lambda node: node.node not in all_used_vars,
graph.all_var_nodes())
}
graph.safe_remove_nodes(all_unused_vars)
def _load_var(self, name):
return np.array(self._scope.find_var(name).get_tensor())
def _restore_var(self, name, array):
tensor = self._scope.find_var(name).get_tensor()
tensor.set(array, self._place)
...@@ -13,11 +13,292 @@ ...@@ -13,11 +13,292 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
from ....framework import IrNode
from ....framework import Operator
_weight_supported_quantizable_op_type = [
'conv2d', 'depthwise_conv2d', 'conv2d_transpose', 'mul', 'matmul',
'matmul_v2'
]
_act_supported_quantizable_op_type = [
"pool2d",
"elementwise_add",
"concat",
"softmax",
"argmax",
"transpose",
"equal",
"gather",
"greater_equal",
"greater_than",
"less_equal",
"less_than",
"mean",
"not_equal",
"reshape",
"reshape2",
"dropout",
"bilinear_interp",
"nearest_interp",
"trilinear_interp",
"slice",
"squeeze",
"elementwise_sub",
"mul",
"matmul",
"relu",
"relu6",
"leaky_relu",
"tanh",
"swish",
"scale",
"transpose",
"transpose2",
"sigmoid",
"pad2d",
"flatten",
"flatten2",
"batch_norm",
"layer_norm",
"matmul_v2",
"split",
"flatten_contiguous_range",
"squeeze2",
"nearest_interp_v2",
"bilinear_interp",
"bilinear_interp_v2",
"fill_constant_batch_size_like",
"arg_max",
"abs",
"assign",
"cast",
"clip",
"box_coder",
"crop",
"cumsum",
"elementwise_mul",
"elementwise_pow",
"expand_v2",
"fill_any_like",
"fill_constant",
"gelu",
"hard_sigmoid",
"hard_swish",
"instance_norm",
"lookup_table",
"lookup_table_v2",
"norm",
"p_norm",
"pad3d",
"pow",
"prelu",
"reduce_mean",
"unsqueeze",
"unsqueeze2",
"logical_and",
"logical_not",
"meshgrid",
"roi_align",
"strided_slice",
"where",
"grid_sampler",
"tile",
"group_norm",
"reduce_sum",
"square",
"softplus",
"shuffle_channel",
]
_out_scale_op_list = list(
set(_weight_supported_quantizable_op_type +
_act_supported_quantizable_op_type))
_channelwise_quant_axis1_ops = [ _channelwise_quant_axis1_ops = [
'conv2d_transpose', 'mul', 'matmul', 'matmul_v2' 'conv2d_transpose', 'mul', 'matmul', 'matmul_v2'
] ]
# list op real input and output names, to avoid processing input such as AxisTensor.
_op_real_in_out_name = {
"conv2d": [["Input", "Filter"], ["Output"]],
"depthwise_conv2d": [["Input", "Filter"], ["Output"]],
"conv2d_transpose": [["Input", "Filter"], ["Output"]],
"mul": [["X", "Y"], ["Out"]],
"matmul": [["X", "Y"], ["Out"]],
"matmul_v2": [["X", "Y"], ["Out"]],
"pool2d": [["X"], ["Out"]],
"elementwise_add": [["X", "Y"], ["Out"]],
"concat": [["X"], ["Out"]],
"softmax": [["X"], ["Out"]],
"argmax": [["X"], ["Out"]],
"transpose": [["X"], ["Out"]],
"equal": [["X", "Y"], ["Out"]],
"gather": [["X"], ["Out"]],
"greater_equal": [["X", "Y"], ["Out"]],
"greater_than": [["X", "Y"], ["Out"]],
"less_equal": [["X", "Y"], ["Out"]],
"less_than": [["X", "Y"], ["Out"]],
"mean": [["X"], ["Out"]],
"not_equal": [["X", "Y"], ["Out"]],
"reshape": [["X"], ["Out"]],
"reshape2": [["X"], ["Out"]],
"transpose2": [["X"], ["Out"]],
"bilinear_interp": [["X"], ["Out"]],
"nearest_interp": [["X"], ["Out"]],
"trilinear_interp": [["X"], ["Out"]],
"slice": [["Input"], ["Out"]],
"squeeze": [["X"], ["Out"]],
"elementwise_sub": [["X", "Y"], ["Out"]],
"relu": [["X"], ["Out"]],
"relu6": [["X"], ["Out"]],
"leaky_relu": [["X"], ["Out"]],
"prelu": [["X", "Alpha"], ["Out"]],
"tanh": [["X"], ["Out"]],
"swish": [["X"], ["Out"]],
"dropout": [["X"], ["Out"]],
"batch_norm": [["X"], ["Y"]],
"layer_norm": [["X"], ["Y"]],
"sigmoid": [["X"], ["Out"]],
"elementwise_mul": [["X", "Y"], ["Out"]],
"elementwise_pow": [["X", "Y"], ["Out"]],
"scale": [["X"], ["Out"]],
"hard_swish": [["X"], ["Out"]],
"hard_sigmoid": [["X"], ["Out"]],
"gru": [["Input", "Weight"], ["Hidden"]],
"lstm": [["Input", "Weight"], ["Hidden"]],
"pad2d": [["X"], ["Out"]],
"pad3d": [["X"], ["Out"]],
"flatten": [["X"], ["Out"]],
"flatten2": [["X"], ["Out"]],
"unsqueeze2": [["X"], ["Out"]],
"unsqueeze2": [["X"], ["Out"]],
"flatten_contiguous_range": [["X"], ["Out"]],
"split": [["X"], ["Out"]],
"squeeze2": [["X"], ["Out"]],
"nearest_interp_v2": [["X"], ["Out"]],
"bilinear_interp": [["X"], ["Out"]],
"bilinear_interp_v2": [["X"], ["Out"]],
"fill_constant_batch_size_like": [["Input"], ["Out"]],
"arg_max": [["X"], ["Out"]],
"abs": [["X"], ["Out"]],
"assign": [["X"], ["Out"]],
"cast": [["X"], ["Out"]],
"clip": [["X"], ["Out"]],
"box_coder": [["PriorBox"], ["OutputBox"]],
"crop": [["X"], ["Out"]],
"cumsum": [["X"], ["Out"]],
"expand_v2": [["X"], ["Out"]],
"fill_any_like": [["X"], ["Out"]],
"fill_constant": [[], ["Out"]],
"gelu": [["X"], ["Out"]],
"instance_norm": [["X"], ["Out"]],
"lookup_table": [["W", "Ids"], ["Out"]],
"lookup_table_v2": [["W", "Ids"], ["Out"]],
"norm": [["X"], ["Norm"]],
"p_norm": [["X"], ["Out"]],
"pow": [["X"], ["Out"]],
"reduce_mean": [["X"], ["Out"]],
"stack": [["X"], ["Y"]],
"top_k_v2": [["X"], ["Out", "Indices"]],
"logical_and": [["X", "Y"], ["Out"]],
"logical_not": [["X"], ["Out"]],
"meshgrid": [["X"], ["Out"]],
"roi_align": [["X", "ROIs"], ["Out"]],
"strided_slice": [["Input"], ["Out"]],
"where": [["Condition", "X", "Y"], ["Out"]],
"grid_sampler": [["X", "Grid"], ["Output"]],
"tile": [["X"], ["Out"]],
"group_norm": [["X"], ["Y", "Mean", "Variance"]],
"reduce_sum": [["X"], ["Out"]],
"square": [["X"], ["Out"]],
"softplus": [["X"], ["Out"]],
"shuffle_channel": [["X"], ["Out"]],
}
def _get_op_input_var_names(op):
"""
Get the input var names of the op.
Args:
op(IrNode, Operator): the input op.
Returns:
input_var_names or None.
"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
var_names = []
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return []
name_list = _op_real_in_out_name[op_name][0]
for name in name_list:
var_name = op.input(name)
if isinstance(var_name, list):
var_names.extend(var_name)
else:
var_names.append(var_name)
return var_names
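# Hypothetical usage sketch (op and variable names are made up for
# illustration; the argument names follow _op_real_in_out_name above):
# for a conv2d IrNode, _get_op_input_var_names returns the variables fed to
# "Input" and "Filter", e.g. ['conv2d_0.tmp_input', 'conv2d_0.w_0'], while
# auxiliary inputs (such as AxisTensor-style arguments of other ops) are ignored.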
def _get_op_output_var_names(op):
""" """
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
var_names = []
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return []
name_list = _op_real_in_out_name[op_name][1]
for name in name_list:
var_name = op.output(name)
if isinstance(var_name, list):
var_names.extend(var_name)
else:
var_names.append(var_name)
return var_names
def _get_input_name_index(op, input_var_name):
"""Get the input name and index of the var_name in the op"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return None
res = None
for argname in _op_real_in_out_name[op_name][0]:
var_names = op.input(argname)
for index, name in enumerate(var_names):
if name == input_var_name:
res = (argname, index)
return res
def _get_output_name_index(op, output_var_name):
"""Get the output name and index of the var_name in the op"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return None
name_list = _op_real_in_out_name[op_name][1]
res = None
for name in name_list:
var_name = op.output(name)
for index, val in enumerate(var_name):
if val == output_var_name:
res = (name, index)
return res
def load_variable_data(scope, var_name): def load_variable_data(scope, var_name):
''' '''
...@@ -84,6 +365,46 @@ def dequant_tensor(x, scale, quant_axis=0, weight_bits=8): ...@@ -84,6 +365,46 @@ def dequant_tensor(x, scale, quant_axis=0, weight_bits=8):
return x return x
def bias_correction_w(x, x_quant, scale_v, quant_axis, weight_bits=8):
'''
Bias correction for weight
'''
eps = 1e-8
bnt = (1 << (weight_bits - 1)) - 1
x_dequant = x_quant.copy()
if isinstance(scale_v, list):
if quant_axis == 0:
for i, s in enumerate(scale_v):
x_dequant[i] = x_dequant[i] * s / bnt
quant_bias = x - x_dequant
mean_bias = quant_bias.reshape(quant_bias.shape[0], -1).mean(-1)
std_orig = x.reshape(x.shape[0], -1).std(-1)
std_quant = x_dequant.reshape(x_dequant.shape[0], -1).std(-1)
std_bias = std_orig / (std_quant + eps)
else:
for i, s in enumerate(scale_v):
x_dequant[:, i] = x_quant[:, i] * s / bnt
quant_bias = x - x_dequant
mean_bias = np.array(
[quant_bias[:, i].mean() for i in range(quant_bias.shape[1])])
std_orig = np.array([x[:, i].std() for i in range(x.shape[1])])
std_quant = np.array(
[x_dequant[:, i].std() for i in range(x_dequant.shape[1])])
std_bias = std_orig / (std_quant + eps)
else:
x_dequant = x_quant * scale_v / bnt
mean_bias = (x - x_dequant).mean()
std_bias = x.std() / (x_dequant.std() + eps)
if mean_bias.ndim == 1:
std_bias = np.resize(std_bias, x.shape)
mean_bias = np.resize(mean_bias, x.shape)
x_dequant = (mean_bias + x_dequant) * std_bias
quantized_param_v = quant_tensor(x_dequant, scale_v, quant_axis,
weight_bits)
return quantized_param_v
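# Sketch of the correction applied above (per output channel when scale_v is a
# list, per tensor otherwise), following https://arxiv.org/abs/1810.05723:
#   mean_bias   = mean(W - W_dequant)
#   std_bias    = std(W) / (std(W_dequant) + eps)
#   W_corrected = (mean_bias + W_dequant) * std_bias
# and W_corrected is then re-quantized with quant_tensor before being returned.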
def stable_sigmoid(x): def stable_sigmoid(x):
sig = np.where(x < 0, np.exp(x) / (1 + np.exp(x)), 1 / (1 + np.exp(-x))) sig = np.where(x < 0, np.exp(x) / (1 + np.exp(x)), 1 / (1 + np.exp(-x)))
return sig return sig
......
...@@ -53,7 +53,9 @@ class TestImperativeQat(unittest.TestCase): ...@@ -53,7 +53,9 @@ class TestImperativeQat(unittest.TestCase):
def set_vars(self): def set_vars(self):
self.weight_quantize_type = 'abs_max' self.weight_quantize_type = 'abs_max'
self.activation_quantize_type = 'moving_average_abs_max' self.activation_quantize_type = 'moving_average_abs_max'
print('weight_quantize_type', self.weight_quantize_type) self.onnx_format = False
self.check_export_model_accuracy = True
self.diff_threshold = 0.01
def func_qat(self): def func_qat(self):
self.set_vars() self.set_vars()
...@@ -159,9 +161,13 @@ class TestImperativeQat(unittest.TestCase): ...@@ -159,9 +161,13 @@ class TestImperativeQat(unittest.TestCase):
data = next(test_reader()) data = next(test_reader())
test_data = np.array([x[0].reshape(1, 28, 28) test_data = np.array([x[0].reshape(1, 28, 28)
for x in data]).astype('float32') for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(-1, 1)
test_img = fluid.dygraph.to_variable(test_data) test_img = fluid.dygraph.to_variable(test_data)
label = fluid.dygraph.to_variable(y_data)
lenet.eval() lenet.eval()
before_save = lenet(test_img) fp32_out = lenet(test_img)
fp32_acc = fluid.layers.accuracy(fp32_out, label).numpy()
with tempfile.TemporaryDirectory(prefix="qat_save_path_") as tmpdir: with tempfile.TemporaryDirectory(prefix="qat_save_path_") as tmpdir:
# save inference quantized model # save inference quantized model
...@@ -171,7 +177,8 @@ class TestImperativeQat(unittest.TestCase): ...@@ -171,7 +177,8 @@ class TestImperativeQat(unittest.TestCase):
input_spec=[ input_spec=[
paddle.static.InputSpec( paddle.static.InputSpec(
shape=[None, 1, 28, 28], dtype='float32') shape=[None, 1, 28, 28], dtype='float32')
]) ],
onnx_format=self.onnx_format)
print('Quantized model saved in %s' % tmpdir) print('Quantized model saved in %s' % tmpdir)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
...@@ -185,13 +192,15 @@ class TestImperativeQat(unittest.TestCase): ...@@ -185,13 +192,15 @@ class TestImperativeQat(unittest.TestCase):
executor=exe, executor=exe,
model_filename="lenet" + INFER_MODEL_SUFFIX, model_filename="lenet" + INFER_MODEL_SUFFIX,
params_filename="lenet" + INFER_PARAMS_SUFFIX) params_filename="lenet" + INFER_PARAMS_SUFFIX)
after_save, = exe.run(inference_program, quant_out, = exe.run(inference_program,
feed={feed_target_names[0]: test_data}, feed={feed_target_names[0]: test_data},
fetch_list=fetch_targets) fetch_list=fetch_targets)
# check paddle.disable_static()
self.assertTrue( quant_out = fluid.dygraph.to_variable(quant_out)
np.allclose(after_save, before_save.numpy()), quant_acc = fluid.layers.accuracy(quant_out, label).numpy()
msg='Failed to save the inference quantized model.') paddle.enable_static()
delta_value = fp32_acc - quant_acc
self.assertLess(delta_value, self.diff_threshold)
def test_qat(self): def test_qat(self):
with _test_eager_guard(): with _test_eager_guard():
...@@ -199,5 +208,13 @@ class TestImperativeQat(unittest.TestCase): ...@@ -199,5 +208,13 @@ class TestImperativeQat(unittest.TestCase):
self.func_qat() self.func_qat()
class TestImperativeQatONNXFormat(TestImperativeQat):
def set_vars(self):
self.weight_quantize_type = 'abs_max'
self.activation_quantize_type = 'moving_average_abs_max'
self.onnx_format = True
self.diff_threshold = 0.025
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -41,6 +41,17 @@ class TestImperativeQatChannelWise(TestImperativeQat): ...@@ -41,6 +41,17 @@ class TestImperativeQatChannelWise(TestImperativeQat):
def set_vars(self): def set_vars(self):
self.weight_quantize_type = 'channel_wise_abs_max' self.weight_quantize_type = 'channel_wise_abs_max'
self.activation_quantize_type = 'moving_average_abs_max' self.activation_quantize_type = 'moving_average_abs_max'
self.diff_threshold = 0.01
self.onnx_format = False
print('weight_quantize_type', self.weight_quantize_type)
class TestImperativeQatChannelWiseONNXFormat(TestImperativeQat):
def set_vars(self):
self.weight_quantize_type = 'channel_wise_abs_max'
self.activation_quantize_type = 'moving_average_abs_max'
self.onnx_format = True
self.diff_threshold = 0.025
print('weight_quantize_type', self.weight_quantize_type) print('weight_quantize_type', self.weight_quantize_type)
......
...@@ -173,7 +173,8 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -173,7 +173,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
is_use_cache_file=False, is_use_cache_file=False,
is_optimize_model=False, is_optimize_model=False,
batch_size=10, batch_size=10,
batch_nums=10): batch_nums=10,
onnx_format=False):
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -190,14 +191,28 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -190,14 +191,28 @@ class TestPostTrainingQuantization(unittest.TestCase):
round_type=round_type, round_type=round_type,
is_full_quantize=is_full_quantize, is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model, optimize_model=is_optimize_model,
onnx_format=onnx_format,
is_use_cache_file=is_use_cache_file) is_use_cache_file=is_use_cache_file)
ptq.quantize() ptq.quantize()
ptq.save_quantized_model(self.int8_model_path) ptq.save_quantized_model(self.int8_model_path)
def run_test(self, model_name, model_url, model_md5, data_name, data_url, def run_test(self,
data_md5, algo, round_type, quantizable_op_type, model_name,
is_full_quantize, is_use_cache_file, is_optimize_model, model_url,
diff_threshold, infer_iterations, quant_iterations): model_md5,
data_name,
data_url,
data_md5,
algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
infer_iterations,
quant_iterations,
onnx_format=False):
fp32_model_path = self.download_model(model_url, model_md5, model_name) fp32_model_path = self.download_model(model_url, model_md5, model_name)
fp32_model_path = os.path.join(fp32_model_path, model_name) fp32_model_path = os.path.join(fp32_model_path, model_name)
...@@ -211,10 +226,10 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -211,10 +226,10 @@ class TestPostTrainingQuantization(unittest.TestCase):
print("Start post training quantization for {0} on {1} samples ...". print("Start post training quantization for {0} on {1} samples ...".
format(model_name, quant_iterations)) format(model_name, quant_iterations))
self.generate_quantized_model(fp32_model_path, data_path, algo, self.generate_quantized_model(
round_type, quantizable_op_type, fp32_model_path, data_path, algo, round_type, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_full_quantize, is_use_cache_file, is_optimize_model,
is_optimize_model, quant_iterations) quant_iterations, onnx_format)
print("Start INT8 inference for {0} on {1} samples ...".format( print("Start INT8 inference for {0} on {1} samples ...".format(
model_name, infer_iterations)) model_name, infer_iterations))
...@@ -278,5 +293,42 @@ class TestPostTrainingKLForMnistAdaround(TestPostTrainingQuantization): ...@@ -278,5 +293,42 @@ class TestPostTrainingKLForMnistAdaround(TestPostTrainingQuantization):
diff_threshold, infer_iterations, quant_iterations) diff_threshold, infer_iterations, quant_iterations)
class TestPostTrainingKLForMnistONNXFormat(TestPostTrainingQuantization):
def test_post_training_kl_onnx_format(self):
model_name = "nlp_lstm_fp32_model"
model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz"
model_md5 = "519b8eeac756e7b4b7bcb2868e880452"
data_name = "quant_lstm_input_data"
data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz"
data_md5 = "add84c754e9b792fea1fbd728d134ab7"
algo = "KL"
round_type = "round"
quantizable_op_type = ["mul", "lstm"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = False
diff_threshold = 0.01
infer_iterations = 100
quant_iterations = 10
onnx_format = True
self.run_test(
model_name,
model_url,
model_md5,
data_name,
data_url,
data_md5,
algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
infer_iterations,
quant_iterations,
onnx_format=onnx_format)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -116,7 +116,8 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -116,7 +116,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
is_use_cache_file=False, is_use_cache_file=False,
is_optimize_model=False, is_optimize_model=False,
batch_size=10, batch_size=10,
batch_nums=10): batch_nums=10,
onnx_format=False):
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -134,6 +135,7 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -134,6 +135,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
round_type=round_type, round_type=round_type,
is_full_quantize=is_full_quantize, is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model, optimize_model=is_optimize_model,
onnx_format=onnx_format,
is_use_cache_file=is_use_cache_file) is_use_cache_file=is_use_cache_file)
ptq.quantize() ptq.quantize()
ptq.save_quantized_model(self.int8_model_path) ptq.save_quantized_model(self.int8_model_path)
...@@ -151,7 +153,8 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -151,7 +153,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
diff_threshold, diff_threshold,
batch_size=10, batch_size=10,
infer_iterations=10, infer_iterations=10,
quant_iterations=5): quant_iterations=5,
onnx_format=False):
origin_model_path = self.download_model(data_url, data_md5, model_name) origin_model_path = self.download_model(data_url, data_md5, model_name)
origin_model_path = os.path.join(origin_model_path, model_name) origin_model_path = os.path.join(origin_model_path, model_name)
...@@ -166,7 +169,7 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -166,7 +169,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
self.generate_quantized_model(origin_model_path, algo, round_type, self.generate_quantized_model(origin_model_path, algo, round_type,
quantizable_op_type, is_full_quantize, quantizable_op_type, is_full_quantize,
is_use_cache_file, is_optimize_model, is_use_cache_file, is_optimize_model,
batch_size, quant_iterations) batch_size, quant_iterations, onnx_format)
print("Start INT8 inference for {0} on {1} images ...".format( print("Start INT8 inference for {0} on {1} images ...".format(
model_name, infer_iterations * batch_size)) model_name, infer_iterations * batch_size))
...@@ -335,5 +338,72 @@ class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization): ...@@ -335,5 +338,72 @@ class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization):
infer_iterations, quant_iterations) infer_iterations, quant_iterations)
class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization):
def test_post_training_mse_onnx_format(self):
model_name = "mnist_model"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "mse"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
onnx_format = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(
model_name,
data_url,
data_md5,
algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
batch_size,
infer_iterations,
quant_iterations,
onnx_format=onnx_format)
class TestPostTrainingmseForMnistONNXFormatFullQuant(
TestPostTrainingQuantization):
def test_post_training_mse_onnx_format_full_quant(self):
model_name = "mnist_model"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "mse"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = True
is_use_cache_file = False
is_optimize_model = False
onnx_format = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(
model_name,
data_url,
data_md5,
algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
batch_size,
infer_iterations,
quant_iterations,
onnx_format=onnx_format)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -243,7 +243,8 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -243,7 +243,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
round_type="round", round_type="round",
is_full_quantize=False, is_full_quantize=False,
is_use_cache_file=False, is_use_cache_file=False,
is_optimize_model=False): is_optimize_model=False,
onnx_format=False):
try: try:
os.system("mkdir " + self.int8_model) os.system("mkdir " + self.int8_model)
except Exception as e: except Exception as e:
...@@ -265,13 +266,23 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -265,13 +266,23 @@ class TestPostTrainingQuantization(unittest.TestCase):
round_type=round_type, round_type=round_type,
is_full_quantize=is_full_quantize, is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model, optimize_model=is_optimize_model,
onnx_format=onnx_format,
is_use_cache_file=is_use_cache_file) is_use_cache_file=is_use_cache_file)
ptq.quantize() ptq.quantize()
ptq.save_quantized_model(self.int8_model) ptq.save_quantized_model(self.int8_model)
def run_test(self, model, algo, round_type, data_urls, data_md5s, def run_test(self,
quantizable_op_type, is_full_quantize, is_use_cache_file, model,
is_optimize_model, diff_threshold): algo,
round_type,
data_urls,
data_md5s,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
onnx_format=False):
infer_iterations = self.infer_iterations infer_iterations = self.infer_iterations
batch_size = self.batch_size batch_size = self.batch_size
sample_iterations = self.sample_iterations sample_iterations = self.sample_iterations
...@@ -285,9 +296,10 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -285,9 +296,10 @@ class TestPostTrainingQuantization(unittest.TestCase):
print("Start INT8 post training quantization for {0} on {1} images ...". print("Start INT8 post training quantization for {0} on {1} images ...".
format(model, sample_iterations * batch_size)) format(model, sample_iterations * batch_size))
self.generate_quantized_model( self.generate_quantized_model(model_cache_folder + "/model",
model_cache_folder + "/model", quantizable_op_type, algo, quantizable_op_type, algo, round_type,
round_type, is_full_quantize, is_use_cache_file, is_optimize_model) is_full_quantize, is_use_cache_file,
is_optimize_model, onnx_format)
print("Start INT8 inference for {0} on {1} images ...".format( print("Start INT8 inference for {0} on {1} images ...".format(
model, infer_iterations * batch_size)) model, infer_iterations * batch_size))
...@@ -517,5 +529,38 @@ class TestPostTrainingEMDForMobilenetv1(TestPostTrainingQuantization): ...@@ -517,5 +529,38 @@ class TestPostTrainingEMDForMobilenetv1(TestPostTrainingQuantization):
is_optimize_model, diff_threshold) is_optimize_model, diff_threshold)
class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization):
def test_post_training_onnx_format_mobilenetv1(self):
model = "MobileNet-V1"
algo = "avg"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
]
data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
quantizable_op_type = [
"conv2d",
"depthwise_conv2d",
"mul",
]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
onnx_format = True
diff_threshold = 0.05
self.run_test(
model,
algo,
round_type,
data_urls,
data_md5s,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
onnx_format=onnx_format)
if __name__ == '__main__':
unittest.main()
@@ -39,5 +39,34 @@ class TestPostTrainingForResnet50(TestPostTrainingQuantization):
is_optimize_model, diff_threshold)
class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingQuantization):
def test_post_training_resnet50(self):
model = "ResNet-50"
algo = "min_max"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
]
data_md5s = ['4a5194524823d9b76da6e738e1367881']
quantizable_op_type = ["conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = False
diff_threshold = 0.025
onnx_format = True
self.run_test(
model,
algo,
round_type,
data_urls,
data_md5s,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
onnx_format=onnx_format)
if __name__ == '__main__':
unittest.main()
@@ -21,6 +21,7 @@ import six
import paddle
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPassV2
from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
@@ -686,5 +687,129 @@ class TestAddQuantDequantPass(unittest.TestCase):
for_ci=True)
class TestQuantizationTransformPassV2(unittest.TestCase):
def setUp(self):
self.quantizable_op_and_inputs = {
'conv2d': ['Input', 'Filter'],
'depthwise_conv2d': ['Input', 'Filter'],
'mul': ['X', 'Y']
}
self.quantizable_grad_op_inputs = {
'conv2d_grad': ['Input', 'Filter'],
'depthwise_conv2d_grad': ['Input', 'Filter'],
'mul_grad': ['X', 'Y']
}
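# After the V2 transform pass runs, every input of the quantizable forward ops
# (and of their grad ops) should be the '.quantized.dequantized' version of the
# original variable; check_program below asserts exactly that.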
def check_program(self, program):
quantized_ops = set()
for block in program.blocks:
for op in block.ops:
# check forward
if op.type in self.quantizable_op_and_inputs:
for arg_name in op.input_arg_names:
self.assertTrue(
arg_name.endswith('.quantized.dequantized'))
quantized_ops.add(arg_name)
for op in block.ops:
# check backward
if op.type in self.quantizable_grad_op_inputs:
for pname in self.quantizable_grad_op_inputs[op.type]:
arg_name = op.input(pname)[0]
self.assertTrue(
arg_name.endswith('.quantized.dequantized'))
self.assertTrue(arg_name in quantized_ops)
def linear_fc_quant(self,
activation_quant_type,
weight_quantize_type,
for_ci=True):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = linear_fc(3)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
place = fluid.CPUPlace()
graph = IrGraph(core.Graph(main.desc), for_test=False)
transform_pass = QuantizationTransformPassV2(
scope=fluid.global_scope(),
place=place,
activation_quantize_type=activation_quant_type,
weight_quantize_type=weight_quantize_type)
transform_pass.apply(graph)
if not for_ci:
marked_nodes = set()
for op in graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
graph.draw('.', 'quantize_fc_' + activation_quant_type,
marked_nodes)
program = graph.to_program()
self.check_program(program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False)
if not for_ci:
val_marked_nodes = set()
for op in val_graph.all_op_nodes():
if op.name().find('quantize') > -1:
val_marked_nodes.add(op)
val_graph.draw('.', 'val_fc_' + activation_quant_type,
val_marked_nodes)
def test_linear_fc_quant_abs_max(self):
self.linear_fc_quant('abs_max', 'abs_max', for_ci=True)
def test_linear_fc_quant_channel_wise_abs_max(self):
self.linear_fc_quant('abs_max', 'channel_wise_abs_max', for_ci=True)
def residual_block_quant(self,
activation_quant_type,
weight_quantize_type,
quantizable_op_type,
for_ci=True):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = residual_block(2)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
place = fluid.CPUPlace()
graph = IrGraph(core.Graph(main.desc), for_test=False)
transform_pass = QuantizationTransformPassV2(
scope=fluid.global_scope(),
place=place,
activation_quantize_type=activation_quant_type,
weight_quantize_type=weight_quantize_type,
quantizable_op_type=quantizable_op_type)
transform_pass.apply(graph)
if not for_ci:
marked_nodes = set()
for op in graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
graph.draw('.', 'quantize_residual_' + activation_quant_type,
marked_nodes)
program = graph.to_program()
self.check_program(program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False)
if not for_ci:
val_marked_nodes = set()
for op in val_graph.all_op_nodes():
if op.name().find('quantize') > -1:
val_marked_nodes.add(op)
val_graph.draw('.', 'val_residual_' + activation_quant_type,
val_marked_nodes)
def test_residual_block_abs_max(self):
quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul', 'matmul']
self.residual_block_quant(
'abs_max', 'abs_max', quantizable_op_type, for_ci=True)
def test_residual_block_channel_wise_abs_max(self):
quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul', 'matmul']
self.residual_block_quant(
'abs_max', 'channel_wise_abs_max', quantizable_op_type, for_ci=True)
if __name__ == '__main__':
unittest.main()
@@ -172,5 +172,83 @@ class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp):
self.data_type = "float32"
class TestChannelWiseDequantizeOp(OpTest):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 0
def setUp(self):
self.set_args()
self.op_type = "dequantize_linear"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scale = channel_wise_quantize_max_abs(x, self.bit_length,
self.quant_axis)
ydq = channel_wise_dequantize_max_abs(yq, scale, self.bit_length,
self.quant_axis)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': yq, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis
}
self.outputs = {'Y': ydq}
def test_check_output(self):
self.check_output()
class TestChannelWiseDequantizeOp1(TestChannelWiseDequantizeOp):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 1
class TestDequantizeOp(OpTest):
def set_args(self):
self.bit_length = 8
self.quant_axis = -1
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float32"
def setUp(self):
self.set_args()
self.op_type = "dequantize_linear"
x = np.random.randn(31, 65).astype(self.data_type)
yq, scale = quantize_max_abs(x, self.max_range)
ydq = dequantize_max_abs(yq, scale, self.max_range)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': yq, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis
}
self.outputs = {'Y': ydq}
def test_check_output(self):
self.check_output()
class TestDequantizeOpDouble(TestDequantizeOp):
def set_args(self):
self.bit_length = 8
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float64"
self.quant_axis = -1
class TestDequantizeOp5Bits(TestDequantizeOp):
def set_args(self):
self.bit_length = 5
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float32"
self.quant_axis = -1
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import math
from op_test import OpTest
import paddle.fluid.core as core
@@ -374,5 +375,144 @@ class TestChannelWiseFakeQuantDequantOp3(TestChannelWiseFakeQuantDequantOp):
self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
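# Numpy reference implementations of per-tensor and per-channel abs-max
# quantization; the quantize_linear tests below compare the op output against
# these.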
def quantize_max_abs(x, max_range):
scale = np.max(np.abs(x).flatten())
y = np.round(x / scale * max_range)
return y, scale
def channel_wise_quantize_max_abs(x, quant_bit=8, quant_axis=0):
assert quant_axis in [0, 1], "The quant_axis should be 0 or 1."
scales = []
y = x.copy()
max_range = math.pow(2, quant_bit - 1) - 1
if quant_axis == 0:
for i in range(x.shape[0]):
scale = np.max(np.abs(x[i])).astype("float32")
scales.append(scale)
y[i] = np.round(x[i] * max_range / scale)
elif quant_axis == 1:
for i in range(x.shape[1]):
scale = np.max(np.abs(x[:, i])).astype("float32")
scales.append(scale)
y[:, i] = np.round(x[:, i] * max_range / scale)
return y, scales
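# A small worked example of the reference path above (illustrative only, not
# used by the tests): for x = [-0.5, 0.25, 1.0] and 8 bits,
# max_range = 2**7 - 1 = 127 and scale = max(|x|) = 1.0, so
# quantize_max_abs(x, 127) returns y = [-64., 32., 127.] with scale 1.0;
# y * scale / 127 then recovers x up to rounding error.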
class TestChannelWiseQuantizeOp(OpTest):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 0
def setUp(self):
self.set_args()
self.op_type = "quantize_linear"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scale = channel_wise_quantize_max_abs(x, self.bit_length,
self.quant_axis)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis
}
self.outputs = {'Y': yq}
def test_check_output(self):
self.check_output()
class TestChannelWiseQuantizeOp1(TestChannelWiseQuantizeOp):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 1
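# With is_test=False (the training-time path), quantize_linear is additionally
# expected to produce the OutScale output, as checked below.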
class TestChannelWiseQuantizeOpTrain(OpTest):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 0
self.is_test = False
def setUp(self):
self.set_args()
self.op_type = "quantize_linear"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scale = channel_wise_quantize_max_abs(x, self.bit_length,
self.quant_axis)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis,
'is_test': self.is_test
}
self.outputs = {'Y': yq, 'OutScale': scale}
def test_check_output(self):
self.check_output()
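# Per-tensor (quant_axis=-1) quantize_linear: a single scale and zero point
# cover the whole input tensor.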
class TestquantizeOp(OpTest):
def set_args(self):
self.bit_length = 8
self.quant_axis = -1
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float32"
def setUp(self):
self.set_args()
self.op_type = "quantize_linear"
x = np.random.randn(31, 65).astype(self.data_type)
yq, scale = quantize_max_abs(x, self.max_range)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis,
}
self.outputs = {'Y': yq}
def test_check_output(self):
self.check_output()
class TestquantizeOpTrain(TestquantizeOp):
def set_args(self):
self.bit_length = 8
self.quant_axis = -1
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float32"
self.is_test = False
def setUp(self):
self.set_args()
self.op_type = "quantize_linear"
x = np.random.randn(31, 65).astype(self.data_type)
yq, scale = quantize_max_abs(x, self.max_range)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis,
'is_test': self.is_test
}
self.outputs = {'Y': yq, 'OutScale': scale}
def test_check_output(self):
self.check_output()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()