Unverified commit b72a7ebb authored by Guanghua Yu and committed by GitHub

add new format of quantization (#41041)

Parent b9ee846e
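For orientation, the new format replaces the fake_quantize_* / fake_dequantize_* op pairs with explicit quantize_linear and dequantize_linear operators. The sketch below is a minimal NumPy illustration of the symmetric linear quant/dequant round trip those ops implement for the per-tensor case (bit_length = 8); the function names are illustrative, not the operator API.

import numpy as np

def quantize_linear(x, scale, bit_length=8):
    # bin_cnt = 2^(bit_length - 1) - 1, i.e. 127 for int8
    bin_cnt = 2 ** (bit_length - 1) - 1
    clipped = np.clip(x, -scale, scale)
    return np.round(clipped * bin_cnt / scale)

def dequantize_linear(q, scale, bit_length=8):
    max_range = 2 ** (bit_length - 1) - 1
    return q * scale / max_range

x = np.array([0.05, -1.2, 0.7], dtype=np.float32)
scale = np.abs(x).max()              # abs-max scale, as FindAbsMaxFunctor computes
q = quantize_linear(x, scale)        # integer levels in [-127, 127]
x_rec = dequantize_linear(q, scale)  # approximate reconstruction of x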
@@ -102,10 +102,11 @@ endif()
set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel)
register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
op_library(quantize_linear_op DEPS cast_kernel)
op_library(save_combine_op DEPS string_array)
op_library(load_combine_op DEPS string_array)
......
@@ -12,142 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.cu.h"
#include "paddle/fluid/operators/fake_dequantize_op.h" #include "paddle/fluid/operators/fake_dequantize_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num,
T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) {
out[idx] = in[idx] * scale[0] / max_range;
}
}
template <typename T>
struct DequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
const T* in_data = in->data<T>();
const T* scale_factor = scale->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = in->numel();
int block = 512;
int grid = (num + block - 1) / block;
KeDequantize<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, out_data);
}
};
template <typename T>
__global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale,
T max_range, int num, int channel,
T* out) {
int tid = threadIdx.x;
int channel_size = num / channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale[blockIdx.x] / max_range;
}
}
template <typename T>
__global__ void DequantizeOneScaleQuantAxisN(const T* in, const T* scale,
const T max_range,
const int64_t num,
const int n_scales,
const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % n_scales];
out[i] = in[i] * s / max_range;
}
}
template <typename T>
__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
const T* scale_two, T max_range, int num,
int iter_size, int channel, T* out) {
int tid = threadIdx.x;
int channel_size = num / (iter_size * channel);
int scale_index = blockIdx.x % channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range;
}
}
template <typename T>
struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor** scales,
const int scale_num, T max_range, const int quant_axis,
const int x_num_col_dims, framework::Tensor* out) {
auto in_dims = in->dims();
const T* in_data = in->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
if (scale_num == 1) {
int64_t num = in->numel();
const T* scale_factor = scales[0]->data<T>();
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
DequantizeOneScaleQuantAxis0<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[0], out_data);
} else {
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(
((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
DequantizeOneScaleQuantAxisN<
T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[quant_axis],
quant_stride, out_data);
}
} else if (scale_num == 2) {
// Not need to consider quant_axis
int num = in->numel();
int iter_size = 1;
for (int i = 0; i < x_num_col_dims; i++) {
iter_size *= in->dims()[i];
}
int channel = in->dims()[x_num_col_dims];
const T* scale_one = scales[0]->data<T>();
const T* scale_two = scales[1]->data<T>();
int block = 1024;
int grid = iter_size * channel;
DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_one, scale_two, max_range, num, iter_size, channel,
out_data);
}
}
};
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_FAKE_DEQUANTIZE_OP_CU_H_
#define PADDLE_FLUID_OPERATORS_FAKE_DEQUANTIZE_OP_CU_H_
#include "paddle/fluid/operators/fake_dequantize_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void KeDequantize(const T* in, const T* scale, T max_range,
int64_t num, T* out) {
int64_t idx = threadIdx.x + blockIdx.x * blockDim.x;
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
out[i] = in[i] * scale[0] / max_range;
}
}
template <typename T>
struct DequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
const T* in_data = in->data<T>();
const T* scale_factor = scale->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int64_t num = in->numel();
int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks =
std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
KeDequantize<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, out_data);
}
};
template <typename T>
__global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale,
T max_range, int num, int channel,
T* out) {
int tid = threadIdx.x;
int channel_size = num / channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale[blockIdx.x] / max_range;
}
}
template <typename T>
__global__ void DequantizeOneScaleQuantAxisN(const T* in, const T* scale,
const T max_range,
const int64_t num,
const int n_scales,
const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % n_scales];
out[i] = in[i] * s / max_range;
}
}
template <typename T>
__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
const T* scale_two, T max_range, int num,
int iter_size, int channel, T* out) {
int tid = threadIdx.x;
int channel_size = num / (iter_size * channel);
int scale_index = blockIdx.x % channel;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
for (int i = tid; i < channel_size; i += blockDim.x) {
out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range;
}
}
template <typename T>
struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor** scales,
const int scale_num, T max_range, const int quant_axis,
const int x_num_col_dims, framework::Tensor* out) {
auto in_dims = in->dims();
const T* in_data = in->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
if (scale_num == 1) {
int64_t num = in->numel();
const T* scale_factor = scales[0]->data<T>();
int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
DequantizeOneScaleQuantAxisN<
T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[quant_axis],
quant_stride, out_data);
} else if (scale_num == 2) {
// No need to consider quant_axis
int num = in->numel();
int iter_size = 1;
for (int i = 0; i < x_num_col_dims; i++) {
iter_size *= in->dims()[i];
}
int channel = in->dims()[x_num_col_dims];
const T* scale_one = scales[0]->data<T>();
const T* scale_two = scales[1]->data<T>();
int block = 1024;
int grid = iter_size * channel;
DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_one, scale_two, max_range, num, iter_size, channel,
out_data);
}
}
};
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
#endif // PADDLE_FLUID_OPERATORS_FAKE_DEQUANTIZE_OP_CU_H_
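For reference, the indexing in DequantizeOneScaleQuantAxisN above maps each flat element index i to its per-channel scale via (i / quant_stride) % n_scales, where quant_stride is the product of the dimensions after quant_axis. A small NumPy sketch of the same arithmetic (illustrative only, not Paddle code):

import numpy as np

def channel_dequantize(q, scales, quant_axis, max_range):
    # quant_stride: product of the dims after quant_axis, as in the CUDA kernel
    quant_stride = int(np.prod(q.shape[quant_axis + 1:], dtype=np.int64))
    n_scales = q.shape[quant_axis]
    flat = q.reshape(-1).astype(np.float32)
    idx = np.arange(flat.size)
    s = scales[(idx // quant_stride) % n_scales]
    return (flat * s / max_range).reshape(q.shape)

q = np.random.randint(-127, 128, size=(4, 3, 2, 2))
scales = np.array([0.5, 1.0, 0.25, 2.0], dtype=np.float32)  # one scale per channel on axis 0
w = channel_dequantize(q, scales, quant_axis=0, max_range=127.0)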
@@ -12,531 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/fake_quantize_op.cu.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/fake_quantize_op.h" #include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
extern __shared__ char* shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
if (gridDim.x > 1) {
T local_max_data = T(0);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T tmp = abs(in[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
} else {
if (bid < n) {
shared_max_data[tid] = abs(in[bid]);
} else {
shared_max_data[tid] = T(0);
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
template <typename T>
struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx, const T* in,
const int num, T* out) {
int block = 1024;
int grid = (block - 1 + num) / block;
grid = (grid > block) ? block : grid;
framework::Tensor max;
T* max_data = max.mutable_data<T>(phi::make_ddim({grid}), ctx.GetPlace());
FindAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
in, num, max_data);
FindAbsMaxKernel<T><<<1, block, 1024 * sizeof(T), ctx.stream()>>>(
max_data, grid, out);
}
};
template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
template struct FindAbsMaxFunctor<platform::CUDADeviceContext,
paddle::platform::float16>;
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
const int c, T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
extern __shared__ T shared_max_data[];
T local_max_data = T(0);
for (int i = tid; i < channel_size; i += blockDim.x) {
T tmp = fabs(in_c[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
const int cin, const int cout,
T* out) {
extern __shared__ T shared_max_data[];
int cout_wh_size = n / cin;
int wh_size = n / (cin * cout);
int tid = threadIdx.x;
int bid = blockIdx.x;
const T* in_current = in + tid * cout_wh_size + bid * wh_size;
T local_max_data = T(0);
for (int i = 0; i < wh_size; i++) {
T tmp = fabs(in_current[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
int len = blockDim.x;
for (int i = (len + 1) / 2; i > 0; len = i, i = (i + 1) / 2) {
if (tid < i && tid + i < len &&
shared_max_data[tid] < shared_max_data[tid + i]) {
shared_max_data[tid] = shared_max_data[tid + i];
}
if (i == 1) {
i = 0; // break the loop
}
__syncthreads();
}
if (tid == 0 && shared_max_data[0] > out[bid]) {
out[bid] = shared_max_data[0];
}
}
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_tensor, const int quant_axis,
T* out_abs_max) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
const int num = in_tensor.numel();
auto in_dims = in_tensor.dims();
const T* in_data = in_tensor.data<T>();
if (quant_axis == 0) {
int cout = in_dims[0];
int grid = cout;
int block = 1024;
FindChannelAbsMaxKernelQuantAxis0<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, cout, out_abs_max);
} else if (quant_axis == 1) {
int cin = in_dims[0];
int cout = in_dims[1];
int grid = cout;
int max_threads = 1024;
#ifdef PADDLE_WITH_HIP
hipMemset(out_abs_max, 0, sizeof(T) * cout);
#else
cudaMemset(out_abs_max, 0, sizeof(T) * cout);
#endif
for (int i = 0; i < cin / max_threads; i++) {
int block = max_threads;
FindChannelAbsMaxKernelQuantAxis1<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, cin, cout, out_abs_max);
in_data += num / cin;
}
int block = cin % max_threads;
if (block > 0) {
FindChannelAbsMaxKernelQuantAxis1<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, in_dims[0], in_dims[1], out_abs_max);
}
}
}
};
template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
__global__ void ClipAndQuantKernel(const T* in, const T* scale,
const int bin_cnt, const int n, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
T s = scale[0];
T inv_s = inverse(s);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T x = in[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out[i] = round(v);
}
}
template <typename T>
__global__ void ClipAndQuantDequantKernel(const T* in, const T* scale,
const int bin_cnt, const int n,
T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
T s = scale[0];
T inv_s = inverse(s);
T bin_cnt_t = static_cast<T>(bin_cnt);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T x = in[i];
x = x > s ? s : x;
x = x < -s ? -s : x;
x = bin_cnt_t * inv_s * x;
x = static_cast<T>(round(static_cast<float>(x)));
out[i] = (x * s) / bin_cnt_t;
}
}
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantDequantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
// ChannelClipAndQuantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
const int bin_cnt,
const int64_t n,
const int c, T* out) {
int tid = threadIdx.x;
int64_t channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int64_t i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v);
}
}
// ChannelClipAndQuantKernel for quant_axis is N
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxisN(
const T* in, const T* scale, const int bin_cnt, const int64_t n,
const int nScale, const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % nScale];
T inv_s = 1.0 / s;
T x = in[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out[i] = round(v);
}
}
template <typename T>
struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int64_t num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], out_data);
} else {
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
int64_t block_size =
std::min(num, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
ChannelClipAndQuantKernelQuantAxisN<T><<<grid_size, block_size>>>(
in_data, scale_data, bin_cnt, num, in_dims[quant_axis], quant_stride,
out_data);
}
}
};
template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
float>;
template <typename T>
__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
const T* last_scale,
const int64_t* iter,
const int window_size, T* scale_arr,
T* out_scale, int* need_find_max,
int* out_size) {
int it = iter[0];
int idx = it % window_size;
T removed = scale_arr[idx];
T cur = cur_scale[0];
scale_arr[idx] = cur;
T max = last_scale[0];
out_scale[0] = max < cur ? cur : max;
if (fabs(removed - max) < 1e-6) {
need_find_max[0] = 1;
out_size[0] = it > window_size ? window_size : it;
} else {
need_find_max[0] = 0;
}
}
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& cur_scale,
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
framework::Tensor need_find_max, out_size;
int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
window_size, scale_arr, out_scale_data, find_max, out_size_data);
int g_find_max;
memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
sizeof(int), ctx.stream());
ctx.Wait();
if (g_find_max) {
int len;
memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
sizeof(int), ctx.stream());
ctx.Wait();
FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
out_scale_data);
}
}
};
template <typename T>
__global__ void FindMovingAverageAbsMaxKernel(const T* in_state,
const T* in_accum,
const T* cur_scale, const T rate,
T* out_state, T* out_accum,
T* out_scale) {
T state = rate * (*in_state) + T(1.0f);
T accum = rate * (*in_accum) + (*cur_scale);
*out_state = state;
*out_accum = accum;
*out_scale = accum / state;
}
template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_accum,
const framework::Tensor& in_state, const T* cur_scale,
const float rate, framework::Tensor* out_state,
framework::Tensor* out_accum, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T rate_t = static_cast<T>(rate);
T* out_state_data = out_state->mutable_data<T>(gpu_place);
T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
out_state_data, out_accum_data, out_scale_data);
}
};
// ChannelClipAndQuantDequantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
const T* in, const T* scale, const int bin_cnt, const int n, const int c,
T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
// ChannelClipAndQuantDequantKernel for quant_axis is 1
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
const int cout, T* out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis0<
T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
num, in_dims[0], out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis1<
T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
float>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_
#define PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
extern __shared__ char* shared_max_data_tmp[];
auto shared_max_data = reinterpret_cast<T*>(shared_max_data_tmp);
if (gridDim.x > 1) {
T local_max_data = T(0);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T tmp = abs(in[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
} else {
if (bid < n) {
shared_max_data[tid] = abs(in[bid]);
} else {
shared_max_data[tid] = T(0);
}
}
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
template <typename T>
struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx, const T* in,
const int num, T* out) {
int block = 1024;
int grid = (block - 1 + num) / block;
grid = (grid > block) ? block : grid;
framework::Tensor max;
T* max_data = max.mutable_data<T>(phi::make_ddim({grid}), ctx.GetPlace());
FindAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
in, num, max_data);
FindAbsMaxKernel<T><<<1, block, 1024 * sizeof(T), ctx.stream()>>>(
max_data, grid, out);
}
};
template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
template struct FindAbsMaxFunctor<platform::CUDADeviceContext,
paddle::platform::float16>;
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n,
const int c, T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
extern __shared__ T shared_max_data[];
T local_max_data = T(0);
for (int i = tid; i < channel_size; i += blockDim.x) {
T tmp = fabs(in_c[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
for (int i = blockDim.x / 2; i > 0; i >>= 1) {
if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
shared_max_data[tid] = shared_max_data[tid + i];
}
__syncthreads();
}
if (tid == 0) {
out[blockIdx.x] = shared_max_data[0];
}
}
template <typename T>
__global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n,
const int cin, const int cout,
T* out) {
extern __shared__ T shared_max_data[];
int cout_wh_size = n / cin;
int wh_size = n / (cin * cout);
int tid = threadIdx.x;
int bid = blockIdx.x;
const T* in_current = in + tid * cout_wh_size + bid * wh_size;
T local_max_data = T(0);
for (int i = 0; i < wh_size; i++) {
T tmp = fabs(in_current[i]);
if (tmp > local_max_data) {
local_max_data = tmp;
}
}
shared_max_data[tid] = local_max_data;
__syncthreads();
int len = blockDim.x;
for (int i = (len + 1) / 2; i > 0; len = i, i = (i + 1) / 2) {
if (tid < i && tid + i < len &&
shared_max_data[tid] < shared_max_data[tid + i]) {
shared_max_data[tid] = shared_max_data[tid + i];
}
if (i == 1) {
i = 0; // break the loop
}
__syncthreads();
}
if (tid == 0 && shared_max_data[0] > out[bid]) {
out[bid] = shared_max_data[0];
}
}
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_tensor, const int quant_axis,
T* out_abs_max) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
const int num = in_tensor.numel();
auto in_dims = in_tensor.dims();
const T* in_data = in_tensor.data<T>();
if (quant_axis == 0) {
int cout = in_dims[0];
int grid = cout;
int block = 1024;
FindChannelAbsMaxKernelQuantAxis0<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, cout, out_abs_max);
} else if (quant_axis == 1) {
int cin = in_dims[0];
int cout = in_dims[1];
int grid = cout;
int max_threads = 1024;
#ifdef PADDLE_WITH_HIP
hipMemset(out_abs_max, 0, sizeof(T) * cout);
#else
cudaMemset(out_abs_max, 0, sizeof(T) * cout);
#endif
for (int i = 0; i < cin / max_threads; i++) {
int block = max_threads;
FindChannelAbsMaxKernelQuantAxis1<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, cin, cout, out_abs_max);
in_data += num / cin;
}
int block = cin % max_threads;
if (block > 0) {
FindChannelAbsMaxKernelQuantAxis1<
T><<<grid, block, block * sizeof(T), ctx.stream()>>>(
in_data, num, in_dims[0], in_dims[1], out_abs_max);
}
}
}
};
template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
__global__ void ClipAndQuantKernel(const T* in, const T* scale,
const int bin_cnt, const int n, T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
T s = scale[0];
T inv_s = inverse(s);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T x = in[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out[i] = round(v);
}
}
template <typename T>
__global__ void ClipAndQuantDequantKernel(const T* in, const T* scale,
const int bin_cnt, const int n,
T* out) {
int bid = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
T s = scale[0];
T inv_s = inverse(s);
T bin_cnt_t = static_cast<T>(bin_cnt);
for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
T x = in[i];
x = x > s ? s : x;
x = x < -s ? -s : x;
x = bin_cnt_t * inv_s * x;
x = static_cast<T>(round(static_cast<float>(x)));
out[i] = (x * s) / bin_cnt_t;
}
}
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, framework::Tensor* out) {
int num = in.numel();
int block = 1024;
int grid = (block - 1 + num) / block;
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
ClipAndQuantDequantKernel<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, out_data);
}
};
// ChannelClipAndQuantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
const int bin_cnt,
const int64_t n,
const int c, T* out) {
int tid = threadIdx.x;
int64_t channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int64_t i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v);
}
}
// ChannelClipAndQuantKernel for quant_axis is N
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxisN(
const T* in, const T* scale, const int bin_cnt, const int64_t n,
const int nScale, const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % nScale];
T inv_s = 1.0 / s;
T x = in[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out[i] = round(v);
}
}
template <typename T>
struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int64_t num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], out_data);
} else {
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
int64_t block_size =
std::min(num, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
ChannelClipAndQuantKernelQuantAxisN<T><<<grid_size, block_size, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[quant_axis], quant_stride,
out_data);
}
}
};
template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
float>;
template <typename T>
__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
const T* last_scale,
const int64_t* iter,
const int window_size, T* scale_arr,
T* out_scale, int* need_find_max,
int* out_size) {
int it = iter[0];
int idx = it % window_size;
T removed = scale_arr[idx];
T cur = cur_scale[0];
scale_arr[idx] = cur;
T max = last_scale[0];
out_scale[0] = max < cur ? cur : max;
if (fabs(removed - max) < 1e-6) {
need_find_max[0] = 1;
out_size[0] = it > window_size ? window_size : it;
} else {
need_find_max[0] = 0;
}
}
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& cur_scale,
const framework::Tensor& last_scale,
const framework::Tensor& iter, const int window_size,
framework::Tensor* scales_arr, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
framework::Tensor need_find_max, out_size;
int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
window_size, scale_arr, out_scale_data, find_max, out_size_data);
int g_find_max;
memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
sizeof(int), ctx.stream());
ctx.Wait();
if (g_find_max) {
int len;
memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
sizeof(int), ctx.stream());
ctx.Wait();
FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
out_scale_data);
}
}
};
template <typename T>
__global__ void FindMovingAverageAbsMaxKernel(const T* in_state,
const T* in_accum,
const T* cur_scale, const T rate,
T* out_state, T* out_accum,
T* out_scale) {
T state = rate * (*in_state) + T(1.0f);
T accum = rate * (*in_accum) + (*cur_scale);
*out_state = state;
*out_accum = accum;
*out_scale = accum / state;
}
template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in_accum,
const framework::Tensor& in_state, const T* cur_scale,
const float rate, framework::Tensor* out_state,
framework::Tensor* out_accum, framework::Tensor* out_scale) {
const auto gpu_place = ctx.GetPlace();
T rate_t = static_cast<T>(rate);
T* out_state_data = out_state->mutable_data<T>(gpu_place);
T* out_accum_data = out_accum->mutable_data<T>(gpu_place);
T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
FindMovingAverageAbsMaxKernel<T><<<1, 1, 0, ctx.stream()>>>(
in_state.data<T>(), in_accum.data<T>(), cur_scale, rate_t,
out_state_data, out_accum_data, out_scale_data);
}
};
// ChannelClipAndQuantDequantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
const T* in, const T* scale, const int bin_cnt, const int n, const int c,
T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
// ChannelClipAndQuantDequantKernel for quant_axis is 1
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
const int cout, T* out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d,
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis0<
T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
num, in_dims[0], out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis1<
T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
float>;
} // namespace operators
} // namespace paddle
#endif // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_
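As a side note, FindMovingAverageAbsMaxKernel above maintains a decayed running estimate of the abs-max scale: state counts decayed updates, accum sums decayed abs-max values, and their ratio is the reported scale. A plain-Python sketch of that recurrence (assuming a decay rate such as 0.9):

def moving_average_abs_max(abs_max_per_batch, rate=0.9):
    # state = rate * state + 1, accum = rate * accum + cur_abs_max,
    # scale = accum / state, mirroring FindMovingAverageAbsMaxKernel.
    state, accum, scale = 0.0, 0.0, 0.0
    for cur in abs_max_per_batch:
        state = rate * state + 1.0
        accum = rate * accum + cur
        scale = accum / state
    return scale

print(moving_average_abs_max([1.0, 0.8, 1.2, 0.9]))  # smoothed abs-max estimate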
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/quantize_linear_op.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/transform.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/impl/clip_kernel_impl.h"
namespace paddle {
namespace operators {
template <typename T>
struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, const int quant_axis, framework::Tensor* out) {
// The dequant op runs before the quantized op;
// here we dequantize the weight of the quantized op.
auto in_dims = in->dims();
const int64_t channel = in_dims[quant_axis];
const T* scale_factor = scale->data<T>();
if (quant_axis == 0) {
for (int64_t i = 0; i < channel; i++) {
T s = scale_factor[i];
framework::Tensor one_channel_in = in->Slice(i, i + 1);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
auto& dev = *dev_ctx.eigen_device();
out_e.device(dev) = in_e * s / max_range;
}
} else if (quant_axis == 1) {
int64_t out_iter = 1;
for (int i = 0; i < quant_axis; i++) {
out_iter *= in_dims[i];
}
int64_t step_i = in->numel() / out_iter;
int64_t step_j = in->numel() / (out_iter * channel);
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
for (int64_t i = 0; i < out_iter; i++) {
for (int64_t j = 0; j < channel; j++) {
auto* cur_in = in_data + i * step_i + j * step_j;
auto* cur_out = out_data + i * step_i + j * step_j;
T s = scale_factor[j];
for (int64_t k = 0; k < step_j; k++) {
*cur_out = (*cur_in) * s / max_range;
++cur_in;
++cur_out;
}
}
}
}
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
template struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, float>;
template struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, double>;
class QuantizeLinearOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasInput("ZeroPoint"), "Input", "ZeroPoint",
"QuantizeLinear");
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "QuantizeLinear");
ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
if (ctx->HasOutput("OutScale")) {
if (quant_axis < 0) {
ctx->SetOutputDim("OutScale", {1});
} else {
ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
}
}
ctx->ShareLoD("X", /*->*/ "Y");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddInput("Scale", "(Tensor) Input is float data type.");
AddInput("ZeroPoint", "(Tensor) Input is float data type.");
AddOutput("Y",
"(Tensor) Output of quantized low level tensor, "
"but also saved as float data type.");
AddOutput("OutScale", "(Tensor) Current scale").AsDispensable().AsExtra();
AddAttr<int>("quant_axis",
"(int, default 0) The axis for quantization. "
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1 || quant_axis == -1, true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
});
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
bit_length));
});
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(true);
AddComment(R"DOC(
The scale of the QuantizeLinear operator is a vector.
In detail, each channel of the input X has a scale value.
$$scale_c = max(abs(X_c))$$
$$range = 2^{bit\_length - 1} - 1$$
$$Out_c = round(\frac{X_c * range} {scale_c})$$
In the above three formulas, the range of c is as follows:
$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
quantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(quantize_linear, ops::QuantizeLinearKernel<CPU, float>);
REGISTER_OPERATOR(
dequantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(dequantize_linear,
ops::DeQuantizeLinearKernel<CPU, float, float>,
ops::DeQuantizeLinearKernel<CPU, int8_t, float>,
ops::DeQuantizeLinearKernel<CPU, double, double>);
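To make the three formulas in the QuantizeLinear comment above concrete, here is a worked per-channel example (bit_length = 8, quant_axis = 0); it is a NumPy sketch, not part of the operator code:

import numpy as np

X = np.array([[0.4, -0.8],
              [2.0, 1.0]], dtype=np.float32)   # two channels along axis 0
scale_c = np.abs(X).max(axis=1)                # [0.8, 2.0]
quant_range = 2 ** (8 - 1) - 1                 # 127
Out = np.round(X * quant_range / scale_c[:, None])
# Out == [[ 64., -127.], [127.,  64.]]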
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/fake_dequantize_op.cu.h"
#include "paddle/fluid/operators/fake_quantize_op.cu.h"
#include "paddle/fluid/operators/quantize_linear_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
struct ChannelDequantizeFunctorV2<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, const int quant_axis, framework::Tensor* out) {
auto in_dims = in->dims();
const T* in_data = in->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int64_t num = in->numel();
const T* scale_factor = scale->data<T>();
int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks =
std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
DequantizeOneScaleQuantAxisN<
T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[quant_axis],
quant_stride, out_data);
}
};
template struct ChannelDequantizeFunctorV2<platform::CUDADeviceContext, float>;
template struct ChannelDequantizeFunctorV2<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(dequantize_linear,
ops::DeQuantizeLinearKernel<CUDA, float, float>,
ops::DeQuantizeLinearKernel<CUDA, int8_t, float>,
ops::DeQuantizeLinearKernel<CUDA, double, double>);
REGISTER_OP_CUDA_KERNEL(quantize_linear,
ops::QuantizeLinearKernel<CUDA, float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/fake_dequantize_op.h"
#include "paddle/fluid/operators/fake_quantize_op.h"
#include "paddle/fluid/platform/transform.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/cast_kernel.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
struct ChannelDequantizeFunctorV2 {
void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
const framework::Tensor* scale, T max_range, const int quant_axis,
framework::Tensor* out);
};
template <typename DeviceContext, typename T>
class QuantizeLinearKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* in_scale = context.Input<framework::Tensor>("Scale");
auto* out = context.Output<framework::Tensor>("Y");
out->mutable_data<T>(context.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
int quant_axis = context.Attr<int>("quant_axis");
bool is_test = context.Attr<bool>("is_test");
auto& dev_ctx = context.template device_context<DeviceContext>();
if (quant_axis < 0) {
if (!is_test) {
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_s = out_scale->mutable_data<T>(context.GetPlace());
FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(),
in->numel(), out_s);
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
bin_cnt, out);
} else {
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
bin_cnt, out);
}
} else {
if (!is_test) {
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
out_scale_data);
ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
} else {
ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *in_scale, bin_cnt, quant_axis, out);
}
}
}
};
template <typename DeviceContext, typename T, typename D>
class DeQuantizeLinearKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto& dev_ctx = context.template device_context<DeviceContext>();
auto* in = context.Input<framework::Tensor>("X");
auto in_tmp = phi::Cast<T>(
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*in, experimental::CppTypeToDataType<D>::Type());
auto* scale = context.Input<framework::Tensor>("Scale");
auto* out = context.Output<framework::Tensor>("Y");
int bit_length = context.Attr<int>("bit_length");
auto quant_axis = context.Attr<int>("quant_axis");
out->mutable_data<D>(dev_ctx.GetPlace());
if (quant_axis < 0) {
float max_range = (std::pow(2, bit_length - 1) - 1);
DequantizeFunctor<DeviceContext, D>()(dev_ctx, &in_tmp, scale,
static_cast<D>(max_range), out);
} else {
PADDLE_ENFORCE_EQ(
scale->numel(), in_tmp.dims()[quant_axis],
platform::errors::PreconditionNotMet(
"The number of first scale values must be the same with "
"quant_axis dimension value of Input(X) when the `scale` has "
"only one element, but %ld != %ld here.",
scale->numel(), in_tmp.dims()[quant_axis]));
int max_range = (std::pow(2, bit_length - 1) - 1);
ChannelDequantizeFunctorV2<DeviceContext, D>()(
dev_ctx, &in_tmp, scale, static_cast<D>(max_range), quant_axis, out);
}
}
};
} // namespace operators
} // namespace paddle
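A compact sketch of the control flow in QuantizeLinearKernel above for the per-tensor path (quant_axis < 0): during training (is_test == False) the scale is recomputed from the current tensor by abs-max and also returned as OutScale, while inference reuses the provided Scale input. NumPy pseudocode under those assumptions, not the Paddle API:

import numpy as np

def quantize_linear_kernel(x, in_scale, bin_cnt, is_test):
    # Training recomputes the scale from the tensor; inference reuses Scale.
    scale = in_scale if is_test else np.abs(x).max()
    q = np.round(np.clip(x, -scale, scale) * bin_cnt / scale)
    return q, scale

x = np.random.randn(8).astype(np.float32)
q_train, out_scale = quantize_linear_kernel(x, None, bin_cnt=127, is_test=False)
q_infer, _ = quantize_linear_kernel(x, out_scale, bin_cnt=127, is_test=True)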
@@ -41,6 +41,7 @@ PD_REGISTER_KERNEL(cast,
int64_t,
int16_t,
bool,
int8_t,
uint8_t,
phi::dtype::float16,
phi::dtype::bfloat16,
......
@@ -41,6 +41,7 @@ void CastKernel(const Context& dev_ctx,
int64_t, \
int16_t, \
bool, \
int8_t, \
uint8_t, \ uint8_t, \
phi::dtype::float16, \ phi::dtype::float16, \
phi::dtype::complex<float>, \ phi::dtype::complex<float>, \
......
...@@ -28,6 +28,7 @@ from paddle.fluid.param_attr import ParamAttr ...@@ -28,6 +28,7 @@ from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Constant from paddle.fluid.initializer import Constant
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
from paddle.fluid.io import load_inference_model, save_inference_model from paddle.fluid.io import load_inference_model, save_inference_model
from ..quantization_pass import ReplaceFakeQuantDequantPass, QuantWeightPass
from paddle.fluid.log_helper import get_logger from paddle.fluid.log_helper import get_logger
from .. import quantization_pass from .. import quantization_pass
from . import utils from . import utils
...@@ -431,7 +432,12 @@ class ImperativeQuantizeOutputs(object): ...@@ -431,7 +432,12 @@ class ImperativeQuantizeOutputs(object):
setattr(parent_layer, sub_name, cur_quant_layer) setattr(parent_layer, sub_name, cur_quant_layer)
def save_quantized_model(self, model, path, input_spec=None, **config): def save_quantized_model(self,
model,
path,
input_spec=None,
onnx_format=False,
**config):
""" """
Save the quantized model for the inference. Save the quantized model for the inference.
...@@ -444,6 +450,8 @@ class ImperativeQuantizeOutputs(object): ...@@ -444,6 +450,8 @@ class ImperativeQuantizeOutputs(object):
InputSpec or example Tensor. If None, all input variables of InputSpec or example Tensor. If None, all input variables of
the original Layer's forward method would be the inputs of the original Layer's forward method would be the inputs of
the saved model. Default None. the saved model. Default None.
onnx_format (bool, optional): Whether to export the quantized model
using the ONNX-style quantize_linear/dequantize_linear format.
Default is False.
**configs (dict, optional): Other save configuration options for **configs (dict, optional): Other save configuration options for
compatibility. We do not recommend using these configurations, compatibility. We do not recommend using these configurations,
they may be removed in the future. If not necessary, DO NOT use they may be removed in the future. If not necessary, DO NOT use
...@@ -498,6 +506,18 @@ class ImperativeQuantizeOutputs(object): ...@@ -498,6 +506,18 @@ class ImperativeQuantizeOutputs(object):
self._set_skip_quant_attr(infer_program) self._set_skip_quant_attr(infer_program)
clip_extra = False
if onnx_format:
graph = IrGraph(core.Graph(infer_program.desc), for_test=False)
transform_pass = ReplaceFakeQuantDequantPass(scope, place)
transform_pass.apply(graph)
quant_weight_pass = QuantWeightPass(scope, place)
quant_weight_pass.apply(graph)
infer_program = graph.to_program()
clip_extra = True
save_inference_model( save_inference_model(
dirname=dirname, dirname=dirname,
feeded_var_names=feed_target_names, feeded_var_names=feed_target_names,
...@@ -506,7 +526,7 @@ class ImperativeQuantizeOutputs(object): ...@@ -506,7 +526,7 @@ class ImperativeQuantizeOutputs(object):
main_program=infer_program.clone(), main_program=infer_program.clone(),
model_filename=model_filename, model_filename=model_filename,
params_filename=params_filename, params_filename=params_filename,
clip_extra=False) clip_extra=clip_extra)
if is_dynamic_mode: if is_dynamic_mode:
paddle.disable_static() paddle.disable_static()
......
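
A hypothetical usage sketch of the new flag (not part of this patch): it assumes the public ImperativeQuantAware wrapper forwards onnx_format through to save_quantized_model, and the model, paths and input spec below are made up for illustration.

import paddle
from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware

class MyNet(paddle.nn.Layer):          # stand-in model for illustration
    def __init__(self):
        super().__init__()
        self.conv = paddle.nn.Conv2D(3, 8, 3)
    def forward(self, x):
        return self.conv(x)

quanter = ImperativeQuantAware()
model = MyNet()
quanter.quantize(model)
# ... quantization-aware training loop ...
quanter.save_quantized_model(
    model,
    path='./quant_model/int8',         # hypothetical output path
    input_spec=[paddle.static.InputSpec(shape=[None, 3, 224, 224], dtype='float32')],
    onnx_format=True)                  # new in this patch: export quantize_linear/dequantize_linear ops
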
...@@ -18,10 +18,7 @@ import numpy as np ...@@ -18,10 +18,7 @@ import numpy as np
import paddle import paddle
import paddle.nn.quant.quant_layers as quant_layers import paddle.nn.quant.quant_layers as quant_layers
from ..quantization_pass import _get_op_input_var_names from ..utils import _get_op_input_var_names, _get_op_output_var_names, _get_output_name_index, _get_input_name_index
from ..quantization_pass import _get_op_output_var_names
from ..quantization_pass import _get_output_name_index
from ..quantization_pass import _get_input_name_index
layer_name_map = { layer_name_map = {
'Conv2DTranspose': paddle.nn.Conv2DTranspose, 'Conv2DTranspose': paddle.nn.Conv2DTranspose,
......
...@@ -25,18 +25,10 @@ from .... import unique_name ...@@ -25,18 +25,10 @@ from .... import unique_name
from ....executor import global_scope, Executor from ....executor import global_scope, Executor
from ....framework import IrGraph from ....framework import IrGraph
from ....log_helper import get_logger from ....log_helper import get_logger
from .quantization_pass import QuantizationTransformPass from .quantization_pass import QuantizationTransformPass, QuantizationTransformPassV2, QuantizationFreezePass, QuantWeightPass, AddQuantDequantPass, AddQuantDequantPassV2
from .quantization_pass import QuantizationFreezePass
from .quantization_pass import AddQuantDequantPass
from .quantization_pass import _out_scale_op_list
from .quantization_pass import _get_op_input_var_names
from .quantization_pass import _get_op_output_var_names
from .quantization_pass import _get_output_name_index
from .quantization_pass import _get_input_name_index
from .quantization_pass import _channelwise_quant_axis1_ops
from .cal_kl_threshold import cal_kl_threshold from .cal_kl_threshold import cal_kl_threshold
from .adaround import run_adaround from .adaround import run_adaround
from .utils import load_variable_data, set_variable_data from . import utils
__all__ = ['PostTrainingQuantization', 'WeightQuantization'] __all__ = ['PostTrainingQuantization', 'WeightQuantization']
...@@ -131,6 +123,7 @@ class PostTrainingQuantization(object): ...@@ -131,6 +123,7 @@ class PostTrainingQuantization(object):
weight_bits=8, weight_bits=8,
activation_quantize_type='range_abs_max', activation_quantize_type='range_abs_max',
weight_quantize_type='channel_wise_abs_max', weight_quantize_type='channel_wise_abs_max',
onnx_format=False,
optimize_model=False, optimize_model=False,
is_use_cache_file=False, is_use_cache_file=False,
cache_dir=None): cache_dir=None):
...@@ -203,6 +196,8 @@ class PostTrainingQuantization(object): ...@@ -203,6 +196,8 @@ class PostTrainingQuantization(object):
the fake ops in saving quantized model, and we save the scale obtained the fake ops in saving quantized model, and we save the scale obtained
by post training quantization in fake ops. Compared to 'abs_max', by post training quantization in fake ops. Compared to 'abs_max',
the model accuracy is usually higher when it is 'channel_wise_abs_max'. the model accuracy is usually higher when it is 'channel_wise_abs_max'.
onnx_format(bool): Whether to export the quantized model using the
ONNX-style quantize_linear/dequantize_linear format. Default is False.
optimize_model(bool, optional): If set optimize_model as True, it applies optimize_model(bool, optional): If set optimize_model as True, it applies
some passes to the model before quantization, and it supports some passes to the model before quantization, and it supports
`conv2d/depthwise_conv2d + bn` pass so far. Some targets require the `conv2d/depthwise_conv2d + bn` pass so far. Some targets require the
...@@ -265,8 +260,8 @@ class PostTrainingQuantization(object): ...@@ -265,8 +260,8 @@ class PostTrainingQuantization(object):
self._learning_rate = learning_rate self._learning_rate = learning_rate
self._dynamic_quantize_op_type = ['lstm'] self._dynamic_quantize_op_type = ['lstm']
self._support_quantize_op_type = \ self._support_quantize_op_type = \
list(set(QuantizationTransformPass._supported_quantizable_op_type + list(set(utils._weight_supported_quantizable_op_type +
AddQuantDequantPass._supported_quantizable_op_type + utils._act_supported_quantizable_op_type +
self._dynamic_quantize_op_type)) self._dynamic_quantize_op_type))
# Check inputs # Check inputs
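# Check inputs # Check inputs

A hypothetical post-training quantization call using the new flag (illustration only; the executor, model directory and calibration reader are assumptions, not part of this patch).

import numpy as np
import paddle
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

def calib_reader():                       # assumption: yields single calibration samples
    for _ in range(32):
        yield [np.random.random((3, 224, 224)).astype('float32')]

ptq = PostTrainingQuantization(
    executor=exe,
    model_dir='./fp32_inference_model',   # assumption: a previously saved float inference model
    sample_generator=calib_reader,
    batch_nums=10,
    algo='KL',
    weight_quantize_type='channel_wise_abs_max',
    onnx_format=True)                     # new in this patch
ptq.quantize()
ptq.save_quantized_model('./int8_inference_model')   # saved with clip_extra=True when onnx_format is on
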
...@@ -305,6 +300,7 @@ class PostTrainingQuantization(object): ...@@ -305,6 +300,7 @@ class PostTrainingQuantization(object):
self._weight_bits = weight_bits self._weight_bits = weight_bits
self._activation_quantize_type = activation_quantize_type self._activation_quantize_type = activation_quantize_type
self._weight_quantize_type = weight_quantize_type self._weight_quantize_type = weight_quantize_type
self._onnx_format = onnx_format
self._is_full_quantize = is_full_quantize self._is_full_quantize = is_full_quantize
if is_full_quantize: if is_full_quantize:
self._quantizable_op_type = self._support_quantize_op_type self._quantizable_op_type = self._support_quantize_op_type
...@@ -322,7 +318,7 @@ class PostTrainingQuantization(object): ...@@ -322,7 +318,7 @@ class PostTrainingQuantization(object):
self._fetch_list = None self._fetch_list = None
self._data_loader = data_loader self._data_loader = data_loader
self._out_scale_op_list = _out_scale_op_list self._out_scale_op_list = utils._out_scale_op_list
self._quantized_weight_var_name = set() self._quantized_weight_var_name = set()
self._quantized_act_var_name = set() self._quantized_act_var_name = set()
self._weight_op_pairs = {} self._weight_op_pairs = {}
...@@ -391,22 +387,27 @@ class PostTrainingQuantization(object): ...@@ -391,22 +387,27 @@ class PostTrainingQuantization(object):
break break
_logger.info("Finish sampling stage, all batch: " + str(batch_id)) _logger.info("Finish sampling stage, all batch: " + str(batch_id))
if self._round_type == 'adaround':
self._adaround_apply()
self._reset_activation_persistable()
if self._algo == 'avg': if self._algo == 'avg':
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
self._quantized_threshold[var_name] = \ self._quantized_threshold[var_name] = \
np.array(self._quantized_var_avg[var_name]).mean() np.array(self._quantized_var_avg[var_name]).mean()
if self._algo in ["KL", "hist"]: if self._algo in ["KL", "hist"]:
self._calculate_kl_hist_threshold() self._calculate_kl_hist_threshold()
if self._algo in ["KL", "abs_max", "hist", "avg", "mse", "emd"]:
self._update_program() if self._round_type == 'adaround':
else: self._adaround_apply()
self._reset_activation_persistable()
if self._algo == 'min_max':
self._save_input_threhold() self._save_input_threhold()
else:
self._update_program()
# save out_threshold for quantized ops.
if not self._onnx_format:
self._save_output_threshold()
self._save_output_threshold()
if any(op_type in self._quantizable_op_type if any(op_type in self._quantizable_op_type
for op_type in self._dynamic_quantize_op_type): for op_type in self._dynamic_quantize_op_type):
self._collect_dynamic_quantize_op_threshold( self._collect_dynamic_quantize_op_threshold(
...@@ -431,6 +432,7 @@ class PostTrainingQuantization(object): ...@@ -431,6 +432,7 @@ class PostTrainingQuantization(object):
return self._program return self._program
def _adaround_apply(self): def _adaround_apply(self):
assert self._algo != "min_max", "The algo should not be min_max."
if self._algo in ["KL", "hist"]: if self._algo in ["KL", "hist"]:
scale_dict = self._quantized_var_threshold scale_dict = self._quantized_var_threshold
else: else:
...@@ -466,6 +468,7 @@ class PostTrainingQuantization(object): ...@@ -466,6 +468,7 @@ class PostTrainingQuantization(object):
Returns: Returns:
None None
''' '''
clip_extra = True if self._onnx_format else False
io.save_inference_model( io.save_inference_model(
dirname=save_model_path, dirname=save_model_path,
model_filename=model_filename, model_filename=model_filename,
...@@ -473,7 +476,8 @@ class PostTrainingQuantization(object): ...@@ -473,7 +476,8 @@ class PostTrainingQuantization(object):
feeded_var_names=self._feed_list, feeded_var_names=self._feed_list,
target_vars=self._fetch_list, target_vars=self._fetch_list,
executor=self._executor, executor=self._executor,
main_program=self._program) main_program=self._program,
clip_extra=clip_extra)
_logger.info("The quantized model is saved in " + save_model_path) _logger.info("The quantized model is saved in " + save_model_path)
def _load_model_data(self): def _load_model_data(self):
...@@ -551,22 +555,22 @@ class PostTrainingQuantization(object): ...@@ -551,22 +555,22 @@ class PostTrainingQuantization(object):
# For quantized ops, sample inputs and outputs # For quantized ops, sample inputs and outputs
if op_type in self._quantizable_op_type: if op_type in self._quantizable_op_type:
collect_var_name( collect_var_name(
_get_op_input_var_names(op), persistable_var_names, utils._get_op_input_var_names(op),
op_type) persistable_var_names, op_type)
collect_var_name( collect_var_name(
_get_op_output_var_names(op), persistable_var_names, utils._get_op_output_var_names(op),
op_type) persistable_var_names, op_type)
# collect quanted op output var name # collect quanted op output var name
for out_var_name in _get_op_output_var_names(op): for out_var_name in utils._get_op_output_var_names(op):
for in_var_name in _get_op_input_var_names(op): for in_var_name in utils._get_op_input_var_names(op):
if in_var_name in persistable_var_names: if in_var_name in persistable_var_names:
self._quantized_op_pairs[ self._quantized_op_pairs[
in_var_name] = out_var_name in_var_name] = out_var_name
# For other op, only sample output scale # For other op, only sample output scale
elif op_type in self._out_scale_op_list: elif op_type in self._out_scale_op_list:
collect_var_name( collect_var_name(
_get_op_output_var_names(op), persistable_var_names, utils._get_op_output_var_names(op),
op_type) persistable_var_names, op_type)
def _set_activation_persistable(self): def _set_activation_persistable(self):
''' '''
...@@ -608,13 +612,13 @@ class PostTrainingQuantization(object): ...@@ -608,13 +612,13 @@ class PostTrainingQuantization(object):
def _sample_mse(self): def _sample_mse(self):
if self._quantized_threshold == {}: if self._quantized_threshold == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
abs_max_value = [] abs_max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
abs_max_value.append( abs_max_value.append(
float(np.max(np.abs(var_tensor[:, i])))) float(np.max(np.abs(var_tensor[:, i]))))
...@@ -625,7 +629,7 @@ class PostTrainingQuantization(object): ...@@ -625,7 +629,7 @@ class PostTrainingQuantization(object):
self._quantized_threshold[var_name] = abs_max_value self._quantized_threshold[var_name] = abs_max_value
_logger.info("MSE searching stage ...") _logger.info("MSE searching stage ...")
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor = var_tensor.flatten() var_tensor = var_tensor.flatten()
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value
...@@ -647,13 +651,13 @@ class PostTrainingQuantization(object): ...@@ -647,13 +651,13 @@ class PostTrainingQuantization(object):
def _sample_emd(self): def _sample_emd(self):
if self._quantized_threshold == {}: if self._quantized_threshold == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
abs_max_value = [] abs_max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
abs_max_value.append( abs_max_value.append(
float(np.max(np.abs(var_tensor[:, i])))) float(np.max(np.abs(var_tensor[:, i]))))
...@@ -664,7 +668,7 @@ class PostTrainingQuantization(object): ...@@ -664,7 +668,7 @@ class PostTrainingQuantization(object):
self._quantized_threshold[var_name] = abs_max_value self._quantized_threshold[var_name] = abs_max_value
_logger.info("EMD searching stage ...") _logger.info("EMD searching stage ...")
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor = var_tensor.flatten() var_tensor = var_tensor.flatten()
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value
...@@ -688,13 +692,13 @@ class PostTrainingQuantization(object): ...@@ -688,13 +692,13 @@ class PostTrainingQuantization(object):
def _sample_avg(self): def _sample_avg(self):
if self._quantized_threshold == {}: if self._quantized_threshold == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
abs_max_value = [] abs_max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
abs_max_value.append( abs_max_value.append(
float(np.max(np.abs(var_tensor[:, i])))) float(np.max(np.abs(var_tensor[:, i]))))
...@@ -705,7 +709,7 @@ class PostTrainingQuantization(object): ...@@ -705,7 +709,7 @@ class PostTrainingQuantization(object):
self._quantized_threshold[var_name] = abs_max_value self._quantized_threshold[var_name] = abs_max_value
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
if (var_name not in self._quantized_var_avg): if (var_name not in self._quantized_var_avg):
self._quantized_var_avg[var_name] = [] self._quantized_var_avg[var_name] = []
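
The abs_max thresholds sampled above reduce to a per-tensor or per-channel maximum of the absolute values; a small NumPy illustration with a made-up tensor (not part of this patch):

import numpy as np

weight = np.random.randn(32, 16, 3, 3).astype(np.float32)   # e.g. a conv2d filter

per_tensor = float(np.max(np.abs(weight)))                  # 'abs_max'
per_channel_axis0 = [float(np.max(np.abs(weight[i]))) for i in range(weight.shape[0])]
# ops in utils._channelwise_quant_axis1_ops (e.g. mul/matmul) quantize along axis 1:
per_channel_axis1 = [float(np.max(np.abs(weight[:, i]))) for i in range(weight.shape[1])]
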
...@@ -717,13 +721,13 @@ class PostTrainingQuantization(object): ...@@ -717,13 +721,13 @@ class PostTrainingQuantization(object):
def _sample_abs_max(self): def _sample_abs_max(self):
if self._quantized_threshold == {}: if self._quantized_threshold == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
abs_max_value = [] abs_max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
abs_max_value.append( abs_max_value.append(
float(np.max(np.abs(var_tensor[:, i])))) float(np.max(np.abs(var_tensor[:, i]))))
...@@ -734,7 +738,7 @@ class PostTrainingQuantization(object): ...@@ -734,7 +738,7 @@ class PostTrainingQuantization(object):
self._quantized_threshold[var_name] = abs_max_value self._quantized_threshold[var_name] = abs_max_value
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
abs_max_value = float(np.max(np.abs(var_tensor))) abs_max_value = float(np.max(np.abs(var_tensor)))
if (var_name not in self._quantized_threshold) or \ if (var_name not in self._quantized_threshold) or \
(abs_max_value > self._quantized_threshold[var_name]): (abs_max_value > self._quantized_threshold[var_name]):
...@@ -743,7 +747,7 @@ class PostTrainingQuantization(object): ...@@ -743,7 +747,7 @@ class PostTrainingQuantization(object):
def _sample_min_max(self): def _sample_min_max(self):
if self._quantized_var_min == {} and self._quantized_var_max == {}: if self._quantized_var_min == {} and self._quantized_var_max == {}:
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
min_value = float(np.min(var_tensor)) min_value = float(np.min(var_tensor))
max_value = float(np.max(var_tensor)) max_value = float(np.max(var_tensor))
...@@ -751,7 +755,7 @@ class PostTrainingQuantization(object): ...@@ -751,7 +755,7 @@ class PostTrainingQuantization(object):
min_value = [] min_value = []
max_value = [] max_value = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(var_tensor.shape[1]): for i in range(var_tensor.shape[1]):
min_value.append(float(np.min(var_tensor[:, i]))) min_value.append(float(np.min(var_tensor[:, i])))
max_value.append(float(np.max(var_tensor[:, i]))) max_value.append(float(np.max(var_tensor[:, i])))
...@@ -763,7 +767,7 @@ class PostTrainingQuantization(object): ...@@ -763,7 +767,7 @@ class PostTrainingQuantization(object):
self._quantized_var_max[var_name] = max_value self._quantized_var_max[var_name] = max_value
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
min_value = float(np.min(var_tensor)) min_value = float(np.min(var_tensor))
max_value = float(np.max(var_tensor)) max_value = float(np.max(var_tensor))
if (var_name not in self._quantized_var_min) or \ if (var_name not in self._quantized_var_min) or \
...@@ -775,7 +779,7 @@ class PostTrainingQuantization(object): ...@@ -775,7 +779,7 @@ class PostTrainingQuantization(object):
def _sample_histogram(self): def _sample_histogram(self):
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor_abs = np.abs(var_tensor) var_tensor_abs = np.abs(var_tensor)
bins = self._sampling_act_histogram[var_name][1] bins = self._sampling_act_histogram[var_name][1]
hist, _ = np.histogram(var_tensor_abs, bins=bins) hist, _ = np.histogram(var_tensor_abs, bins=bins)
...@@ -790,7 +794,7 @@ class PostTrainingQuantization(object): ...@@ -790,7 +794,7 @@ class PostTrainingQuantization(object):
for block_id in range(len(self._program.blocks)): for block_id in range(len(self._program.blocks)):
for op in self._program.blocks[block_id].ops: for op in self._program.blocks[block_id].ops:
if op.type in self._quantizable_op_type: if op.type in self._quantizable_op_type:
for var_name in _get_op_input_var_names(op): for var_name in utils._get_op_input_var_names(op):
assert var_name in self._quantized_var_min assert var_name in self._quantized_var_min
assert var_name in self._quantized_var_max assert var_name in self._quantized_var_max
op._set_attr(var_name + ".min", op._set_attr(var_name + ".min",
...@@ -805,7 +809,7 @@ class PostTrainingQuantization(object): ...@@ -805,7 +809,7 @@ class PostTrainingQuantization(object):
get the min and max value, and then calculate the threshold. get the min and max value, and then calculate the threshold.
''' '''
for var_name in self._quantized_act_var_name: for var_name in self._quantized_act_var_name:
var_tensor = load_variable_data(self._scope, var_name) var_tensor = utils.load_variable_data(self._scope, var_name)
var_tensor = np.abs(var_tensor) var_tensor = np.abs(var_tensor)
min_value = float(np.min(var_tensor)) min_value = float(np.min(var_tensor))
max_value = float(np.max(var_tensor)) max_value = float(np.max(var_tensor))
...@@ -839,13 +843,13 @@ class PostTrainingQuantization(object): ...@@ -839,13 +843,13 @@ class PostTrainingQuantization(object):
# Abs_max threshold for weights # Abs_max threshold for weights
for var_name in self._quantized_weight_var_name: for var_name in self._quantized_weight_var_name:
weight_data = load_variable_data(self._scope, var_name) weight_data = utils.load_variable_data(self._scope, var_name)
if self._weight_quantize_type == "abs_max": if self._weight_quantize_type == "abs_max":
weight_threshold = float(np.max(np.abs(weight_data))) weight_threshold = float(np.max(np.abs(weight_data)))
elif self._weight_quantize_type == "channel_wise_abs_max": elif self._weight_quantize_type == "channel_wise_abs_max":
weight_threshold = [] weight_threshold = []
if self._weight_op_pairs[ if self._weight_op_pairs[
var_name] in _channelwise_quant_axis1_ops: var_name] in utils._channelwise_quant_axis1_ops:
for i in range(weight_data.shape[1]): for i in range(weight_data.shape[1]):
weight_threshold.append( weight_threshold.append(
float(np.max(np.abs(weight_data[:, i])))) float(np.max(np.abs(weight_data[:, i]))))
...@@ -876,17 +880,27 @@ class PostTrainingQuantization(object): ...@@ -876,17 +880,27 @@ class PostTrainingQuantization(object):
# use QuantizationTransformPass to insert fake_quant/fake_dequantize op # use QuantizationTransformPass to insert fake_quant/fake_dequantize op
major_quantizable_op_types = [] major_quantizable_op_types = []
for op_type in QuantizationTransformPass._supported_quantizable_op_type: for op_type in utils._weight_supported_quantizable_op_type:
if op_type in self._quantizable_op_type: if op_type in self._quantizable_op_type:
major_quantizable_op_types.append(op_type) major_quantizable_op_types.append(op_type)
transform_pass = QuantizationTransformPass( if not self._onnx_format:
scope=self._scope, transform_pass = QuantizationTransformPass(
place=self._place, scope=self._scope,
weight_bits=self._weight_bits, place=self._place,
activation_bits=self._activation_bits, weight_bits=self._weight_bits,
activation_quantize_type=self._activation_quantize_type, activation_bits=self._activation_bits,
weight_quantize_type=self._weight_quantize_type, activation_quantize_type=self._activation_quantize_type,
quantizable_op_type=major_quantizable_op_types) weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
else:
transform_pass = QuantizationTransformPassV2(
scope=self._scope,
place=self._place,
weight_bits=self._weight_bits,
activation_bits=self._activation_bits,
activation_quantize_type=self._activation_quantize_type,
weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
for sub_graph in graph.all_sub_graphs(): for sub_graph in graph.all_sub_graphs():
# Insert fake_quant/fake_dequantize op must in test graph, so # Insert fake_quant/fake_dequantize op must in test graph, so
...@@ -896,13 +910,20 @@ class PostTrainingQuantization(object): ...@@ -896,13 +910,20 @@ class PostTrainingQuantization(object):
# use AddQuantDequantPass to insert fake_quant_dequant op # use AddQuantDequantPass to insert fake_quant_dequant op
minor_quantizable_op_types = [] minor_quantizable_op_types = []
for op_type in AddQuantDequantPass._supported_quantizable_op_type: for op_type in utils._act_supported_quantizable_op_type:
if op_type in self._quantizable_op_type: if op_type in self._quantizable_op_type:
minor_quantizable_op_types.append(op_type) minor_quantizable_op_types.append(op_type)
add_quant_dequant_pass = AddQuantDequantPass( if not self._onnx_format:
scope=self._scope, add_quant_dequant_pass = AddQuantDequantPass(
place=self._place, scope=self._scope,
quantizable_op_type=minor_quantizable_op_types) place=self._place,
quantizable_op_type=minor_quantizable_op_types)
else:
add_quant_dequant_pass = AddQuantDequantPassV2(
scope=self._scope,
place=self._place,
quantizable_op_type=minor_quantizable_op_types,
is_full_quantized=self._is_full_quantize)
for sub_graph in graph.all_sub_graphs(): for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True sub_graph._for_test = True
...@@ -914,33 +935,39 @@ class PostTrainingQuantization(object): ...@@ -914,33 +935,39 @@ class PostTrainingQuantization(object):
else: else:
scale_dict = self._quantized_threshold scale_dict = self._quantized_threshold
for key, val in scale_dict.items(): for key, val in scale_dict.items():
set_variable_data( utils.set_variable_data(
self._scope, self._scope,
self._place, self._place,
key + ".scale", key + ".scale",
np.array( np.array(
[val], dtype=np.float32)) [val], dtype=np.float32))
set_variable_data( utils.set_variable_data(
self._scope, self._scope,
self._place, self._place,
key + ".quant_dequant.scale", key + ".quant_dequant.scale",
np.array( np.array(
[val], dtype=np.float32)) [val], dtype=np.float32))
# apply QuantizationFreezePass, and obtain the final quant model if not self._onnx_format:
freeze_pass = QuantizationFreezePass( # apply QuantizationFreezePass, and obtain the final quant model
scope=self._scope, freeze_pass = QuantizationFreezePass(
place=self._place, scope=self._scope,
bias_correction=self._bias_correction, place=self._place,
weight_bits=self._weight_bits, bias_correction=self._bias_correction,
round_type=self._round_type, weight_bits=self._weight_bits,
activation_bits=self._activation_bits, round_type=self._round_type,
weight_quantize_type=self._weight_quantize_type, activation_bits=self._activation_bits,
quantizable_op_type=major_quantizable_op_types) weight_quantize_type=self._weight_quantize_type,
quantizable_op_type=major_quantizable_op_types)
for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True for sub_graph in graph.all_sub_graphs():
freeze_pass.apply(sub_graph) sub_graph._for_test = True
freeze_pass.apply(sub_graph)
else:
quant_weight_pass = QuantWeightPass(self._scope, self._place)
for sub_graph in graph.all_sub_graphs():
sub_graph._for_test = True
quant_weight_pass.apply(sub_graph)
self._program = graph.to_program() self._program = graph.to_program()
...@@ -960,7 +987,7 @@ class PostTrainingQuantization(object): ...@@ -960,7 +987,7 @@ class PostTrainingQuantization(object):
op._set_attr("quantization_type", quantized_type) op._set_attr("quantization_type", quantized_type)
def analysis_and_save_info(op_node, out_var_name): def analysis_and_save_info(op_node, out_var_name):
argname_index = _get_output_name_index(op_node, out_var_name) argname_index = utils._get_output_name_index(op_node, out_var_name)
assert argname_index is not None, \ assert argname_index is not None, \
out_var_name + " is not the output of the op" out_var_name + " is not the output of the op"
if self._algo == "KL": if self._algo == "KL":
...@@ -997,7 +1024,7 @@ class PostTrainingQuantization(object): ...@@ -997,7 +1024,7 @@ class PostTrainingQuantization(object):
for op in self._program.blocks[block_id].ops: for op in self._program.blocks[block_id].ops:
if op.type in ( if op.type in (
self._quantizable_op_type + self._out_scale_op_list): self._quantizable_op_type + self._out_scale_op_list):
out_var_names = _get_op_output_var_names(op) out_var_names = utils._get_op_output_var_names(op)
for var_name in out_var_names: for var_name in out_var_names:
analysis_and_save_info(op, var_name) analysis_and_save_info(op, var_name)
...@@ -1020,11 +1047,11 @@ class PostTrainingQuantization(object): ...@@ -1020,11 +1047,11 @@ class PostTrainingQuantization(object):
quantization_type = str("post_" + self._algo).lower() quantization_type = str("post_" + self._algo).lower()
persistable_var_names = _all_persistable_var_names(self._program) persistable_var_names = _all_persistable_var_names(self._program)
for op in target_ops: for op in target_ops:
for var_name in _get_op_input_var_names(op): for var_name in utils._get_op_input_var_names(op):
if var_name in persistable_var_names: if var_name in persistable_var_names:
var_data = load_variable_data(self._scope, var_name) var_data = utils.load_variable_data(self._scope, var_name)
threshold = float(np.max(np.abs(var_data))) threshold = float(np.max(np.abs(var_data)))
argname, index = _get_input_name_index(op, var_name) argname, index = utils._get_input_name_index(op, var_name)
op._set_attr(argname + str(index) + "_threshold", threshold) op._set_attr(argname + str(index) + "_threshold", threshold)
op._set_attr("quantization_type", quantization_type) op._set_attr("quantization_type", quantization_type)
op._set_attr("bit_length", self._weight_bits) op._set_attr("bit_length", self._weight_bits)
...@@ -1268,7 +1295,7 @@ class WeightQuantization(object): ...@@ -1268,7 +1295,7 @@ class WeightQuantization(object):
save_weight_dtype = np.int8 if weight_bits == 8 else np.int16 save_weight_dtype = np.int8 if weight_bits == 8 else np.int16
# Get quantized scale and weight data # Get quantized scale and weight data
weight_data = load_variable_data(scope, var_name) weight_data = utils.load_variable_data(scope, var_name)
if abs(threshold_rate) < 1e-10: if abs(threshold_rate) < 1e-10:
threshold_value = np.max(np.abs(weight_data)) threshold_value = np.max(np.abs(weight_data))
else: else:
...@@ -1282,11 +1309,13 @@ class WeightQuantization(object): ...@@ -1282,11 +1309,13 @@ class WeightQuantization(object):
# Set weight data # Set weight data
if not for_test: if not for_test:
set_variable_data(scope, place, var_name, quantized_weight_data) utils.set_variable_data(scope, place, var_name,
quantized_weight_data)
else: else:
dequantized_weight_data = \ dequantized_weight_data = \
(quantized_weight_data * scale).astype(np.float32) (quantized_weight_data * scale).astype(np.float32)
set_variable_data(scope, place, var_name, dequantized_weight_data) utils.set_variable_data(scope, place, var_name,
dequantized_weight_data)
# Save info # Save info
op._set_attr('quantization_type', 'post_weight_abs_max') op._set_attr('quantization_type', 'post_weight_abs_max')
...@@ -1303,7 +1332,7 @@ class WeightQuantization(object): ...@@ -1303,7 +1332,7 @@ class WeightQuantization(object):
save_weight_dtype = np.int8 if weight_bits == 8 else np.int16 save_weight_dtype = np.int8 if weight_bits == 8 else np.int16
# Get quantized scale and weight data # Get quantized scale and weight data
weight_data = load_variable_data(scope, var_name) weight_data = utils.load_variable_data(scope, var_name)
if op.type == "mul": if op.type == "mul":
scales, quantized_weight_data = \ scales, quantized_weight_data = \
self._mul_channel_wise_quantization(weight_data, self._mul_channel_wise_quantization(weight_data,
...@@ -1317,7 +1346,8 @@ class WeightQuantization(object): ...@@ -1317,7 +1346,8 @@ class WeightQuantization(object):
# Set weight data # Set weight data
if not for_test: if not for_test:
set_variable_data(scope, place, var_name, quantized_weight_data) utils.set_variable_data(scope, place, var_name,
quantized_weight_data)
else: else:
if op.type == "mul": if op.type == "mul":
dequantized_weight_data = \ dequantized_weight_data = \
...@@ -1328,7 +1358,8 @@ class WeightQuantization(object): ...@@ -1328,7 +1358,8 @@ class WeightQuantization(object):
else: else:
_logger.error(op.type + _logger.error(op.type +
" is not supported by weight quantization") " is not supported by weight quantization")
set_variable_data(scope, place, var_name, dequantized_weight_data) utils.set_variable_data(scope, place, var_name,
dequantized_weight_data)
# Save info # Save info
op._set_attr('quantization_type', 'post_weight_channel_wise_abs_max') op._set_attr('quantization_type', 'post_weight_channel_wise_abs_max')
......
...@@ -26,12 +26,20 @@ from ....data import data ...@@ -26,12 +26,20 @@ from ....data import data
from ....layers import mean from ....layers import mean
from ....executor import scope_guard from ....executor import scope_guard
from ....framework import _get_paddle_place from ....framework import _get_paddle_place
from .utils import _channelwise_quant_axis1_ops, quant_tensor from . import utils
__all__ = [ __all__ = [
'QuantizationTransformPass', 'QuantizationFreezePass', 'ConvertToInt8Pass', 'QuantizationTransformPass',
'TransformForMobilePass', 'OutScaleForTrainingPass', 'QuantizationFreezePass',
'OutScaleForInferencePass', 'AddQuantDequantPass' 'ConvertToInt8Pass',
'TransformForMobilePass',
'OutScaleForTrainingPass',
'OutScaleForInferencePass',
'AddQuantDequantPass',
'QuantizationTransformPassV2',
'AddQuantDequantPassV2',
'ReplaceFakeQuantDequantPass',
'QuantWeightPass',
] ]
_fake_quant_op_list = [ _fake_quant_op_list = [
...@@ -44,278 +52,13 @@ _fake_dequant_op_list = [ ...@@ -44,278 +52,13 @@ _fake_dequant_op_list = [
] ]
_fake_quant_dequant_op_list = [ _fake_quant_dequant_op_list = [
'fake_quantize_dequantize_moving_average_abs_max' 'fake_quantize_dequantize_moving_average_abs_max',
"fake_channel_wise_quantize_dequantize_abs_max",
] ]
_out_scale_op_list = [
"conv2d",
"depthwise_conv2d",
"mul",
"matmul",
"matmul_v2",
"relu",
"leaky_relu",
"relu6",
"sigmoid",
"tanh",
"prelu",
"swish",
"dropout",
"softmax",
"batch_norm",
"layer_norm",
"elementwise_add",
"pool2d",
"reshape2",
"transpose2",
"concat",
"elementwise_mul",
"elementwise_pow",
"elementwise_sub",
"scale",
"slice",
"hard_swish",
"hard_sigmoid",
"conv2d_transpose",
"gru",
"bilinear_interp",
"nearest_interp",
"trilinear_interp",
"flatten",
"flatten2",
"transpose",
"pad2d",
"pad3d",
"reshape",
"split",
"flatten_contiguous_range",
"squeeze",
"squeeze2",
"nearest_interp_v2",
"fill_constant_batch_size_like",
"bilinear_interp",
"bilinear_interp_v2",
"arg_max",
"abs",
"assign",
"cast",
"clip",
"box_coder",
"crop",
"cumsum",
"equal",
"expand_v2",
"fill_any_like",
"fill_constant",
"gelu",
"instance_norm",
"lookup_table",
"lookup_table_v2",
"norm",
"p_norm",
"pow",
"reduce_mean",
"stack",
"top_k_v2",
"unsqueeze",
"unsqueeze2",
"logical_and",
"logical_not",
"meshgrid",
"roi_align",
"strided_slice",
"where",
"grid_sampler",
"tile",
"group_norm",
"reduce_sum",
"square",
"softplus",
"gather",
"shuffle_channel",
]
# list op real input and output names, to avoid processing input such as AxisTensor.
_op_real_in_out_name = {
"conv2d": [["Input", "Filter"], ["Output"]],
"depthwise_conv2d": [["Input", "Filter"], ["Output"]],
"conv2d_transpose": [["Input", "Filter"], ["Output"]],
"mul": [["X", "Y"], ["Out"]],
"matmul": [["X", "Y"], ["Out"]],
"matmul_v2": [["X", "Y"], ["Out"]],
"pool2d": [["X"], ["Out"]],
"elementwise_add": [["X", "Y"], ["Out"]],
"concat": [["X"], ["Out"]],
"softmax": [["X"], ["Out"]],
"argmax": [["X"], ["Out"]],
"transpose": [["X"], ["Out"]],
"equal": [["X", "Y"], ["Out"]],
"gather": [["X"], ["Out"]],
"greater_equal": [["X", "Y"], ["Out"]],
"greater_than": [["X", "Y"], ["Out"]],
"less_equal": [["X", "Y"], ["Out"]],
"less_than": [["X", "Y"], ["Out"]],
"mean": [["X"], ["Out"]],
"not_equal": [["X", "Y"], ["Out"]],
"reshape": [["X"], ["Out"]],
"reshape2": [["X"], ["Out"]],
"transpose2": [["X"], ["Out"]],
"bilinear_interp": [["X"], ["Out"]],
"nearest_interp": [["X"], ["Out"]],
"trilinear_interp": [["X"], ["Out"]],
"slice": [["Input"], ["Out"]],
"squeeze": [["X"], ["Out"]],
"elementwise_sub": [["X", "Y"], ["Out"]],
"relu": [["X"], ["Out"]],
"relu6": [["X"], ["Out"]],
"leaky_relu": [["X"], ["Out"]],
"prelu": [["X", "Alpha"], ["Out"]],
"tanh": [["X"], ["Out"]],
"swish": [["X"], ["Out"]],
"dropout": [["X"], ["Out"]],
"batch_norm": [["X"], ["Y"]],
"layer_norm": [["X"], ["Y"]],
"sigmoid": [["X"], ["Out"]],
"elementwise_mul": [["X", "Y"], ["Out"]],
"elementwise_pow": [["X", "Y"], ["Out"]],
"scale": [["X"], ["Out"]],
"hard_swish": [["X"], ["Out"]],
"hard_sigmoid": [["X"], ["Out"]],
"gru": [["Input", "Weight"], ["Hidden"]],
"lstm": [["Input", "Weight"], ["Hidden"]],
"pad2d": [["X"], ["Out"]],
"pad3d": [["X"], ["Out"]],
"flatten": [["X"], ["Out"]],
"flatten2": [["X"], ["Out"]],
"unsqueeze2": [["X"], ["Out"]],
"unsqueeze2": [["X"], ["Out"]],
"flatten_contiguous_range": [["X"], ["Out"]],
"split": [["X"], ["Out"]],
"squeeze2": [["X"], ["Out"]],
"nearest_interp_v2": [["X"], ["Out"]],
"bilinear_interp": [["X"], ["Out"]],
"bilinear_interp_v2": [["X"], ["Out"]],
"fill_constant_batch_size_like": [["Input"], ["Out"]],
"arg_max": [["X"], ["Out"]],
"abs": [["X"], ["Out"]],
"assign": [["X"], ["Out"]],
"cast": [["X"], ["Out"]],
"clip": [["X"], ["Out"]],
"box_coder": [["PriorBox"], ["OutputBox"]],
"crop": [["X"], ["Out"]],
"cumsum": [["X"], ["Out"]],
"expand_v2": [["X"], ["Out"]],
"fill_any_like": [["X"], ["Out"]],
"fill_constant": [[], ["Out"]],
"gelu": [["X"], ["Out"]],
"instance_norm": [["X"], ["Out"]],
"lookup_table": [["W", "Ids"], ["Out"]],
"lookup_table_v2": [["W", "Ids"], ["Out"]],
"norm": [["X"], ["Norm"]],
"p_norm": [["X"], ["Out"]],
"pow": [["X"], ["Out"]],
"reduce_mean": [["X"], ["Out"]],
"stack": [["X"], ["Y"]],
"top_k_v2": [["X"], ["Out", "Indices"]],
"logical_and": [["X", "Y"], ["Out"]],
"logical_not": [["X"], ["Out"]],
"meshgrid": [["X"], ["Out"]],
"roi_align": [["X", "ROIs"], ["Out"]],
"strided_slice": [["Input"], ["Out"]],
"where": [["Condition", "X", "Y"], ["Out"]],
"grid_sampler": [["X", "Grid"], ["Output"]],
"tile": [["X"], ["Out"]],
"group_norm": [["X"], ["Y", "Mean", "Variance"]],
"reduce_sum": [["X"], ["Out"]],
"square": [["X"], ["Out"]],
"softplus": [["X"], ["Out"]],
"shuffle_channel": [["X"], ["Out"]],
}
_conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose']
_SCALE_DEFAULT_VALUE = 0.001
def _get_op_input_var_names(op):
"""
Get the input var names of the op.
Args:
op(IrNode, Operator): the input op.
Returns:
input_var_names or None.
"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
var_names = []
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return []
name_list = _op_real_in_out_name[op_name][0]
for name in name_list:
var_name = op.input(name)
if isinstance(var_name, list):
var_names.extend(var_name)
else:
var_names.append(var_name)
return var_names
def _get_input_name_index(op, input_var_name):
"""Get the input name and index of the var_name in the op"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return None
res = None
for argname in _op_real_in_out_name[op_name][0]:
var_names = op.input(argname)
for index, name in enumerate(var_names):
if name == input_var_name:
res = (argname, index)
return res
def _get_op_output_var_names(op):
""" """
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
var_names = []
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return []
name_list = _op_real_in_out_name[op_name][1]
for name in name_list:
var_name = op.output(name)
if isinstance(var_name, list):
var_names.extend(var_name)
else:
var_names.append(var_name)
return var_names
def _get_output_name_index(op, output_var_name):
"""Get the output name and index of the var_name in the op"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return None
name_list = _op_real_in_out_name[op_name][1]
res = None
for name in name_list:
var_name = op.output(name)
for index, val in enumerate(var_name):
if val == output_var_name:
res = (name, index)
return res
def _init_var_node(var_node, value, scope, place): def _init_var_node(var_node, value, scope, place):
...@@ -334,7 +77,7 @@ def _is_input_all_not_persistable(graph, op_node): ...@@ -334,7 +77,7 @@ def _is_input_all_not_persistable(graph, op_node):
Analyse the real inputs of the op node are all not persistable. Analyse the real inputs of the op node are all not persistable.
''' '''
is_input_all_not_persistable = True is_input_all_not_persistable = True
for var_name in _get_op_input_var_names(op_node): for var_name in utils._get_op_input_var_names(op_node):
in_node = graph._find_node_by_name(op_node.inputs, var_name) in_node = graph._find_node_by_name(op_node.inputs, var_name)
is_input_all_not_persistable = (is_input_all_not_persistable and \ is_input_all_not_persistable = (is_input_all_not_persistable and \
(not in_node.persistable())) (not in_node.persistable()))
...@@ -360,10 +103,6 @@ class QuantizationTransformPass(object): ...@@ -360,10 +103,6 @@ class QuantizationTransformPass(object):
Quantize the ops that have weights. Add quant and dequant ops for Quantize the ops that have weights. Add quant and dequant ops for
the quantized ops's inputs. the quantized ops's inputs.
""" """
_supported_quantizable_op_type = [
'conv2d', 'depthwise_conv2d', 'conv2d_transpose', 'mul', 'matmul',
'matmul_v2'
]
def __init__(self, def __init__(self,
scope=None, scope=None,
...@@ -493,7 +232,7 @@ class QuantizationTransformPass(object): ...@@ -493,7 +232,7 @@ class QuantizationTransformPass(object):
self._quantizable_ops = quantizable_op_type self._quantizable_ops = quantizable_op_type
for op in self._quantizable_ops: for op in self._quantizable_ops:
assert op in QuantizationTransformPass._supported_quantizable_op_type, \ assert op in utils._weight_supported_quantizable_op_type, \
op + " is not supported for quantization." op + " is not supported for quantization."
self._quantizable_grad_ops = [ self._quantizable_grad_ops = [
'%s_grad' % (op) for op in self._quantizable_ops '%s_grad' % (op) for op in self._quantizable_ops
...@@ -588,7 +327,7 @@ class QuantizationTransformPass(object): ...@@ -588,7 +327,7 @@ class QuantizationTransformPass(object):
else self._activation_quantize_type else self._activation_quantize_type
if quant_type == 'channel_wise_abs_max': # Weight quantization if quant_type == 'channel_wise_abs_max': # Weight quantization
quant_axis = 1 if op.name() in \ quant_axis = 1 if op.name() in \
_channelwise_quant_axis1_ops else 0 utils._channelwise_quant_axis1_ops else 0
quant_var_node, scale_var_node = self._insert_channel_quant_op( quant_var_node, scale_var_node = self._insert_channel_quant_op(
graph, var_node, name, quant_bits, quant_axis) graph, var_node, name, quant_bits, quant_axis)
dequant_var_node = self._insert_channel_dequant_op( dequant_var_node = self._insert_channel_dequant_op(
...@@ -753,7 +492,7 @@ class QuantizationTransformPass(object): ...@@ -753,7 +492,7 @@ class QuantizationTransformPass(object):
_init_var_node( _init_var_node(
scale_in_node, scale_in_node,
np.array( np.array(
[0.001], dtype=data_type), [_SCALE_DEFAULT_VALUE], dtype=data_type),
self._scope, self._scope,
self._place) self._place)
...@@ -821,7 +560,7 @@ class QuantizationTransformPass(object): ...@@ -821,7 +560,7 @@ class QuantizationTransformPass(object):
_init_var_node( _init_var_node(
scale_in_node, scale_in_node,
np.array( np.array(
[0.001], dtype=data_type), [_SCALE_DEFAULT_VALUE], dtype=data_type),
self._scope, self._scope,
self._place) self._place)
...@@ -1289,17 +1028,21 @@ class QuantizationFreezePass(object): ...@@ -1289,17 +1028,21 @@ class QuantizationFreezePass(object):
if self._round_type == 'round': if self._round_type == 'round':
if any( if any(
_check_grandchild_op_node(op_node, op) _check_grandchild_op_node(op_node, op)
for op in _channelwise_quant_axis1_ops): for op in utils._channelwise_quant_axis1_ops):
quant_axis = 1 quant_axis = 1
else: else:
quant_axis = 0 quant_axis = 0
quantized_param_v = quant_tensor(param_v.copy(), quantized_param_v = utils.quant_tensor(
scale_v, quant_axis, param_v.copy(), scale_v, quant_axis,
self._weight_bits) self._weight_bits)
quantized_param_v = np.round(quantized_param_v) quantized_param_v = np.round(quantized_param_v)
if self._bias_correction == True: if self._bias_correction == True:
quantized_param_v = self._bias_correction_w( quantized_param_v = utils.bias_correction_w(
param_v, quantized_param_v, scale_v, quant_axis) param_v,
quantized_param_v,
scale_v,
quant_axis,
weight_bits=self._weight_bits)
quantized_param_v = np.round(quantized_param_v) quantized_param_v = np.round(quantized_param_v)
self._restore_var(input_arg_name, quantized_param_v) self._restore_var(input_arg_name, quantized_param_v)
self._remove_fake_quant_and_dequant_op(graph, op_node) self._remove_fake_quant_and_dequant_op(graph, op_node)
...@@ -1319,7 +1062,7 @@ class QuantizationFreezePass(object): ...@@ -1319,7 +1062,7 @@ class QuantizationFreezePass(object):
op_node_desc.attr("quantization_type") == "qat_with_weight": op_node_desc.attr("quantization_type") == "qat_with_weight":
if self._weight_quantize_type == 'channel_wise_abs_max': if self._weight_quantize_type == 'channel_wise_abs_max':
quant_axis = 1 if op_node.name() in \ quant_axis = 1 if op_node.name() in \
_channelwise_quant_axis1_ops else 0 utils._channelwise_quant_axis1_ops else 0
self._insert_post_channel_dequant_op(graph, op_node, self._insert_post_channel_dequant_op(graph, op_node,
quant_axis) quant_axis)
else: else:
...@@ -1519,46 +1262,6 @@ class QuantizationFreezePass(object): ...@@ -1519,46 +1262,6 @@ class QuantizationFreezePass(object):
return isinstance(v, float) or isinstance(v, np.float32) \ return isinstance(v, float) or isinstance(v, np.float32) \
or isinstance(v, np.float64) or isinstance(v, np.float64)
def _bias_correction_w(self, x, x_quant, scale_v, quant_axis):
'''
Bias correction for weight
'''
eps = 1e-8
bnt = (1 << (self._weight_bits - 1)) - 1
x_dequant = x_quant.copy()
if isinstance(scale_v, list):
if quant_axis == 0:
for i, s in enumerate(scale_v):
x_dequant[i] = x_dequant[i] * s / bnt
quant_bias = x - x_dequant
mean_bias = quant_bias.reshape(quant_bias.shape[0], -1).mean(-1)
std_orig = x.reshape(x.shape[0], -1).std(-1)
std_quant = x_dequant.reshape(x_dequant.shape[0], -1).std(-1)
std_bias = std_orig / (std_quant + eps)
else:
for i, s in enumerate(scale_v):
x_dequant[:, i] = x_quant[:, i] * s / bnt
quant_bias = x - x_dequant
mean_bias = np.array([
quant_bias[:, i].mean() for i in range(quant_bias.shape[1])
])
std_orig = np.array([x[:, i].std() for i in range(x.shape[1])])
std_quant = np.array(
[x_dequant[:, i].std() for i in range(x_dequant.shape[1])])
std_bias = std_orig / (std_quant + eps)
else:
x_dequant = x_quant * scale_v / bnt
mean_bias = (x - x_dequant).mean()
std_bias = x.std() / (x_dequant.std() + eps)
if mean_bias.ndim == 1:
std_bias = np.resize(std_bias, x.shape)
mean_bias = np.resize(mean_bias, x.shape)
x_dequant = (mean_bias + x_dequant) * std_bias
quantized_param_v = quant_tensor(x_dequant, scale_v, quant_axis,
self._weight_bits)
return quantized_param_v
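
For context on the helper that replaces the removed method (utils.bias_correction_w), the per-tensor case amounts to re-matching the mean and standard deviation of the dequantized weights against the originals; a sketch with illustrative names, not the actual utils implementation:

import numpy as np

def bias_correct_per_tensor(w, w_quant, scale, weight_bits=8, eps=1e-8):
    bnt = (1 << (weight_bits - 1)) - 1
    w_dequant = w_quant * scale / bnt              # reconstruct float weights
    mean_bias = (w - w_dequant).mean()             # mean shift introduced by rounding
    std_bias = w.std() / (w_dequant.std() + eps)   # std shrinkage introduced by quantization
    # corrected float weights; the caller re-quantizes them with the same scale
    return (w_dequant + mean_bias) * std_bias
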
class ConvertToInt8Pass(object): class ConvertToInt8Pass(object):
def __init__(self, scope, place, quantizable_op_type=None): def __init__(self, scope, place, quantizable_op_type=None):
...@@ -1707,7 +1410,7 @@ class OutScaleForTrainingPass(object): ...@@ -1707,7 +1410,7 @@ class OutScaleForTrainingPass(object):
self._place = _get_paddle_place(place) self._place = _get_paddle_place(place)
self._moving_rate = moving_rate self._moving_rate = moving_rate
self._is_test = None self._is_test = None
self._teller_set = _out_scale_op_list self._teller_set = utils._out_scale_op_list
def apply(self, graph): def apply(self, graph):
""" """
...@@ -1725,7 +1428,7 @@ class OutScaleForTrainingPass(object): ...@@ -1725,7 +1428,7 @@ class OutScaleForTrainingPass(object):
if op.name() in self._teller_set: if op.name() in self._teller_set:
target_ops.append(op) target_ops.append(op)
for op in target_ops: for op in target_ops:
for output_var_name in _get_op_output_var_names(op): for output_var_name in utils._get_op_output_var_names(op):
in_node = graph._find_node_by_name(op.outputs, output_var_name) in_node = graph._find_node_by_name(op.outputs, output_var_name)
if in_node.dtype() not in \ if in_node.dtype() not in \
[core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]: [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]:
...@@ -1796,14 +1499,13 @@ class OutScaleForTrainingPass(object): ...@@ -1796,14 +1499,13 @@ class OutScaleForTrainingPass(object):
graph.link_to(accum_in_node, scale_op_node) graph.link_to(accum_in_node, scale_op_node)
graph.link_to(scale_op_node, state_out_node) graph.link_to(scale_op_node, state_out_node)
graph.link_to(scale_op_node, accum_out_node) graph.link_to(scale_op_node, accum_out_node)
graph.resolve_hazard()
return graph return graph
def _scale_name(self, var_name): def _scale_name(self, var_name):
""" """
Return the scale name for the var named `var_name`. Return the scale name for the var named `var_name`.
""" """
return "%s@scale" % (var_name) return "%s.scale" % (var_name)
class OutScaleForInferencePass(object): class OutScaleForInferencePass(object):
...@@ -1816,7 +1518,7 @@ class OutScaleForInferencePass(object): ...@@ -1816,7 +1518,7 @@ class OutScaleForInferencePass(object):
scope(fluid.Scope): The scope is used to initialize these new parameters. scope(fluid.Scope): The scope is used to initialize these new parameters.
""" """
self._scope = scope self._scope = scope
self._teller_set = _out_scale_op_list self._teller_set = utils._out_scale_op_list
def apply(self, graph): def apply(self, graph):
""" """
...@@ -1831,7 +1533,7 @@ class OutScaleForInferencePass(object): ...@@ -1831,7 +1533,7 @@ class OutScaleForInferencePass(object):
op_nodes = graph.all_op_nodes() op_nodes = graph.all_op_nodes()
for op_node in op_nodes: for op_node in op_nodes:
if op_node.name() in self._teller_set: if op_node.name() in self._teller_set:
var_names = _get_op_output_var_names(op_node) var_names = utils._get_op_output_var_names(op_node)
for var_name in var_names: for var_name in var_names:
in_node = graph._find_node_by_name(op_node.outputs, in_node = graph._find_node_by_name(op_node.outputs,
var_name) var_name)
...@@ -1848,7 +1550,8 @@ class OutScaleForInferencePass(object): ...@@ -1848,7 +1550,8 @@ class OutScaleForInferencePass(object):
# For compatibility, we save output threshold by two methods. # For compatibility, we save output threshold by two methods.
op_node.op()._set_attr("out_threshold", float(scale_value)) op_node.op()._set_attr("out_threshold", float(scale_value))
argname_index = _get_output_name_index(op_node, var_name) argname_index = utils._get_output_name_index(op_node,
var_name)
assert argname_index is not None, \ assert argname_index is not None, \
var_name + " is not the output of the op" var_name + " is not the output of the op"
op_node.op()._set_attr(argname_index[0] + str(argname_index[1]) \ op_node.op()._set_attr(argname_index[0] + str(argname_index[1]) \
...@@ -1861,7 +1564,7 @@ class OutScaleForInferencePass(object): ...@@ -1861,7 +1564,7 @@ class OutScaleForInferencePass(object):
""" """
Return the scale name for the var named `var_name`. Return the scale name for the var named `var_name`.
""" """
return "%s@scale" % (var_name) return "%s.scale" % (var_name)
class AddQuantDequantPass(object): class AddQuantDequantPass(object):
...@@ -1869,95 +1572,6 @@ class AddQuantDequantPass(object): ...@@ -1869,95 +1572,6 @@ class AddQuantDequantPass(object):
Quantize the ops that do not have weights, and add quant_dequant op for the Quantize the ops that do not have weights, and add quant_dequant op for the
quantized ops's inputs. quantized ops's inputs.
""" """
_supported_quantizable_op_type = [
"pool2d",
"elementwise_add",
"concat",
"softmax",
"argmax",
"transpose",
"equal",
"gather",
"greater_equal",
"greater_than",
"less_equal",
"less_than",
"mean",
"not_equal",
"reshape",
"reshape2",
"dropout",
"bilinear_interp",
"nearest_interp",
"trilinear_interp",
"slice",
"squeeze",
"elementwise_sub",
"mul",
"matmul",
"relu",
"relu6",
"leaky_relu",
"tanh",
"swish",
"scale",
"transpose",
"transpose2",
"sigmoid",
"pad2d",
"flatten",
"flatten2",
"batch_norm",
"layer_norm",
"matmul_v2",
"split",
"flatten_contiguous_range",
"squeeze2",
"nearest_interp_v2",
"bilinear_interp",
"bilinear_interp_v2",
"fill_constant_batch_size_like",
"arg_max",
"abs",
"assign",
"cast",
"clip",
"box_coder",
"crop",
"cumsum",
"elementwise_mul",
"elementwise_pow",
"expand_v2",
"fill_any_like",
"fill_constant",
"gelu",
"hard_sigmoid",
"hard_swish",
"instance_norm",
"lookup_table",
"lookup_table_v2",
"norm",
"p_norm",
"pad3d",
"pow",
"prelu",
"reduce_mean",
"unsqueeze",
"unsqueeze2",
"logical_and",
"logical_not",
"meshgrid",
"roi_align",
"strided_slice",
"where",
"grid_sampler",
"tile",
"group_norm",
"reduce_sum",
"square",
"softplus",
"shuffle_channel",
]
# To be compatible with PaddleSlim, not remove _activation_type for now # To be compatible with PaddleSlim, not remove _activation_type for now
_activation_type = ["relu", "relu6", "leaky_relu", "tanh", "swish"] _activation_type = ["relu", "relu6", "leaky_relu", "tanh", "swish"]
...@@ -2000,12 +1614,11 @@ class AddQuantDequantPass(object): ...@@ -2000,12 +1614,11 @@ class AddQuantDequantPass(object):
self._skip_pattern = skip_pattern self._skip_pattern = skip_pattern
if is_full_quantized: if is_full_quantized:
self._quantizable_op_type = \ self._quantizable_op_type = utils._act_supported_quantizable_op_type
AddQuantDequantPass._supported_quantizable_op_type
else: else:
self._quantizable_op_type = quantizable_op_type self._quantizable_op_type = quantizable_op_type
for op_type in quantizable_op_type: for op_type in quantizable_op_type:
assert op_type in AddQuantDequantPass._supported_quantizable_op_type, \ assert op_type in utils._act_supported_quantizable_op_type, \
op_type + " is not supported for quantization." op_type + " is not supported for quantization."
self._quantizable_grad_op_type = [ self._quantizable_grad_op_type = [
'%s_grad' % (op) for op in self._quantizable_op_type '%s_grad' % (op) for op in self._quantizable_op_type
...@@ -2050,7 +1663,7 @@ class AddQuantDequantPass(object): ...@@ -2050,7 +1663,7 @@ class AddQuantDequantPass(object):
"qat_without_weight") "qat_without_weight")
op_node.op()._set_attr("activation_bits", self._quant_bits) op_node.op()._set_attr("activation_bits", self._quant_bits)
op_node.op()._set_attr("with_quant_attr", True) op_node.op()._set_attr("with_quant_attr", True)
arg_names = _get_op_input_var_names(op_node) arg_names = utils._get_op_input_var_names(op_node)
for arg_name in arg_names: for arg_name in arg_names:
in_node = graph._find_node_by_name(op_node.inputs, arg_name) in_node = graph._find_node_by_name(op_node.inputs, arg_name)
if arg_name in dequantized_vars_map: if arg_name in dequantized_vars_map:
...@@ -2095,7 +1708,7 @@ class AddQuantDequantPass(object): ...@@ -2095,7 +1708,7 @@ class AddQuantDequantPass(object):
_init_var_node( _init_var_node(
scale_in_node, scale_in_node,
np.array( np.array(
[0.001], dtype=data_type), [_SCALE_DEFAULT_VALUE], dtype=data_type),
self._scope, self._scope,
self._place) self._place)
...@@ -2162,3 +1775,870 @@ class AddQuantDequantPass(object): ...@@ -2162,3 +1775,870 @@ class AddQuantDequantPass(object):
graph.link_to(quant_op_node, accum_out_node) graph.link_to(quant_op_node, accum_out_node)
return quant_var_node, scale_out_node return quant_var_node, scale_out_node
class InsertQuantizeLinear(object):
"""
Insert quantize_linear and dequantize_linear ops before the ops to be quantized.
Args:
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to restore the weight tensors.
If it is a string, it can be ``cpu`` or ``gpu:x``, where ``x`` is the index of the GPU.
scope(paddle.Scope): scope is used to get the weight tensor values.
quant_bits(int, optional): quantization bit number for weight. Default is 8.
quant_axis(int, optional): quantization dimension of channels. When it is greater than or
equal to 0, quantization is applied per channel along that axis; otherwise it is
applied per layer. Default is -1.
channel_wise(bool, optional): Whether to quantize per channel or not. Default is False.
is_test(bool, optional): Whether the graph is for inference (True) or training (False). Default is True.
"""
def __init__(self,
place,
scope,
quant_bits=8,
quant_axis=-1,
channel_wise=False,
is_test=True):
self._place = place
self._scope = scope
self.quant_bits = quant_bits
self.quant_axis = quant_axis
self.channel_wise = channel_wise
self._is_test = is_test
def insert_quant_op(self, graph, var_node):
assert var_node.is_var(), '{} is not a var'.format(var_node.name())
quant_var_node = graph.create_var_node(
name=self._quantized_var_name(var_node.name()),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
data_type = 'float64' if var_node.dtype(
) == core.VarDesc.VarType.FP64 else 'float32'
if self.channel_wise:
scale_var_shape = var_node.shape()[self.quant_axis]
scale_var_type = core.VarDesc.VarType.LOD_TENSOR
init_scale_value = np.zeros(scale_var_shape, dtype=data_type)
else:
scale_var_shape = 1
scale_var_type = var_node.type()
init_scale_value = np.array([_SCALE_DEFAULT_VALUE], dtype=data_type)
scale_var_node = graph.create_persistable_node(
name=self._quantized_scale_name(var_node.name()),
var_type=scale_var_type,
shape=[scale_var_shape],
var_dtype=var_node.dtype())
_init_var_node(scale_var_node, init_scale_value, self._scope,
self._place)
zero_point_node = None
if zero_point_node is None:
zero_point_node = graph.create_persistable_node(
name=self._zero_point_name(quant_var_node.name()),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=scale_var_node.shape(),
var_dtype=core.VarDesc.VarType.INT32)
_init_var_node(
zero_point_node,
np.zeros(
scale_var_node.shape(), dtype="int32"),
self._scope,
self._place)
inputs = {"X": var_node, "Scale": scale_var_node}
if zero_point_node is not None:
inputs["ZeroPoint"] = zero_point_node
attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
outputs = {"Y": quant_var_node}
if not self._is_test:
attrs["is_test"] = self._is_test
attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
scale_out_node = graph.create_var_node_from_desc(scale_var_node.var(
))
outputs["OutScale"] = scale_out_node
quant_op_node = graph.create_op_node(
op_type="quantize_linear",
attrs=attrs,
inputs=inputs,
outputs=outputs)
graph.link_to(var_node, quant_op_node)
graph.link_to(scale_var_node, quant_op_node)
if zero_point_node is not None:
graph.link_to(zero_point_node, quant_op_node)
graph.link_to(quant_op_node, quant_var_node)
if not self._is_test:
graph.link_to(quant_op_node, scale_out_node)
return quant_var_node, scale_var_node
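# Rough sketch of what the inserted quantize_linear op computes at runtime
# (assuming symmetric quantization with the zero point initialized to 0 above):
#   bnt = (1 << (quant_bits - 1)) - 1            # e.g. 127 for 8 bits
#   y   = clip(round(x / scale * bnt), -bnt, bnt)
# applied per layer, or per channel along quant_axis when channel_wise is True.
# The exact arithmetic lives in the quantize_linear kernel; this comment only
# documents the intent of the inserted node.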
def insert_dequant_op(self, graph, var_node, scale_var_node):
assert var_node.is_var(), '{} is not a var'.format(var_node.name())
dequant_var_node = graph.create_var_node(
name=self._dequantized_var_name(var_node.name()),
var_type=var_node.type(),
shape=var_node.shape(),
var_dtype=var_node.dtype())
zero_point_node = None
if zero_point_node is None:
zero_point_node = graph.create_persistable_node(
name=self._zero_point_name(dequant_var_node.name()),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=scale_var_node.shape(),
var_dtype=core.VarDesc.VarType.INT32)
_init_var_node(
zero_point_node,
np.zeros(
scale_var_node.shape(), dtype="int32"),
self._scope,
self._place)
inputs = {"X": var_node, "Scale": scale_var_node}
if zero_point_node is not None:
inputs["ZeroPoint"] = zero_point_node
attrs = {"quant_axis": self.quant_axis, "bit_length": self.quant_bits}
if not self._is_test:
attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward
quant_op_node = graph.create_op_node(
op_type="dequantize_linear",
attrs=attrs,
inputs=inputs,
outputs={"Y": dequant_var_node})
graph.link_to(var_node, quant_op_node)
graph.link_to(scale_var_node, quant_op_node)
if zero_point_node is not None:
graph.link_to(zero_point_node, quant_op_node)
graph.link_to(quant_op_node, dequant_var_node)
return dequant_var_node
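# Rough sketch of the matching dequantize_linear computation (again assuming a
# zero point of 0): x_hat = y * scale / bnt, with bnt = (1 << (quant_bits - 1)) - 1.
# Together with the quantize_linear op inserted above, this forms the usual
# quantize-dequantize ("fake quant") pair used during QAT.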
def _quantized_var_name(self, var_name):
"""
Return quantized variable name for the input `var_name`.
"""
return "%s.quantized" % (var_name)
def _dequantized_var_name(self, var_name):
"""
Return dequantized variable name for the input `var_name`.
"""
return "%s.dequantized" % (var_name)
def _quantized_scale_name(self, var_name):
"""
Return the scale name of quantized variable for the input `var_name`.
"""
return "%s.scale" % (var_name)
def _zero_point_name(self, var_name):
"""
Return the zero-point name for the var named `var_name`.
"""
return "%s@zero_point" % (var_name)
class QuantizationTransformPassV2(object):
"""
Quantize the ops that have weights. Add quant and dequant ops for
the quantized ops' inputs.
"""
def __init__(self,
scope=None,
place=None,
weight_bits=8,
activation_bits=8,
activation_quantize_type='abs_max',
weight_quantize_type='abs_max',
window_size=10000,
moving_rate=0.9,
skip_pattern=['skip_quant'],
quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'],
weight_quantize_func=None,
act_quantize_func=None,
weight_preprocess_func=None,
act_preprocess_func=None,
optimizer_func=None,
executor=None):
r"""
Args:
scope(paddle.Scope): When activation use 'range_abs_max' as the quantize
type, this pass will create some new parameters. The scope is used to
initialize these new parameters.
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new
parameters described above. If it is a string, it can be ``cpu`` or ``gpu:x``,
where ``x`` is the index of the GPU.
weight_bits(int): quantization bit number for weights,
the bias is not quantized.
activation_bits(int): quantization bit number for activation.
activation_quantize_type(str): quantization type for activation,
now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
If use 'abs_max' mode, the quantization scale will be calculated
dynamically each step in both training and testing period. If use
'range_abs_max', a static quantization scale will be calculated
during training and used in inference.
weight_quantize_type(str): quantization type for weights,
support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max'
usually is not used for weight, since weights are fixed once the
model is well trained.
window_size(int): the window size for 'range_abs_max' quantization.
moving_rate(float): the param for 'moving_average_abs_max' quantization.
skip_pattern(str or str list): The user-defined quantization skip pattern, which
will be presented in the name scope of an op. When the skip pattern is
detected in an op's name scope, the corresponding op will not be quantized.
quantizable_op_type(list[str]): List the type of ops that will be quantized.
Default is ["conv2d", "depthwise_conv2d", "mul"]. The quantizable_op_type in
QuantizationFreezePass and ConvertToInt8Pass must be the same as this.
weight_quantize_func(function): Function that defines how to quantize the weight.
Using this, users can quickly check whether their own quantization method
works. The function should define both the quantization and the
dequantization process, that is, it takes the non-quantized weight as
input and returns the dequantized weight. If None, the quantization op
defined by 'weight_quantize_type' is used. Default is None.
act_quantize_func(function): Function that defines how to quantize the activation.
Using this, users can quickly check whether their own quantization method
works. The function should define both the quantization and the
dequantization process, that is, it takes the non-quantized activation as
input and returns the dequantized activation. If None, the quantization
op defined by 'activation_quantize_type' is used. Default is None.
weight_preprocess_func(function): Function that defines how to preprocess the
weight before quantization. Using this, users can quickly check whether
their preprocessing method works. The function takes the non-quantized
weight as input and returns the processed weight to be quantized. If None,
the weight is quantized directly. Default is None.
act_preprocess_func(function): Function that defines how to preprocess the
activation before quantization. Using this, users can quickly check whether
their preprocessing method works. The function takes the non-quantized
activation as input and returns the processed activation to be quantized.
If None, the activation is quantized directly. Default is None.
optimizer_func(function): Function that returns an optimizer. When 'is_test'
is False and the user wants to use a self-defined quantization function
and preprocess function, this function must be set. Default is None.
executor(paddle.Executor): If the user wants to use a self-defined quantization
function and preprocess function, the executor must be set for
initialization. Default is None.
Examples:
.. code-block:: python
# The original graph will be rewritten.
import paddle
from paddle.fluid.contrib.slim.quantization \
import QuantizationTransformPassV2
from paddle.fluid.contrib.slim.graph import IrGraph
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
place = paddle.CPUPlace()
scope = paddle.static.global_scope()
transform_pass = QuantizationTransformPassV2(scope, place)
transform_pass.apply(graph)
"""
self._scope = scope
self._place = _get_paddle_place(place)
self._weight_bits = weight_bits
self._activation_bits = activation_bits
self._skip_pattern = skip_pattern
self._weight_quantize_func = weight_quantize_func
self._act_quantize_func = act_quantize_func
self._weight_preprocess_func = weight_preprocess_func
self._act_preprocess_func = act_preprocess_func
self._optimizer = optimizer_func
self._exe = executor
quant_type = [
'abs_max', 'channel_wise_abs_max', 'range_abs_max',
'moving_average_abs_max'
]
assert activation_quantize_type != 'channel_wise_abs_max', \
"The activation quantization type does not support 'channel_wise_abs_max'."
if activation_quantize_type not in quant_type:
raise ValueError(
"Unknown activation_quantize_type : '%s'. It can only be "
"'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." %
(str(activation_quantize_type)))
if weight_quantize_type not in quant_type:
raise ValueError(
"Unknown weight_quantize_type: '%s'. It can only be "
"'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' "
"or 'moving_average_abs_max'." % (str(weight_quantize_type)))
self._activation_quantize_type = activation_quantize_type
self._weight_quantize_type = weight_quantize_type
self._window_size = window_size
self._moving_rate = moving_rate
self._quantizable_ops = quantizable_op_type
for op in self._quantizable_ops:
assert op in utils._weight_supported_quantizable_op_type, \
op + " is not supported for quantization."
self._quantizable_grad_ops = [
'%s_grad' % (op) for op in self._quantizable_ops
]
self._is_test = None
self._global_step = None
self.create_var_map = {}
self.create_op_map = {}
# Record the variables that have already been dequantized.
self.dequantized_vars = collections.OrderedDict()
self.persistable_vars = []
self.processed_vars = []
def _quant_preprocess(self, op_node):
user_skipped = False
if isinstance(self._skip_pattern, list):
user_skipped = op_node.op().has_attr("op_namescope") and \
any(pattern in op_node.op().attr("op_namescope") \
for pattern in self._skip_pattern)
elif isinstance(self._skip_pattern, str):
user_skipped = op_node.op().has_attr("op_namescope") and \
op_node.op().attr("op_namescope").find(
self._skip_pattern) != -1
if user_skipped:
op_node.op()._set_attr("skip_quant", True)
op_node.op()._set_attr("with_quant_attr", True)
def _transform_forward(self, graph, op):
op.op()._set_attr("quantization_type", "qat_with_weight")
inputs = op.inputs
for var_node in inputs:
if var_node.name() not in op.input_arg_names():
continue
if var_node.name() in self.dequantized_vars:
dequant_var_node = self.dequantized_vars[var_node.name()]
else:
name = var_node.name()
if name in self.processed_vars:
continue
is_weight = True if var_node.name() in self.persistable_vars \
else False
# If the var node is a weight and weight_preprocess_func is not None,
# insert the weight preprocess func to preprocess the weight
# before quantization.
# If the var node is an activation and act_preprocess_func is not None,
# insert the activation preprocess func to preprocess the activation
# before quantization.
if is_weight and self._weight_preprocess_func is not None:
var_node = self._insert_func(
graph, self._weight_preprocess_func, var_node, op)
elif not is_weight and self._act_preprocess_func is not None:
var_node = self._insert_func(
graph, self._act_preprocess_func, var_node, op)
# if var node is weight and weight_quantize_func is not None,
# will insert weight quantize func to quantize and dequantize weight
# if var node is activation and act_quantize_func is not None,
# will insert act quantize func to quantize and dequantize activation
if is_weight and self._weight_quantize_func is not None:
target_out_node = self._insert_func(
graph, self._weight_quantize_func, var_node, op)
self.processed_vars.append(name)
continue
elif not is_weight and self._act_quantize_func is not None:
target_out_node = self._insert_func(
graph, self._act_quantize_func, var_node, op)
self.processed_vars.append(name)
continue
quant_bits = self._weight_bits if var_node.name() in self.persistable_vars \
else self._activation_bits
quant_type = self._weight_quantize_type if is_weight \
else self._activation_quantize_type
quant_axis = -1
channel_wise = False
if quant_type == 'channel_wise_abs_max': # Weight quantization
channel_wise = True
quant_axis = 1 if op.name() in \
utils._channelwise_quant_axis1_ops else 0
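# Axis convention (mirrors utils._channelwise_quant_axis1_ops): ops such as
# conv2d_transpose/mul/matmul/matmul_v2 keep their output channels on axis 1
# of the weight, so they are quantized along axis 1, while conv2d and
# depthwise_conv2d weights are quantized along axis 0.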
insert_quant_pass = InsertQuantizeLinear(
self._place,
self._scope,
quant_bits=quant_bits,
quant_axis=quant_axis,
channel_wise=channel_wise,
is_test=self._is_test)
quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
graph, var_node)
dequant_var_node = insert_quant_pass.insert_dequant_op(
graph, quant_var_node, scale_var_node)
self.dequantized_vars[name] = dequant_var_node
graph.update_input_link(var_node, dequant_var_node, op)
def _transform_backward(self, graph, op):
for var_node in op.inputs:
if var_node.name() not in op.input_arg_names():
continue
if var_node.name() in self.dequantized_vars:
dequant_var_node = self.dequantized_vars[var_node.name()]
graph.update_input_link(var_node, dequant_var_node, op)
def _has_weight(self, op):
has_weight = False
for var_node in op.inputs:
if var_node.name() not in op.input_arg_names():
continue
name = var_node.name()
if var_node.name() in self.persistable_vars:
has_weight = True
return has_weight
def _is_skip_quant(self, graph, op_node):
"""
Analyse whether the op node skips quantization.
"""
is_skip = False
if op_node.op().has_attr("skip_quant") and \
op_node.op().attr("skip_quant"):
is_skip = True
# if the inputs of mul and matmul are not all persistable, use
# AddQuantDequantPassV2 to quantize them.
if op_node.name() in ["mul", "matmul", "matmul_v2"] and \
_is_input_all_not_persistable(graph, op_node):
is_skip = True
if op_node.op().has_attr("quantization_type") and \
op_node.op().attr("quantization_type") == "qat_without_weight":
is_skip = True
return is_skip
def apply(self, graph):
"""
Quantize the graph for training process. According to weight and
activation quantization type, the graph will be added some fake
quantize operators and fake dequantize operators.
Args:
graph(IrGraph): the applied graph.
Returns:
None
"""
assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.'
self._is_test = graph.is_test()
self.persistable_vars = [
p.name() for p in graph.all_persistable_nodes()
]
ops = graph.all_op_nodes()
# Do the preprocessing for quantization, such as marking the ops
# that should be skipped and not quantized.
for op in ops:
if op.name() in self._quantizable_ops or \
op.name() in self._quantizable_grad_ops:
self._quant_preprocess(op)
# Insert mapping table to solve the problem in saving inference model.
graph.out_node_mapping_table = dict()
# _transform_forward and _transform_backward have to run in two separate loops.
# The loop for transforming the forward graph:
for op in ops:
if op.name() in self._quantizable_ops:
if not self._is_skip_quant(graph, op) and self._has_weight(op):
self._transform_forward(graph, op)
# The loop for renaming the inputs of backward op.
for op in ops:
if op.name() in self._quantizable_grad_ops and self._has_weight(op):
self._transform_backward(graph, op)
return graph
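# Illustrative before/after of what apply() does to a weighted op (a sketch,
# not literal output of the pass):
#
#   before:  conv2d(Input=X, Filter=W)
#   after:   X -> quantize_linear -> dequantize_linear -> X.quantized.dequantized
#            W -> quantize_linear -> dequantize_linear -> W.quantized.dequantized
#            conv2d(Input=X.quantized.dequantized, Filter=W.quantized.dequantized)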
class AddQuantDequantPassV2(object):
"""
Quantize the ops that do not have weights, and add quantize_linear and
dequantize_linear ops for the quantized ops' inputs.
"""
# To be compatible with PaddleSlim, not remove _activation_type for now
_activation_type = ["relu", "relu6", "leaky_relu", "tanh", "swish"]
def __init__(self,
scope=None,
place=None,
moving_rate=0.9,
quant_bits=8,
skip_pattern=["skip_quant"],
quantizable_op_type=["elementwise_add", "pool2d"],
is_full_quantized=False):
"""
Args:
scope(paddle.Scope): The scope is used to initialize these new parameters.
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new
parameters described above. If ``place`` is a string, it can be ``cpu``
or ``gpu:x``, where ``x`` is the index of the GPU.
moving_rate(float, optional): the param for 'quant_dequant_moving_average_abs_max'
quantization. Default is 0.9.
quant_bits(int, optional): quantization bit number for activation. Default is 8.
skip_pattern(str, optional): The user-defined quantization skip pattern, which
will be presented in the name scope of an op. When the skip pattern is
detected in an op's name scope, the corresponding op will not be quantized.
Default is 'skip_quant'.
quantizable_op_type(list[str], optional): List the type of ops that will be
quantized. Default is ["elementwise_add", "pool2d"].
is_full_quantized(bool, optional): If set is_full_quantized as True, apply
quantization to all supported quantizable op type. If set is_full_quantized
as False, only apply quantization to the op type according to the input
quantizable_op_type.
Examples:
.. code-block:: python
# The original graph will be rewritten.
import paddle
from paddle.fluid.contrib.slim.quantization \
import AddQuantDequantPassV2
from paddle.fluid.contrib.slim.graph import IrGraph
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
place = paddle.CPUPlace()
scope = paddle.static.global_scope()
add_quant_dequant_pass = AddQuantDequantPassV2(scope, place)
add_quant_dequant_pass.apply(graph)
"""
self._scope = scope
self._place = _get_paddle_place(place)
self._moving_rate = moving_rate
self._quant_bits = quant_bits
self._is_test = None
self._skip_pattern = skip_pattern
if is_full_quantized:
self._quantizable_op_type = utils._act_supported_quantizable_op_type
else:
self._quantizable_op_type = quantizable_op_type
for op_type in quantizable_op_type:
assert op_type in utils._act_supported_quantizable_op_type, \
op_type + " is not supported for quantization."
self._quantizable_grad_op_type = [
'%s_grad' % (op) for op in self._quantizable_op_type
]
assert self._scope != None, "scope must not be None."
assert self._place != None, "place must not be None."
self.persistable_vars = []
def apply(self, graph):
"""
Add quant_dequant before some ops, such as the 'elementwise_add' and
'pool2d' op.
Args:
graph(IrGraph): the target graph.
Returns:
None
"""
assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.'
self._is_test = graph.is_test()
dequantized_vars_map = collections.OrderedDict()
self.persistable_vars = [
p.name() for p in graph.all_persistable_nodes()
]
# Forward stage, insert quant_dequant op
all_op_nodes = graph.all_op_nodes()
for op_node in all_op_nodes:
if op_node.name() in self._quantizable_op_type:
is_skip = False
if isinstance(self._skip_pattern, list):
is_skip = op_node.op().has_attr("op_namescope") and \
any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern)
elif isinstance(self._skip_pattern, str):
is_skip = op_node.op().has_attr("op_namescope") and \
op_node.op().attr("op_namescope").find(self._skip_pattern) != -1
is_quantized = op_node.op().has_attr("quantization_type") and \
op_node.op().attr("quantization_type") == "qat_with_weight"
if is_skip or is_quantized:
continue
op_node.op()._set_attr("quantization_type",
"qat_without_weight")
arg_names = utils._get_op_input_var_names(op_node)
for arg_name in arg_names:
in_node = graph._find_node_by_name(op_node.inputs, arg_name)
if in_node.persistable():
continue
if arg_name in dequantized_vars_map:
dequant_var_node = dequantized_vars_map[arg_name]
else:
insert_quant_pass = InsertQuantizeLinear(
self._place,
self._scope,
quant_bits=self._quant_bits,
quant_axis=-1,
channel_wise=False,
is_test=self._is_test)
quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op(
graph, in_node)
dequant_var_node = insert_quant_pass.insert_dequant_op(
graph, quant_var_node, scale_var_node)
dequantized_vars_map[arg_name] = dequant_var_node
graph.update_input_link(in_node, dequant_var_node, op_node)
# Backward stage, update input link
for op_node in all_op_nodes:
if op_node.name() in self._quantizable_grad_op_type:
for input_name in op_node.input_arg_names():
if input_name in dequantized_vars_map:
in_node = graph._find_node_by_name(op_node.inputs,
input_name)
dequant_var_node = dequantized_vars_map[input_name]
graph.update_input_link(in_node, dequant_var_node,
op_node)
return graph
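# Illustrative effect on an activation-only op (a sketch under the default
# settings, quant_bits=8 and per-layer scales):
#
#   before:  elementwise_add(X, Y)
#   after:   elementwise_add(X.quantized.dequantized, Y.quantized.dequantized)
#
# where each non-persistable input goes through a quantize_linear /
# dequantize_linear pair; persistable inputs are left untouched (see the
# `in_node.persistable()` check above).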
class ReplaceFakeQuantDequantPass(object):
"""
Replace fake quant-dequant ops with quantize_linear and dequantize_linear ops.
"""
def __init__(self, scope, place):
r"""
Args:
scope(paddle.Scope): The scope is used to initialize these new parameters.
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new
parameters described above. If ``place`` is a string, it can be ``cpu``
or ``gpu:x``, where ``x`` is the index of the GPU.
Examples:
.. code-block:: python
# The original graph will be rewritten.
import paddle
from paddle.fluid.contrib.slim.quantization \
import ReplaceFakeQuantDequantPass
from paddle.fluid.contrib.slim.graph import IrGraph
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
place = paddle.CPUPlace()
scope = paddle.static.global_scope()
replace_pass = ReplaceFakeQuantDequantPass(scope, place)
replace_pass.apply(graph)
"""
self._place = _get_paddle_place(place)
self._scope = scope
assert self._scope != None, "scope must not be None."
assert self._place != None, "place must not be None."
def apply(self, graph):
assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.'
fake_quant_dequant_ops = []
for op in graph.all_op_nodes():
if op.name() in _fake_quant_dequant_op_list:
fake_quant_dequant_ops.append(op)
for _op in fake_quant_dequant_ops:
self._replace_op(graph, _op)
graph.safe_remove_nodes(_op)
graph.resolve_hazard()
return graph
def _replace_op(self, graph, op):
x_node = graph._find_node_by_name(op.inputs, op.input("X")[0])
out_node = graph._find_node_by_name(op.outputs, op.output("Out")[0])
scale_node = graph._find_node_by_name(op.outputs,
op.output("OutScale")[0])
quant_axis = op.op().attr("quant_axis") if op.op().has_attr(
"quant_axis") else -1
bit_length = op.op().attr("bit_length") if op.op().has_attr(
"bit_length") else 8
zero_point_node = None
quanted_node = x_node
if zero_point_node is None:
zero_point_node = graph.create_persistable_node(
name=self._zero_point_name(quanted_node.name()),
var_type=core.VarDesc.VarType.LOD_TENSOR,
shape=scale_node.shape(),
var_dtype=core.VarDesc.VarType.INT32)
_init_var_node(
zero_point_node,
np.zeros(
scale_node.shape(), dtype="int32"),
self._scope,
self._place)
quant_var_node = graph.create_var_node(
name=self._quantized_var_name(x_node.name()),
var_type=x_node.type(),
shape=x_node.shape(),
var_dtype=x_node.dtype())
quant_op_node = graph.create_op_node(
op_type="quantize_linear",
attrs={"quant_axis": quant_axis,
"bit_length": bit_length},
inputs={
"X": x_node,
"Scale": scale_node,
"ZeroPoint": zero_point_node
},
outputs={"Y": quant_var_node})
graph.link_to(x_node, quant_op_node)
graph.link_to(scale_node, quant_op_node)
if zero_point_node is not None:
graph.link_to(zero_point_node, quant_op_node)
graph.link_to(quant_op_node, quant_var_node)
dequant_op_node = graph.create_op_node(
op_type="dequantize_linear",
attrs={"quant_axis": quant_axis,
"bit_length": bit_length},
inputs={
"X": quant_var_node,
"Scale": scale_node,
"ZeroPoint": zero_point_node
},
outputs={"Y": out_node})
graph.link_to(quant_var_node, dequant_op_node)
graph.link_to(scale_node, dequant_op_node)
if zero_point_node is not None:
graph.link_to(zero_point_node, dequant_op_node)
graph.link_to(dequant_op_node, out_node)
def _quantized_var_name(self, var_name):
"""
Return quantized variable name for the input `var_name`.
"""
return "%s.quantized" % (var_name)
def _zero_point_name(self, var_name):
"""
Return the zero-point name for the var named `var_name`.
"""
return "%s@zero_point" % (var_name)
class QuantWeightPass(object):
"""
Quantize the weights and remove the quantize_linear node on the weight input. For example,
`weight -> quant -> dequant -> conv2d` will be frozen into `weight -> dequant -> conv2d`,
and the weight will be scaled offline.
Args:
scope(paddle.Scope): scope is used to get the weight tensor values.
place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to restore the weight tensors.
If it is a string, it can be ``cpu`` or ``gpu:x``, where ``x`` is the index of the GPU.
bias_correction(bool): whether to use bias correction (https://arxiv.org/abs/1810.05723)
for post-training quantization.
quant_bits(int, optional): quantization bit number for weight. Default is 8.
save_int_weight(bool, optional): Whether to save the weight in an integer dtype. Default is True.
Examples:
.. code-block:: python
# The original graph will be rewritten.
import paddle
from paddle.fluid.contrib.slim.quantization \
import QuantWeightPass
from paddle.fluid.contrib.slim.graph import IrGraph
from paddle.fluid import core
graph = IrGraph(core.Graph(program.desc), for_test=False)
place = paddle.CPUPlace()
scope = paddle.static.global_scope()
quant_weight_pass = QuantWeightPass(scope, place)
quant_weight_pass.apply(graph)
"""
def __init__(self,
scope,
place,
bias_correction=False,
quant_bits=8,
save_int_weight=True):
self._place = _get_paddle_place(place)
self._scope = scope
self._bias_correction = bias_correction
self._quant_bits = quant_bits
self._save_int_weight = save_int_weight
assert self._scope != None, "scope must not be None."
assert self._place != None, "place must not be None."
def apply(self, graph):
assert isinstance(graph,
IrGraph), 'graph must be the instance of IrGraph.'
fake_quant_ops_for_weight = []
fake_quant_ops = [
op for op in graph.all_op_nodes() if op.name() == "quantize_linear"
]
for _op in fake_quant_ops:
x_node = graph._find_node_by_name(_op.inputs, _op.input("X")[0])
if x_node.persistable():
scale_node = graph._find_node_by_name(_op.inputs,
_op.input("Scale")[0])
zero_point_node = graph._find_node_by_name(
_op.inputs, _op.input("ZeroPoint")[0])
out_node = graph._find_node_by_name(_op.outputs,
_op.output("Y")[0])
scale_v = self._load_var(scale_node.name())
assert scale_v.ndim in [1, 2
], "the dim of scale_v should be 1 or 2"
if scale_v.ndim == 2:
scale_v = scale_v[0]
if scale_v.size == 1 and _op.name() == 'abs_max':
scale_v = scale_v[0]
else:
scale_v = scale_v.tolist()
param_v = self._load_var(x_node.name())
quant_axis = _op.op().attr("quant_axis")
bits_length = _op.op().attr("bit_length")
quantized_param_v = utils.quant_tensor(param_v.copy(), scale_v,
quant_axis, bits_length)
if self._bias_correction == True:
quantized_param_v = utils.bias_correction_w(
param_v,
quantized_param_v,
scale_v,
quant_axis,
weight_bits=bits_length)
if self._save_int_weight:
# cast weight type to int
if self._quant_bits == 8:
save_weight_dtype = np.int8
quantized_param_v = quantized_param_v.astype(
save_weight_dtype)
self._restore_var(x_node.name(), quantized_param_v)
for next_op_node in out_node.outputs:
graph.update_input_link(out_node, x_node, next_op_node)
graph.safe_remove_nodes(out_node)
self._remove_unused_var_nodes(graph)
def _remove_unused_var_nodes(self, graph):
all_used_vars = set()
ops = graph.all_op_nodes()
for op_node in ops:
for input_node in op_node.inputs:
all_used_vars.add(input_node)
for output_node in op_node.outputs:
all_used_vars.add(output_node)
all_used_vars = {n.node for n in all_used_vars}
all_unused_vars = {
n
for n in filter(lambda node: node.node not in all_used_vars,
graph.all_var_nodes())
}
graph.safe_remove_nodes(all_unused_vars)
def _load_var(self, name):
return np.array(self._scope.find_var(name).get_tensor())
def _restore_var(self, name, array):
tensor = self._scope.find_var(name).get_tensor()
tensor.set(array, self._place)
...@@ -13,11 +13,292 @@ ...@@ -13,11 +13,292 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
from ....framework import IrNode
from ....framework import Operator
_weight_supported_quantizable_op_type = [
'conv2d', 'depthwise_conv2d', 'conv2d_transpose', 'mul', 'matmul',
'matmul_v2'
]
_act_supported_quantizable_op_type = [
"pool2d",
"elementwise_add",
"concat",
"softmax",
"argmax",
"transpose",
"equal",
"gather",
"greater_equal",
"greater_than",
"less_equal",
"less_than",
"mean",
"not_equal",
"reshape",
"reshape2",
"dropout",
"bilinear_interp",
"nearest_interp",
"trilinear_interp",
"slice",
"squeeze",
"elementwise_sub",
"mul",
"matmul",
"relu",
"relu6",
"leaky_relu",
"tanh",
"swish",
"scale",
"transpose",
"transpose2",
"sigmoid",
"pad2d",
"flatten",
"flatten2",
"batch_norm",
"layer_norm",
"matmul_v2",
"split",
"flatten_contiguous_range",
"squeeze2",
"nearest_interp_v2",
"bilinear_interp",
"bilinear_interp_v2",
"fill_constant_batch_size_like",
"arg_max",
"abs",
"assign",
"cast",
"clip",
"box_coder",
"crop",
"cumsum",
"elementwise_mul",
"elementwise_pow",
"expand_v2",
"fill_any_like",
"fill_constant",
"gelu",
"hard_sigmoid",
"hard_swish",
"instance_norm",
"lookup_table",
"lookup_table_v2",
"norm",
"p_norm",
"pad3d",
"pow",
"prelu",
"reduce_mean",
"unsqueeze",
"unsqueeze2",
"logical_and",
"logical_not",
"meshgrid",
"roi_align",
"strided_slice",
"where",
"grid_sampler",
"tile",
"group_norm",
"reduce_sum",
"square",
"softplus",
"shuffle_channel",
]
_out_scale_op_list = list(
set(_weight_supported_quantizable_op_type +
_act_supported_quantizable_op_type))
_channelwise_quant_axis1_ops = [ _channelwise_quant_axis1_ops = [
'conv2d_transpose', 'mul', 'matmul', 'matmul_v2' 'conv2d_transpose', 'mul', 'matmul', 'matmul_v2'
] ]
# list op real input and output names, to avoid processing input such as AxisTensor.
_op_real_in_out_name = {
"conv2d": [["Input", "Filter"], ["Output"]],
"depthwise_conv2d": [["Input", "Filter"], ["Output"]],
"conv2d_transpose": [["Input", "Filter"], ["Output"]],
"mul": [["X", "Y"], ["Out"]],
"matmul": [["X", "Y"], ["Out"]],
"matmul_v2": [["X", "Y"], ["Out"]],
"pool2d": [["X"], ["Out"]],
"elementwise_add": [["X", "Y"], ["Out"]],
"concat": [["X"], ["Out"]],
"softmax": [["X"], ["Out"]],
"argmax": [["X"], ["Out"]],
"transpose": [["X"], ["Out"]],
"equal": [["X", "Y"], ["Out"]],
"gather": [["X"], ["Out"]],
"greater_equal": [["X", "Y"], ["Out"]],
"greater_than": [["X", "Y"], ["Out"]],
"less_equal": [["X", "Y"], ["Out"]],
"less_than": [["X", "Y"], ["Out"]],
"mean": [["X"], ["Out"]],
"not_equal": [["X", "Y"], ["Out"]],
"reshape": [["X"], ["Out"]],
"reshape2": [["X"], ["Out"]],
"transpose2": [["X"], ["Out"]],
"bilinear_interp": [["X"], ["Out"]],
"nearest_interp": [["X"], ["Out"]],
"trilinear_interp": [["X"], ["Out"]],
"slice": [["Input"], ["Out"]],
"squeeze": [["X"], ["Out"]],
"elementwise_sub": [["X", "Y"], ["Out"]],
"relu": [["X"], ["Out"]],
"relu6": [["X"], ["Out"]],
"leaky_relu": [["X"], ["Out"]],
"prelu": [["X", "Alpha"], ["Out"]],
"tanh": [["X"], ["Out"]],
"swish": [["X"], ["Out"]],
"dropout": [["X"], ["Out"]],
"batch_norm": [["X"], ["Y"]],
"layer_norm": [["X"], ["Y"]],
"sigmoid": [["X"], ["Out"]],
"elementwise_mul": [["X", "Y"], ["Out"]],
"elementwise_pow": [["X", "Y"], ["Out"]],
"scale": [["X"], ["Out"]],
"hard_swish": [["X"], ["Out"]],
"hard_sigmoid": [["X"], ["Out"]],
"gru": [["Input", "Weight"], ["Hidden"]],
"lstm": [["Input", "Weight"], ["Hidden"]],
"pad2d": [["X"], ["Out"]],
"pad3d": [["X"], ["Out"]],
"flatten": [["X"], ["Out"]],
"flatten2": [["X"], ["Out"]],
"unsqueeze2": [["X"], ["Out"]],
"unsqueeze2": [["X"], ["Out"]],
"flatten_contiguous_range": [["X"], ["Out"]],
"split": [["X"], ["Out"]],
"squeeze2": [["X"], ["Out"]],
"nearest_interp_v2": [["X"], ["Out"]],
"bilinear_interp": [["X"], ["Out"]],
"bilinear_interp_v2": [["X"], ["Out"]],
"fill_constant_batch_size_like": [["Input"], ["Out"]],
"arg_max": [["X"], ["Out"]],
"abs": [["X"], ["Out"]],
"assign": [["X"], ["Out"]],
"cast": [["X"], ["Out"]],
"clip": [["X"], ["Out"]],
"box_coder": [["PriorBox"], ["OutputBox"]],
"crop": [["X"], ["Out"]],
"cumsum": [["X"], ["Out"]],
"expand_v2": [["X"], ["Out"]],
"fill_any_like": [["X"], ["Out"]],
"fill_constant": [[], ["Out"]],
"gelu": [["X"], ["Out"]],
"instance_norm": [["X"], ["Out"]],
"lookup_table": [["W", "Ids"], ["Out"]],
"lookup_table_v2": [["W", "Ids"], ["Out"]],
"norm": [["X"], ["Norm"]],
"p_norm": [["X"], ["Out"]],
"pow": [["X"], ["Out"]],
"reduce_mean": [["X"], ["Out"]],
"stack": [["X"], ["Y"]],
"top_k_v2": [["X"], ["Out", "Indices"]],
"logical_and": [["X", "Y"], ["Out"]],
"logical_not": [["X"], ["Out"]],
"meshgrid": [["X"], ["Out"]],
"roi_align": [["X", "ROIs"], ["Out"]],
"strided_slice": [["Input"], ["Out"]],
"where": [["Condition", "X", "Y"], ["Out"]],
"grid_sampler": [["X", "Grid"], ["Output"]],
"tile": [["X"], ["Out"]],
"group_norm": [["X"], ["Y", "Mean", "Variance"]],
"reduce_sum": [["X"], ["Out"]],
"square": [["X"], ["Out"]],
"softplus": [["X"], ["Out"]],
"shuffle_channel": [["X"], ["Out"]],
}
def _get_op_input_var_names(op):
"""
Get the input var names of the op.
Args:
op(IrNode, Operator): the input op.
Returns:
input_var_names or None.
"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
var_names = []
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return []
name_list = _op_real_in_out_name[op_name][0]
for name in name_list:
var_name = op.input(name)
if isinstance(var_name, list):
var_names.extend(var_name)
else:
var_names.append(var_name)
return var_names
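# Hypothetical usage sketch (op and variable names are made up for
# illustration; the argument names follow _op_real_in_out_name above):
# for a conv2d IrNode, _get_op_input_var_names returns the variables fed to
# "Input" and "Filter", e.g. ['conv2d_0.tmp_input', 'conv2d_0.w_0'], while
# auxiliary inputs (such as AxisTensor-style arguments of other ops) are ignored.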
def _get_op_output_var_names(op):
""" """
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
var_names = []
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return []
name_list = _op_real_in_out_name[op_name][1]
for name in name_list:
var_name = op.output(name)
if isinstance(var_name, list):
var_names.extend(var_name)
else:
var_names.append(var_name)
return var_names
def _get_input_name_index(op, input_var_name):
"""Get the input name and index of the var_name in the op"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return None
res = None
for argname in _op_real_in_out_name[op_name][0]:
var_names = op.input(argname)
for index, name in enumerate(var_names):
if name == input_var_name:
res = (argname, index)
return res
def _get_output_name_index(op, output_var_name):
"""Get the output name and index of the var_name in the op"""
assert isinstance(op, (IrNode, Operator)), \
"The input op should be IrNode or Operator."
op_name = op.name() if isinstance(op, IrNode) \
else op.type
if op_name not in _op_real_in_out_name:
return None
name_list = _op_real_in_out_name[op_name][1]
res = None
for name in name_list:
var_name = op.output(name)
for index, val in enumerate(var_name):
if val == output_var_name:
res = (name, index)
return res
def load_variable_data(scope, var_name): def load_variable_data(scope, var_name):
''' '''
...@@ -84,6 +365,46 @@ def dequant_tensor(x, scale, quant_axis=0, weight_bits=8): ...@@ -84,6 +365,46 @@ def dequant_tensor(x, scale, quant_axis=0, weight_bits=8):
return x return x
def bias_correction_w(x, x_quant, scale_v, quant_axis, weight_bits=8):
'''
Bias correction for weight
'''
eps = 1e-8
bnt = (1 << (weight_bits - 1)) - 1
x_dequant = x_quant.copy()
if isinstance(scale_v, list):
if quant_axis == 0:
for i, s in enumerate(scale_v):
x_dequant[i] = x_dequant[i] * s / bnt
quant_bias = x - x_dequant
mean_bias = quant_bias.reshape(quant_bias.shape[0], -1).mean(-1)
std_orig = x.reshape(x.shape[0], -1).std(-1)
std_quant = x_dequant.reshape(x_dequant.shape[0], -1).std(-1)
std_bias = std_orig / (std_quant + eps)
else:
for i, s in enumerate(scale_v):
x_dequant[:, i] = x_quant[:, i] * s / bnt
quant_bias = x - x_dequant
mean_bias = np.array(
[quant_bias[:, i].mean() for i in range(quant_bias.shape[1])])
std_orig = np.array([x[:, i].std() for i in range(x.shape[1])])
std_quant = np.array(
[x_dequant[:, i].std() for i in range(x_dequant.shape[1])])
std_bias = std_orig / (std_quant + eps)
else:
x_dequant = x_quant * scale_v / bnt
mean_bias = (x - x_dequant).mean()
std_bias = x.std() / (x_dequant.std() + eps)
if mean_bias.ndim == 1:
std_bias = np.resize(std_bias, x.shape)
mean_bias = np.resize(mean_bias, x.shape)
x_dequant = (mean_bias + x_dequant) * std_bias
quantized_param_v = quant_tensor(x_dequant, scale_v, quant_axis,
weight_bits)
return quantized_param_v
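# Sketch of the correction applied above (per output channel when scale_v is a
# list, per tensor otherwise), following https://arxiv.org/abs/1810.05723:
#   mean_bias   = mean(W - W_dequant)
#   std_bias    = std(W) / (std(W_dequant) + eps)
#   W_corrected = (mean_bias + W_dequant) * std_bias
# and W_corrected is then re-quantized with quant_tensor before being returned.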
def stable_sigmoid(x): def stable_sigmoid(x):
sig = np.where(x < 0, np.exp(x) / (1 + np.exp(x)), 1 / (1 + np.exp(-x))) sig = np.where(x < 0, np.exp(x) / (1 + np.exp(x)), 1 / (1 + np.exp(-x)))
return sig return sig
......
...@@ -53,7 +53,9 @@ class TestImperativeQat(unittest.TestCase): ...@@ -53,7 +53,9 @@ class TestImperativeQat(unittest.TestCase):
def set_vars(self): def set_vars(self):
self.weight_quantize_type = 'abs_max' self.weight_quantize_type = 'abs_max'
self.activation_quantize_type = 'moving_average_abs_max' self.activation_quantize_type = 'moving_average_abs_max'
print('weight_quantize_type', self.weight_quantize_type) self.onnx_format = False
self.check_export_model_accuracy = True
self.diff_threshold = 0.01
def func_qat(self): def func_qat(self):
self.set_vars() self.set_vars()
...@@ -159,9 +161,13 @@ class TestImperativeQat(unittest.TestCase): ...@@ -159,9 +161,13 @@ class TestImperativeQat(unittest.TestCase):
data = next(test_reader()) data = next(test_reader())
test_data = np.array([x[0].reshape(1, 28, 28) test_data = np.array([x[0].reshape(1, 28, 28)
for x in data]).astype('float32') for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(-1, 1)
test_img = fluid.dygraph.to_variable(test_data) test_img = fluid.dygraph.to_variable(test_data)
label = fluid.dygraph.to_variable(y_data)
lenet.eval() lenet.eval()
before_save = lenet(test_img) fp32_out = lenet(test_img)
fp32_acc = fluid.layers.accuracy(fp32_out, label).numpy()
with tempfile.TemporaryDirectory(prefix="qat_save_path_") as tmpdir: with tempfile.TemporaryDirectory(prefix="qat_save_path_") as tmpdir:
# save inference quantized model # save inference quantized model
...@@ -171,7 +177,8 @@ class TestImperativeQat(unittest.TestCase): ...@@ -171,7 +177,8 @@ class TestImperativeQat(unittest.TestCase):
input_spec=[ input_spec=[
paddle.static.InputSpec( paddle.static.InputSpec(
shape=[None, 1, 28, 28], dtype='float32') shape=[None, 1, 28, 28], dtype='float32')
]) ],
onnx_format=self.onnx_format)
print('Quantized model saved in %s' % tmpdir) print('Quantized model saved in %s' % tmpdir)
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
...@@ -185,13 +192,15 @@ class TestImperativeQat(unittest.TestCase): ...@@ -185,13 +192,15 @@ class TestImperativeQat(unittest.TestCase):
executor=exe, executor=exe,
model_filename="lenet" + INFER_MODEL_SUFFIX, model_filename="lenet" + INFER_MODEL_SUFFIX,
params_filename="lenet" + INFER_PARAMS_SUFFIX) params_filename="lenet" + INFER_PARAMS_SUFFIX)
after_save, = exe.run(inference_program, quant_out, = exe.run(inference_program,
feed={feed_target_names[0]: test_data}, feed={feed_target_names[0]: test_data},
fetch_list=fetch_targets) fetch_list=fetch_targets)
# check paddle.disable_static()
self.assertTrue( quant_out = fluid.dygraph.to_variable(quant_out)
np.allclose(after_save, before_save.numpy()), quant_acc = fluid.layers.accuracy(quant_out, label).numpy()
msg='Failed to save the inference quantized model.') paddle.enable_static()
delta_value = fp32_acc - quant_acc
self.assertLess(delta_value, self.diff_threshold)
def test_qat(self): def test_qat(self):
with _test_eager_guard(): with _test_eager_guard():
...@@ -199,5 +208,13 @@ class TestImperativeQat(unittest.TestCase): ...@@ -199,5 +208,13 @@ class TestImperativeQat(unittest.TestCase):
self.func_qat() self.func_qat()
class TestImperativeQatONNXFormat(TestImperativeQat):
def set_vars(self):
self.weight_quantize_type = 'abs_max'
self.activation_quantize_type = 'moving_average_abs_max'
self.onnx_format = True
self.diff_threshold = 0.025
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -41,6 +41,17 @@ class TestImperativeQatChannelWise(TestImperativeQat): ...@@ -41,6 +41,17 @@ class TestImperativeQatChannelWise(TestImperativeQat):
def set_vars(self): def set_vars(self):
self.weight_quantize_type = 'channel_wise_abs_max' self.weight_quantize_type = 'channel_wise_abs_max'
self.activation_quantize_type = 'moving_average_abs_max' self.activation_quantize_type = 'moving_average_abs_max'
self.diff_threshold = 0.01
self.onnx_format = False
print('weight_quantize_type', self.weight_quantize_type)
class TestImperativeQatChannelWiseONNXFormat(TestImperativeQat):
def set_vars(self):
self.weight_quantize_type = 'channel_wise_abs_max'
self.activation_quantize_type = 'moving_average_abs_max'
self.onnx_format = True
self.diff_threshold = 0.025
print('weight_quantize_type', self.weight_quantize_type) print('weight_quantize_type', self.weight_quantize_type)
......
...@@ -173,7 +173,8 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -173,7 +173,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
is_use_cache_file=False, is_use_cache_file=False,
is_optimize_model=False, is_optimize_model=False,
batch_size=10, batch_size=10,
batch_nums=10): batch_nums=10,
onnx_format=False):
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -190,14 +191,28 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -190,14 +191,28 @@ class TestPostTrainingQuantization(unittest.TestCase):
round_type=round_type, round_type=round_type,
is_full_quantize=is_full_quantize, is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model, optimize_model=is_optimize_model,
onnx_format=onnx_format,
is_use_cache_file=is_use_cache_file) is_use_cache_file=is_use_cache_file)
ptq.quantize() ptq.quantize()
ptq.save_quantized_model(self.int8_model_path) ptq.save_quantized_model(self.int8_model_path)
def run_test(self, model_name, model_url, model_md5, data_name, data_url, def run_test(self,
data_md5, algo, round_type, quantizable_op_type, model_name,
is_full_quantize, is_use_cache_file, is_optimize_model, model_url,
diff_threshold, infer_iterations, quant_iterations): model_md5,
data_name,
data_url,
data_md5,
algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
infer_iterations,
quant_iterations,
onnx_format=False):
fp32_model_path = self.download_model(model_url, model_md5, model_name) fp32_model_path = self.download_model(model_url, model_md5, model_name)
fp32_model_path = os.path.join(fp32_model_path, model_name) fp32_model_path = os.path.join(fp32_model_path, model_name)
...@@ -211,10 +226,10 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -211,10 +226,10 @@ class TestPostTrainingQuantization(unittest.TestCase):
print("Start post training quantization for {0} on {1} samples ...". print("Start post training quantization for {0} on {1} samples ...".
format(model_name, quant_iterations)) format(model_name, quant_iterations))
self.generate_quantized_model(fp32_model_path, data_path, algo, self.generate_quantized_model(
round_type, quantizable_op_type, fp32_model_path, data_path, algo, round_type, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_full_quantize, is_use_cache_file, is_optimize_model,
is_optimize_model, quant_iterations) quant_iterations, onnx_format)
print("Start INT8 inference for {0} on {1} samples ...".format( print("Start INT8 inference for {0} on {1} samples ...".format(
model_name, infer_iterations)) model_name, infer_iterations))
...@@ -278,5 +293,42 @@ class TestPostTrainingKLForMnistAdaround(TestPostTrainingQuantization): ...@@ -278,5 +293,42 @@ class TestPostTrainingKLForMnistAdaround(TestPostTrainingQuantization):
diff_threshold, infer_iterations, quant_iterations) diff_threshold, infer_iterations, quant_iterations)
class TestPostTrainingKLForMnistONNXFormat(TestPostTrainingQuantization):
def test_post_training_kl_onnx_format(self):
model_name = "nlp_lstm_fp32_model"
model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz"
model_md5 = "519b8eeac756e7b4b7bcb2868e880452"
data_name = "quant_lstm_input_data"
data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz"
data_md5 = "add84c754e9b792fea1fbd728d134ab7"
algo = "KL"
round_type = "round"
quantizable_op_type = ["mul", "lstm"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = False
diff_threshold = 0.01
infer_iterations = 100
quant_iterations = 10
onnx_format = True
self.run_test(
model_name,
model_url,
model_md5,
data_name,
data_url,
data_md5,
algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
infer_iterations,
quant_iterations,
onnx_format=onnx_format)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -116,7 +116,8 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -116,7 +116,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
is_use_cache_file=False, is_use_cache_file=False,
is_optimize_model=False, is_optimize_model=False,
batch_size=10, batch_size=10,
batch_nums=10): batch_nums=10,
onnx_format=False):
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
...@@ -134,6 +135,7 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -134,6 +135,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
round_type=round_type, round_type=round_type,
is_full_quantize=is_full_quantize, is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model, optimize_model=is_optimize_model,
onnx_format=onnx_format,
is_use_cache_file=is_use_cache_file) is_use_cache_file=is_use_cache_file)
ptq.quantize() ptq.quantize()
ptq.save_quantized_model(self.int8_model_path) ptq.save_quantized_model(self.int8_model_path)
...@@ -151,7 +153,8 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -151,7 +153,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
diff_threshold, diff_threshold,
batch_size=10, batch_size=10,
infer_iterations=10, infer_iterations=10,
quant_iterations=5): quant_iterations=5,
onnx_format=False):
origin_model_path = self.download_model(data_url, data_md5, model_name) origin_model_path = self.download_model(data_url, data_md5, model_name)
origin_model_path = os.path.join(origin_model_path, model_name) origin_model_path = os.path.join(origin_model_path, model_name)
...@@ -166,7 +169,7 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -166,7 +169,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
self.generate_quantized_model(origin_model_path, algo, round_type, self.generate_quantized_model(origin_model_path, algo, round_type,
quantizable_op_type, is_full_quantize, quantizable_op_type, is_full_quantize,
is_use_cache_file, is_optimize_model, is_use_cache_file, is_optimize_model,
batch_size, quant_iterations) batch_size, quant_iterations, onnx_format)
print("Start INT8 inference for {0} on {1} images ...".format( print("Start INT8 inference for {0} on {1} images ...".format(
model_name, infer_iterations * batch_size)) model_name, infer_iterations * batch_size))
...@@ -335,5 +338,72 @@ class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization): ...@@ -335,5 +338,72 @@ class TestPostTrainingmseAdaroundForMnist(TestPostTrainingQuantization):
infer_iterations, quant_iterations) infer_iterations, quant_iterations)
class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization):
def test_post_training_mse_onnx_format(self):
model_name = "mnist_model"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "mse"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
onnx_format = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(
model_name,
data_url,
data_md5,
algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
batch_size,
infer_iterations,
quant_iterations,
onnx_format=onnx_format)
class TestPostTrainingmseForMnistONNXFormatFullQuant(
TestPostTrainingQuantization):
def test_post_training_mse_onnx_format_full_quant(self):
model_name = "mnist_model"
data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz"
data_md5 = "be71d3997ec35ac2a65ae8a145e2887c"
algo = "mse"
round_type = "round"
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
is_full_quantize = True
is_use_cache_file = False
is_optimize_model = False
onnx_format = True
diff_threshold = 0.01
batch_size = 10
infer_iterations = 50
quant_iterations = 5
self.run_test(
model_name,
data_url,
data_md5,
algo,
round_type,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
batch_size,
infer_iterations,
quant_iterations,
onnx_format=onnx_format)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -243,7 +243,8 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -243,7 +243,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
round_type="round", round_type="round",
is_full_quantize=False, is_full_quantize=False,
is_use_cache_file=False, is_use_cache_file=False,
is_optimize_model=False): is_optimize_model=False,
onnx_format=False):
try: try:
os.system("mkdir " + self.int8_model) os.system("mkdir " + self.int8_model)
except Exception as e: except Exception as e:
...@@ -265,13 +266,23 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -265,13 +266,23 @@ class TestPostTrainingQuantization(unittest.TestCase):
round_type=round_type, round_type=round_type,
is_full_quantize=is_full_quantize, is_full_quantize=is_full_quantize,
optimize_model=is_optimize_model, optimize_model=is_optimize_model,
onnx_format=onnx_format,
is_use_cache_file=is_use_cache_file) is_use_cache_file=is_use_cache_file)
ptq.quantize() ptq.quantize()
ptq.save_quantized_model(self.int8_model) ptq.save_quantized_model(self.int8_model)
def run_test(self, model, algo, round_type, data_urls, data_md5s, def run_test(self,
quantizable_op_type, is_full_quantize, is_use_cache_file, model,
is_optimize_model, diff_threshold): algo,
round_type,
data_urls,
data_md5s,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
onnx_format=False):
infer_iterations = self.infer_iterations infer_iterations = self.infer_iterations
batch_size = self.batch_size batch_size = self.batch_size
sample_iterations = self.sample_iterations sample_iterations = self.sample_iterations
...@@ -285,9 +296,10 @@ class TestPostTrainingQuantization(unittest.TestCase): ...@@ -285,9 +296,10 @@ class TestPostTrainingQuantization(unittest.TestCase):
print("Start INT8 post training quantization for {0} on {1} images ...". print("Start INT8 post training quantization for {0} on {1} images ...".
format(model, sample_iterations * batch_size)) format(model, sample_iterations * batch_size))
self.generate_quantized_model( self.generate_quantized_model(model_cache_folder + "/model",
model_cache_folder + "/model", quantizable_op_type, algo, quantizable_op_type, algo, round_type,
round_type, is_full_quantize, is_use_cache_file, is_optimize_model) is_full_quantize, is_use_cache_file,
is_optimize_model, onnx_format)
print("Start INT8 inference for {0} on {1} images ...".format( print("Start INT8 inference for {0} on {1} images ...".format(
model, infer_iterations * batch_size)) model, infer_iterations * batch_size))
...@@ -517,5 +529,38 @@ class TestPostTrainingEMDForMobilenetv1(TestPostTrainingQuantization): ...@@ -517,5 +529,38 @@ class TestPostTrainingEMDForMobilenetv1(TestPostTrainingQuantization):
is_optimize_model, diff_threshold) is_optimize_model, diff_threshold)
class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization):
def test_post_training_onnx_format_mobilenetv1(self):
model = "MobileNet-V1"
algo = "avg"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
]
data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
quantizable_op_type = [
"conv2d",
"depthwise_conv2d",
"mul",
]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = True
onnx_format = True
diff_threshold = 0.05
self.run_test(
model,
algo,
round_type,
data_urls,
data_md5s,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
onnx_format=onnx_format)
if __name__ == '__main__':
unittest.main()
@@ -39,5 +39,34 @@ class TestPostTrainingForResnet50(TestPostTrainingQuantization):
is_optimize_model, diff_threshold)
class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingQuantization):
def test_post_training_resnet50(self):
model = "ResNet-50"
algo = "min_max"
round_type = "round"
data_urls = [
'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
]
data_md5s = ['4a5194524823d9b76da6e738e1367881']
quantizable_op_type = ["conv2d", "mul"]
is_full_quantize = False
is_use_cache_file = False
is_optimize_model = False
diff_threshold = 0.025
onnx_format = True
self.run_test(
model,
algo,
round_type,
data_urls,
data_md5s,
quantizable_op_type,
is_full_quantize,
is_use_cache_file,
is_optimize_model,
diff_threshold,
onnx_format=onnx_format)
if __name__ == '__main__':
unittest.main()
@@ -21,6 +21,7 @@ import six
import paddle
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
from paddle.fluid.contrib.slim.quantization import QuantizationTransformPassV2
from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
from paddle.fluid.contrib.slim.quantization import ConvertToInt8Pass
from paddle.fluid.contrib.slim.quantization import TransformForMobilePass
@@ -686,5 +687,129 @@ class TestAddQuantDequantPass(unittest.TestCase):
for_ci=True)
class TestQuantizationTransformPassV2(unittest.TestCase):
def setUp(self):
self.quantizable_op_and_inputs = {
'conv2d': ['Input', 'Filter'],
'depthwise_conv2d': ['Input', 'Filter'],
'mul': ['X', 'Y']
}
self.quantizable_grad_op_inputs = {
'conv2d_grad': ['Input', 'Filter'],
'depthwise_conv2d_grad': ['Input', 'Filter'],
'mul_grad': ['X', 'Y']
}
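# After the V2 transform pass runs, every input of the quantizable forward ops
# (and of their grad ops) should be the '.quantized.dequantized' version of the
# original variable; check_program below asserts exactly that.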
def check_program(self, program):
quantized_ops = set()
for block in program.blocks:
for op in block.ops:
# check forward
if op.type in self.quantizable_op_and_inputs:
for arg_name in op.input_arg_names:
self.assertTrue(
arg_name.endswith('.quantized.dequantized'))
quantized_ops.add(arg_name)
for op in block.ops:
# check backward
if op.type in self.quantizable_grad_op_inputs:
for pname in self.quantizable_grad_op_inputs[op.type]:
arg_name = op.input(pname)[0]
self.assertTrue(
arg_name.endswith('.quantized.dequantized'))
self.assertTrue(arg_name in quantized_ops)
def linear_fc_quant(self,
activation_quant_type,
weight_quantize_type,
for_ci=True):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = linear_fc(3)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
place = fluid.CPUPlace()
graph = IrGraph(core.Graph(main.desc), for_test=False)
transform_pass = QuantizationTransformPassV2(
scope=fluid.global_scope(),
place=place,
activation_quantize_type=activation_quant_type,
weight_quantize_type=weight_quantize_type)
transform_pass.apply(graph)
if not for_ci:
marked_nodes = set()
for op in graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
graph.draw('.', 'quantize_fc_' + activation_quant_type,
marked_nodes)
program = graph.to_program()
self.check_program(program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False)
if not for_ci:
val_marked_nodes = set()
for op in val_graph.all_op_nodes():
if op.name().find('quantize') > -1:
val_marked_nodes.add(op)
val_graph.draw('.', 'val_fc_' + activation_quant_type,
val_marked_nodes)
def test_linear_fc_quant_abs_max(self):
self.linear_fc_quant('abs_max', 'abs_max', for_ci=True)
def test_linear_fc_quant_channel_wise_abs_max(self):
self.linear_fc_quant('abs_max', 'channel_wise_abs_max', for_ci=True)
def residual_block_quant(self,
activation_quant_type,
weight_quantize_type,
quantizable_op_type,
for_ci=True):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = residual_block(2)
opt = fluid.optimizer.Adam(learning_rate=0.001)
opt.minimize(loss)
place = fluid.CPUPlace()
graph = IrGraph(core.Graph(main.desc), for_test=False)
transform_pass = QuantizationTransformPassV2(
scope=fluid.global_scope(),
place=place,
activation_quantize_type=activation_quant_type,
weight_quantize_type=weight_quantize_type,
quantizable_op_type=quantizable_op_type)
transform_pass.apply(graph)
if not for_ci:
marked_nodes = set()
for op in graph.all_op_nodes():
if op.name().find('quantize') > -1:
marked_nodes.add(op)
graph.draw('.', 'quantize_residual_' + activation_quant_type,
marked_nodes)
program = graph.to_program()
self.check_program(program)
val_graph = IrGraph(core.Graph(program.desc), for_test=False)
if not for_ci:
val_marked_nodes = set()
for op in val_graph.all_op_nodes():
if op.name().find('quantize') > -1:
val_marked_nodes.add(op)
val_graph.draw('.', 'val_residual_' + activation_quant_type,
val_marked_nodes)
def test_residual_block_abs_max(self):
quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul', 'matmul']
self.residual_block_quant(
'abs_max', 'abs_max', quantizable_op_type, for_ci=True)
def test_residual_block_channel_wise_abs_max(self):
quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul', 'matmul']
self.residual_block_quant(
'abs_max', 'channel_wise_abs_max', quantizable_op_type, for_ci=True)
if __name__ == '__main__':
unittest.main()
@@ -172,5 +172,83 @@ class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp):
self.data_type = "float32"
class TestChannelWiseDequantizeOp(OpTest):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 0
def setUp(self):
self.set_args()
self.op_type = "dequantize_linear"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scale = channel_wise_quantize_max_abs(x, self.bit_length,
self.quant_axis)
ydq = channel_wise_dequantize_max_abs(yq, scale, self.bit_length,
self.quant_axis)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': yq, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis
}
self.outputs = {'Y': ydq}
def test_check_output(self):
self.check_output()
class TestChannelWiseDequantizeOp1(TestChannelWiseDequantizeOp):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 1
class TestDequantizeOp(OpTest):
def set_args(self):
self.bit_length = 8
self.quant_axis = -1
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float32"
def setUp(self):
self.set_args()
self.op_type = "dequantize_linear"
x = np.random.randn(31, 65).astype(self.data_type)
yq, scale = quantize_max_abs(x, self.max_range)
ydq = dequantize_max_abs(yq, scale, self.max_range)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': yq, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis
}
self.outputs = {'Y': ydq}
def test_check_output(self):
self.check_output()
class TestDequantizeOpDouble(TestDequantizeOp):
def set_args(self):
self.bit_length = 8
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float64"
self.quant_axis = -1
class TestDequantizeOp5Bits(TestDequantizeOp):
def set_args(self):
self.bit_length = 5
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float32"
self.quant_axis = -1
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
import math
from op_test import OpTest
import paddle.fluid.core as core
@@ -374,5 +375,144 @@ class TestChannelWiseFakeQuantDequantOp3(TestChannelWiseFakeQuantDequantOp):
self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
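# Numpy reference implementations of per-tensor and per-channel abs-max
# quantization; the quantize_linear tests below compare the op output against
# these.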
def quantize_max_abs(x, max_range):
scale = np.max(np.abs(x).flatten())
y = np.round(x / scale * max_range)
return y, scale
def channel_wise_quantize_max_abs(x, quant_bit=8, quant_axis=0):
assert quant_axis in [0, 1], "The quant_axis should be 0 or 1."
scales = []
y = x.copy()
max_range = math.pow(2, quant_bit - 1) - 1
if quant_axis == 0:
for i in range(x.shape[0]):
scale = np.max(np.abs(x[i])).astype("float32")
scales.append(scale)
y[i] = np.round(x[i] * max_range / scale)
elif quant_axis == 1:
for i in range(x.shape[1]):
scale = np.max(np.abs(x[:, i])).astype("float32")
scales.append(scale)
y[:, i] = np.round(x[:, i] * max_range / scale)
return y, scales
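# A small worked example of the reference path above (illustrative only, not
# used by the tests): for x = [-0.5, 0.25, 1.0] and 8 bits,
# max_range = 2**7 - 1 = 127 and scale = max(|x|) = 1.0, so
# quantize_max_abs(x, 127) returns y = [-64., 32., 127.] with scale 1.0;
# y * scale / 127 then recovers x up to rounding error.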
class TestChannelWiseQuantizeOp(OpTest):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 0
def setUp(self):
self.set_args()
self.op_type = "quantize_linear"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scale = channel_wise_quantize_max_abs(x, self.bit_length,
self.quant_axis)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis
}
self.outputs = {'Y': yq}
def test_check_output(self):
self.check_output()
class TestChannelWiseQuantizeOp1(TestChannelWiseQuantizeOp):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 1
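# With is_test=False (the training-time path), quantize_linear is additionally
# expected to produce the OutScale output, as checked below.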
class TestChannelWiseQuantizeOpTrain(OpTest):
def set_args(self):
self.bit_length = 8
self.data_type = "float32"
self.quant_axis = 0
self.is_test = False
def setUp(self):
self.set_args()
self.op_type = "quantize_linear"
x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
yq, scale = channel_wise_quantize_max_abs(x, self.bit_length,
self.quant_axis)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis,
'is_test': self.is_test
}
self.outputs = {'Y': yq, 'OutScale': scale}
def test_check_output(self):
self.check_output()
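# Per-tensor (quant_axis=-1) quantize_linear: a single scale and zero point
# cover the whole input tensor.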
class TestquantizeOp(OpTest):
def set_args(self):
self.bit_length = 8
self.quant_axis = -1
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float32"
def setUp(self):
self.set_args()
self.op_type = "quantize_linear"
x = np.random.randn(31, 65).astype(self.data_type)
yq, scale = quantize_max_abs(x, self.max_range)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis,
}
self.outputs = {'Y': yq}
def test_check_output(self):
self.check_output()
class TestquantizeOpTrain(TestquantizeOp):
def set_args(self):
self.bit_length = 8
self.quant_axis = -1
self.max_range = math.pow(2, self.bit_length - 1) - 1
self.data_type = "float32"
self.is_test = False
def setUp(self):
self.set_args()
self.op_type = "quantize_linear"
x = np.random.randn(31, 65).astype(self.data_type)
yq, scale = quantize_max_abs(x, self.max_range)
scale = np.array(scale).astype(self.data_type)
zero_point = np.zeros(scale.shape, dtype="int32")
self.inputs = {'X': x, 'Scale': scale, 'ZeroPoint': zero_point}
self.attrs = {
'bit_length': self.bit_length,
'quant_axis': self.quant_axis,
'is_test': self.is_test
}
self.outputs = {'Y': yq, 'OutScale': scale}
def test_check_output(self):
self.check_output()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()