Unverified commit ff70a269, authored by Guanghua Yu, committed by GitHub

[cherry-pick] Update quantization round and clip calculation methods (#43829)

* update quantization clip and round

* fix quantization clip and round Attribute

* fix typo
Parent 9e776f62
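The new optional `round_type` attribute added throughout these passes and ops selects between two float-to-int rounding modes: 0 rounds half to even (the default) and 1 rounds half away from zero, as documented in the `AddAttr<int>("round_type", ...)` checker below. A minimal NumPy sketch of the two modes; the helper names are illustrative only and not part of this patch:

    import numpy as np

    def round_half_to_even(x):          # round_type = 0 (NumPy's default rounding)
        return np.round(x)

    def round_half_away_from_zero(x):   # round_type = 1
        return np.sign(x) * np.floor(np.abs(x) + 0.5)

    vals = np.array([1.5, 2.5, -2.5])
    print(round_half_to_even(vals))         # [ 2.  2. -2.]
    print(round_half_away_from_zero(vals))  # [ 2.  3. -3.]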
......@@ -45,6 +45,10 @@ DeleteQuantDequantFilterOpPass::DeleteQuantDequantFilterOpPass() {
.End()
.AddAttr("bit_length")
.IsIntIn({8, 16})
.End()
.AddAttr("round_type")
.IsOptional()
.IsIntIn({0, 1})
.End();
AddOpCompat(OpCompat("fake_channel_wise_quantize_dequantize_abs_max"))
.AddInput("X")
......@@ -61,6 +65,10 @@ DeleteQuantDequantFilterOpPass::DeleteQuantDequantFilterOpPass() {
.End()
.AddAttr("quant_axis")
.IsIntIn({0, 1})
.End()
.AddAttr("round_type")
.IsOptional()
.IsIntIn({0, 1})
.End();
}
// Delete quant_dequant_op, then quantize and dequantize weight
......@@ -96,14 +104,17 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const {
auto var_map = any_op2_desc->Inputs();
std::string arg_name = "";
for (auto& name_m : var_map) {
if (std::find(name_m.second.begin(), name_m.second.end(),
if (std::find(name_m.second.begin(),
name_m.second.end(),
quant_dequant_op_out_name) != name_m.second.end()) {
arg_name = name_m.first;
break;
}
}
PADDLE_ENFORCE_GT(arg_name.size(), 0, platform::errors::InvalidArgument(
"can not find the input %s.",
PADDLE_ENFORCE_GT(
arg_name.size(),
0,
platform::errors::InvalidArgument("can not find the input %s.",
quant_dequant_op_out_name));
// any_op2_desc->SetAttr("enable_int8", true);
any_op2_desc->SetAttr("bit_length", bit_length);
......@@ -123,7 +134,8 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const {
if (dequant_type == "fake_channel_wise_quantize_dequantize_abs_max") {
int quant_axis =
BOOST_GET_CONST(int, quant_dequant_op->Op()->GetAttr("quant_axis"));
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1,
true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
......@@ -176,7 +188,8 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const {
}
}
for (int i = 0; i < channel; i++) {
PADDLE_ENFORCE_NE(weight_scale[i], 0,
PADDLE_ENFORCE_NE(weight_scale[i],
0,
platform::errors::InvalidArgument(
"Weight scale should be nonzero, but get zero."));
weight_scale[i] = weight_scale[i] / range;
......@@ -188,7 +201,8 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const {
abs_max_weight =
std::max(abs_max_weight, std::abs(quantized_weight_data[j]));
}
PADDLE_ENFORCE_NE(abs_max_weight, 0,
PADDLE_ENFORCE_NE(abs_max_weight,
0,
platform::errors::InvalidArgument(
"Weight scale should be nonzero, but get zero"));
weight_scale.push_back(abs_max_weight / range);
......
......@@ -54,6 +54,10 @@ DeleteQuantDequantLinearOpPass::DeleteQuantDequantLinearOpPass() {
.End()
.AddAttr("quant_axis")
.IsType<int>()
.End()
.AddAttr("round_type")
.IsOptional()
.IsType<int>()
.End();
AddOpCompat(OpCompat("dequantize_linear"))
.AddInput("X")
......@@ -74,6 +78,10 @@ DeleteQuantDequantLinearOpPass::DeleteQuantDequantLinearOpPass() {
.End()
.AddAttr("quant_axis")
.IsType<int>()
.End()
.AddAttr("round_type")
.IsOptional()
.IsType<int>()
.End();
}
// Delete quantize_linear_op dequantize_linear_op, then add input_scales
......@@ -112,7 +120,8 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
const LoDTensor& input_scale_tensor =
scope->GetVar(quantize_linear_op_scale->Name())->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(input_scale_tensor.place()), true,
paddle::platform::is_cpu_place(input_scale_tensor.place()),
true,
platform::errors::InvalidArgument(
"Input scale tensor's place should be CPU."));
const float* input_scale_data = input_scale_tensor.data<float>();
......
......@@ -52,6 +52,10 @@ DeleteWeightQuantDequantLinearOpPass::DeleteWeightQuantDequantLinearOpPass() {
.End()
.AddAttr("quant_axis")
.IsType<int>()
.End()
.AddAttr("round_type")
.IsOptional()
.IsType<int>()
.End();
AddOpCompat(OpCompat("dequantize_linear"))
.AddInput("X")
......@@ -72,6 +76,10 @@ DeleteWeightQuantDequantLinearOpPass::DeleteWeightQuantDequantLinearOpPass() {
.End()
.AddAttr("quant_axis")
.IsType<int>()
.End()
.AddAttr("round_type")
.IsOptional()
.IsType<int>()
.End();
AddOpCompat(OpCompat("conv2d"))
.AddInput("Input")
......@@ -322,7 +330,8 @@ void DeleteWeightQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
int quant_axis = BOOST_GET_CONST(
int, weight_dequantize_linear_op->Op()->GetAttr("quant_axis"));
if (quant_axis == -1) { // per_layer quant_dequant: all OP
PADDLE_ENFORCE_EQ(weight_scale_nums, 1,
PADDLE_ENFORCE_EQ(weight_scale_nums,
1,
platform::errors::InvalidArgument(
"When quant_axis == -1 means use per_layer "
"quant_dequant, weight_scale'number should be 1."));
......@@ -335,11 +344,13 @@ void DeleteWeightQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
} else if (quant_axis == 0) { // per_channel quant_dequant: conv2d,
// depthwise_conv2d, conv2d_fusion
PADDLE_ENFORCE_EQ(
weight_scale_nums, w_dims[quant_axis],
weight_scale_nums,
w_dims[quant_axis],
platform::errors::InvalidArgument(
"When quant_axis == 0 means use per_channel quant_dequant, "
"weight_scale'numbers should be equal channels."));
PADDLE_ENFORCE_EQ(w_dims.size(), 4,
PADDLE_ENFORCE_EQ(w_dims.size(),
4,
platform::errors::InvalidArgument(
"When quant_axis == 0 means use per_channel "
"quant_dequant, (conv2d, depthwise_conv2d, "
......@@ -352,7 +363,8 @@ void DeleteWeightQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
}
} else if (quant_axis == 1) {
PADDLE_ENFORCE_EQ(
weight_scale_nums, w_dims[quant_axis],
weight_scale_nums,
w_dims[quant_axis],
platform::errors::InvalidArgument(
"When quant_axis == 1 means use per_channel quant_dequant, "
"weight_scale'numbers should be equal channels."));
......@@ -360,7 +372,8 @@ void DeleteWeightQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
if (w_dims.size() == 4) { // conv2d_transpose
std::string quantized_op_type = any_op2->Op()->Type();
PADDLE_ENFORCE_EQ(
quantized_op_type, "conv2d_transpose",
quantized_op_type,
"conv2d_transpose",
platform::errors::InvalidArgument(
"When quant_axis == 1 means use per_channel quant_dequant, "
"only conv2d_transpose weight dims equal 4."));
......@@ -388,7 +401,8 @@ void DeleteWeightQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
weight_tensor->Resize(phi::make_ddim(phi::vectorize(w_dims)));
float* new_quantized_weight_data =
weight_tensor->mutable_data<float>(platform::CPUPlace());
memcpy(new_quantized_weight_data, weight_data_tmp.data(),
memcpy(new_quantized_weight_data,
weight_data_tmp.data(),
weight_tensor->numel() * sizeof(float));
nodes2rm.insert(weight_dequantize_linear_op_scale);
......
......@@ -49,6 +49,10 @@ QuantDequantFusePass::QuantDequantFusePass() {
.End()
.AddAttr("bit_length")
.IsIntIn({8, 16})
.End()
.AddAttr("round_type")
.IsOptional()
.IsIntIn({0, 1})
.End();
AddOpCompat(OpCompat("fake_quantize_moving_average_abs_max"))
.AddInput("X")
......@@ -85,6 +89,10 @@ QuantDequantFusePass::QuantDequantFusePass() {
.End()
.AddAttr("bit_length")
.IsIntIn({8, 16})
.End()
.AddAttr("round_type")
.IsOptional()
.IsIntIn({0, 1})
.End();
AddOpCompat(OpCompat("fake_dequantize_max_abs"))
.AddInput("X")
......@@ -309,7 +317,8 @@ QuantDequantFusePass::QuantDequantFusePass() {
}
// Delete quant op before quantized ops, and set input scale in the attr of
// quantized ops
void QuantDequantFusePass::DeleteQuant(ir::Graph* graph, Scope* scope,
void QuantDequantFusePass::DeleteQuant(ir::Graph* graph,
Scope* scope,
const std::string& quant_type) const {
const std::string pattern_name = "delete_quant_fuse";
GraphPatternDetector gpd;
......@@ -331,7 +340,8 @@ void QuantDequantFusePass::DeleteQuant(ir::Graph* graph, Scope* scope,
return;
}
PADDLE_ENFORCE_EQ(
subgraph.count(input_act_node), true,
subgraph.count(input_act_node),
true,
platform::errors::NotFound(
"Input act node(%s) not found in QuantDequantFuse pass.",
input_act_node->name()));
......@@ -345,12 +355,14 @@ void QuantDequantFusePass::DeleteQuant(ir::Graph* graph, Scope* scope,
// Get input scale from tensor
std::string input_scale_var_name = quant->Op()->Input("InScale").front();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument(
scope,
platform::errors::InvalidArgument(
"Scope in QuantDequantFuse pass should not be null."));
const LoDTensor& input_scale_tensor =
scope->FindVar(input_scale_var_name)->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(input_scale_tensor.place()), true,
paddle::platform::is_cpu_place(input_scale_tensor.place()),
true,
platform::errors::InvalidArgument(
"Input scale tensor's place should be CPU."));
const float* input_scale_data = input_scale_tensor.data<float>();
......@@ -382,8 +394,8 @@ void QuantDequantFusePass::DeleteQuant(ir::Graph* graph, Scope* scope,
IR_NODE_LINK_TO(input_act, quantized_node);
}
// Delete nodes and edges
std::unordered_set<const Node*> nodes2rm = {input_scale, quant,
output_scale, output_act};
std::unordered_set<const Node*> nodes2rm = {
input_scale, quant, output_scale, output_act};
GraphSafeRemoveNodes(graph, nodes2rm);
};
gpd(graph, handler);
......@@ -391,7 +403,8 @@ void QuantDequantFusePass::DeleteQuant(ir::Graph* graph, Scope* scope,
// Delete dequant op after quantized ops, and convert weight from fp32 range to
// int8 range
void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
void QuantDequantFusePass::FuseDequant(ir::Graph* graph,
Scope* scope,
const std::string& quantized_op_type,
const std::string& dequant_type) const {
std::string weight_name = "";
......@@ -436,7 +449,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
return;
}
PADDLE_ENFORCE_EQ(
subgraph.count(quantized_op_input), true,
subgraph.count(quantized_op_input),
true,
platform::errors::NotFound("Quantized op input node(%s) did not find "
"in QuantDequantFuse pass.",
quantized_op_input->name()));
......@@ -464,14 +478,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
subgraph.at(pattern.GetPDNode("dequant_channel_scale"));
auto scales_name = dequant_op_node->Op()->Input("Scales");
PADDLE_ENFORCE_EQ(
scales_name.size(), 2,
scales_name.size(),
2,
platform::errors::InvalidArgument(
"Scales size in channel-wise dequantize op should be 2, got %d.",
scales_name.size()));
const LoDTensor& channel_scale_tensor =
scope->FindVar(scales_name[0])->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(channel_scale_tensor.place()), true,
paddle::platform::is_cpu_place(channel_scale_tensor.place()),
true,
platform::errors::InvalidArgument(
"Channel scale tensor's place should be CPU."));
const float* channel_scale_data = channel_scale_tensor.data<float>();
......@@ -497,7 +513,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
if (quantized_op_type == "mul" || quantized_op_type == "matmul" ||
quantized_op_type == "matmul_v2" || quantized_op_type == "fc") {
if (dequant_type == "fake_dequantize_max_abs") {
PADDLE_ENFORCE_EQ(weight_scale.size(), 1,
PADDLE_ENFORCE_EQ(weight_scale.size(),
1,
platform::errors::InvalidArgument(
"mul/matmul/matmul_v2 op weight dequantized by "
"[fake_dequantize_max_abs] "
......@@ -511,7 +528,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
if (quant_axis == 0) {
} else {
PADDLE_ENFORCE_EQ(
quant_axis == 1, true,
quant_axis == 1,
true,
platform::errors::InvalidArgument(
"'quant_axis' of mul/matmul/fc/matmul_v2 op weight "
"dequantized by "
......@@ -520,14 +538,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
quant_axis));
}
PADDLE_ENFORCE_EQ(
weight_scale.size(), static_cast<size_t>(w_dims[1]),
weight_scale.size(),
static_cast<size_t>(w_dims[1]),
platform::errors::InvalidArgument(
"mul/matmul/matmul_v2 op weight dequantized by "
"[fake_channel_wise_dequantize_max_abs] requires weight scale "
"size = 2nd dim of mul/matmul/matmul_v2's weight, which is %d, "
"but got "
"%d.",
static_cast<size_t>(w_dims[1]), weight_scale.size()));
static_cast<size_t>(w_dims[1]),
weight_scale.size()));
for (int j = 0; j < weight_tensor->numel(); j++) {
quantized_weight_data[j] *= weight_scale[j % w_dims[1]];
}
......@@ -535,7 +555,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
} else if (quantized_op_type == "conv2d" ||
quantized_op_type == "depthwise_conv2d") {
PADDLE_ENFORCE_EQ(
dequant_type, "fake_channel_wise_dequantize_max_abs",
dequant_type,
"fake_channel_wise_dequantize_max_abs",
platform::errors::InvalidArgument(
"conv2d op must be dequantized by "
"[fake_channel_wise_dequantize_max_abs], but got %s. "
......@@ -546,7 +567,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
if (quant_axis == 0) {
} else {
PADDLE_ENFORCE_EQ(
quant_axis == 0, true,
quant_axis == 0,
true,
platform::errors::InvalidArgument(
"'quant_axis' of conv2d/depthwise_conv2d op weight dequantized "
"by [fake_channel_wise_dequantize_max_abs]should be 0, but "
......@@ -554,18 +576,21 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
quant_axis));
}
PADDLE_ENFORCE_EQ(
weight_scale.size(), static_cast<size_t>(w_dims[0]),
weight_scale.size(),
static_cast<size_t>(w_dims[0]),
platform::errors::InvalidArgument(
"conv2d op requires weight scale size = channel size of the "
"weight, which is %d, but got %d.",
static_cast<size_t>(w_dims[0]), weight_scale.size()));
static_cast<size_t>(w_dims[0]),
weight_scale.size()));
for (int j = 0; j < weight_tensor->numel(); j++) {
int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
quantized_weight_data[j] *= weight_scale[j / inner_size];
}
} else if (quantized_op_type == "conv2d_transpose") {
PADDLE_ENFORCE_EQ(
dequant_type, "fake_channel_wise_dequantize_max_abs",
dequant_type,
"fake_channel_wise_dequantize_max_abs",
platform::errors::InvalidArgument(
"conv2d_transpose must be dequantized by "
"[fake_channel_wise_dequantize_max_abs], but got %s",
......@@ -573,7 +598,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
if (quant_axis == 0) {
} else {
PADDLE_ENFORCE_EQ(
quant_axis == 1, true,
quant_axis == 1,
true,
platform::errors::InvalidArgument(
"'quant_axis' of conv2d_transpose op weight dequantized by "
"[fake_channel_wise_dequantize_max_abs]should be 1, but "
......@@ -581,11 +607,13 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope,
quant_axis));
}
PADDLE_ENFORCE_EQ(
weight_scale.size(), static_cast<size_t>(w_dims[1]),
weight_scale.size(),
static_cast<size_t>(w_dims[1]),
platform::errors::InvalidArgument(
"conv2d_transpose op requires weight scale size = channel size "
"of the weight, which is %d, but got %d.",
static_cast<size_t>(w_dims[1]), weight_scale.size()));
static_cast<size_t>(w_dims[1]),
weight_scale.size()));
for (int j = 0; j < weight_tensor->numel(); j++) {
int inner_size = w_dims[2] * w_dims[3];
quantized_weight_data[j] *= weight_scale[(j / inner_size) % w_dims[1]];
......@@ -639,8 +667,13 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
std::unordered_set<std::string> quant_types = {
"fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
std::unordered_set<std::string> quantized_op_types = {
"conv2d", "mul", "matmul", "depthwise_conv2d",
"conv2d_transpose", "fc", "matmul_v2",
"conv2d",
"mul",
"matmul",
"depthwise_conv2d",
"conv2d_transpose",
"fc",
"matmul_v2",
};
auto* scope = param_scope();
......
......@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/quantize_linear_op.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/transform.h"
......@@ -24,14 +26,17 @@ namespace operators {
template <typename T>
struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, const int quant_axis, framework::Tensor* out) {
void operator()(const platform::CPUDeviceContext &dev_ctx,
const framework::Tensor *in,
const framework::Tensor *scale,
T max_range,
const int quant_axis,
framework::Tensor *out) {
// Dequant op is before quantized op
// Dequantize the weight of quantized op
auto in_dims = in->dims();
const int64_t channel = in_dims[quant_axis];
const T* scale_factor = scale->data<T>();
const T *scale_factor = scale->data<T>();
if (quant_axis == 0) {
for (int64_t i = 0; i < channel; i++) {
T s = scale_factor[i];
......@@ -39,7 +44,7 @@ struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, T> {
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
auto& dev = *dev_ctx.eigen_device();
auto &dev = *dev_ctx.eigen_device();
out_e.device(dev) = in_e * s / max_range;
}
} else if (quant_axis == 1) {
......@@ -49,12 +54,12 @@ struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, T> {
}
int64_t step_i = in->numel() / out_iter;
int64_t step_j = in->numel() / (out_iter * channel);
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
auto *in_data = in->data<T>();
auto *out_data = out->mutable_data<T>(dev_ctx.GetPlace());
for (int64_t i = 0; i < out_iter; i++) {
for (int64_t j = 0; j < channel; j++) {
auto* cur_in = in_data + i * step_i + j * step_j;
auto* cur_out = out_data + i * step_i + j * step_j;
auto *cur_in = in_data + i * step_i + j * step_j;
auto *cur_out = out_data + i * step_i + j * step_j;
T s = scale_factor[j];
for (int64_t k = 0; k < step_j; k++) {
*cur_out = (*cur_in) * s / max_range;
......@@ -67,19 +72,17 @@ struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, T> {
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
template struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, float>;
template struct ChannelDequantizeFunctorV2<platform::CPUDeviceContext, double>;
class QuantizeLinearOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasInput("ZeroPoint"), "Input", "ZeroPoint",
"QuantizeLinear");
OP_INOUT_CHECK(
ctx->HasInput("ZeroPoint"), "Input", "ZeroPoint", "QuantizeLinear");
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "QuantizeLinear");
ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
......@@ -95,7 +98,7 @@ class QuantizeLinearOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
......@@ -116,9 +119,10 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
.AddCustomChecker([](const int &quant_axis) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1 || quant_axis == -1, true,
quant_axis == 0 || quant_axis == 1 || quant_axis == -1,
true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
......@@ -126,13 +130,32 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
});
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
.AddCustomChecker([](const int &bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16,
true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
bit_length));
});
AddAttr<int>(
"round_type",
"(int, default 0) The round type of fp32 to int."
"0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2"
"1: rounding to nearest ties away from zero. Eg: round(1.5)=2, "
"round(2.5)=3")
.SetDefault(0)
.AddCustomChecker([](const int &round_type) {
PADDLE_ENFORCE_EQ(
round_type == 0 || round_type == 1,
true,
platform::errors::InvalidArgument(
"'round_type' should be 0 or 1, 0 rounding to "
"nearest ties to even and 1 is rounding to nearest "
"ties away from zero.but the received is %d",
round_type));
})
.AsExtra();
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
......@@ -156,14 +179,18 @@ namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
quantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker,
quantize_linear,
ops::QuantizeLinearOp,
ops::QuantizeLinearOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(quantize_linear, ops::QuantizeLinearKernel<CPU, float>);
REGISTER_OPERATOR(
dequantize_linear, ops::QuantizeLinearOp, ops::QuantizeLinearOpMaker,
dequantize_linear,
ops::QuantizeLinearOp,
ops::QuantizeLinearOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
......
......@@ -29,9 +29,13 @@ namespace operators {
template <typename DeviceContext, typename T>
struct ChannelDequantizeFunctorV2 {
void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
const framework::Tensor** scales, const int scale_num,
T max_range, const int quant_axis, framework::Tensor* out);
void operator()(const DeviceContext& dev_ctx,
const framework::Tensor* in,
const framework::Tensor** scales,
const int scale_num,
T max_range,
const int quant_axis,
framework::Tensor* out);
};
template <typename DeviceContext, typename T>
......@@ -44,6 +48,7 @@ class QuantizeLinearKernel : public framework::OpKernel<T> {
auto* out = context.Output<framework::Tensor>("Y");
out->mutable_data<T>(context.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int round_type = context.Attr<int>("round_type");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
int quant_axis = context.Attr<int>("quant_axis");
bool is_test = context.Attr<bool>("is_test");
......@@ -53,25 +58,25 @@ class QuantizeLinearKernel : public framework::OpKernel<T> {
if (!is_test) {
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_s = out_scale->mutable_data<T>(context.GetPlace());
FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(),
in->numel(), out_s);
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
bin_cnt, out);
FindAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, in->data<T>(), in->numel(), out_s);
ClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
} else {
ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
bin_cnt, out);
ClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *in_scale, bin_cnt, round_type, out);
}
} else {
if (!is_test) {
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
out_scale_data);
FindChannelAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, *in, quant_axis, out_scale_data);
ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
dev_ctx, *in, *out_scale, bin_cnt, round_type, quant_axis, out);
} else {
ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
dev_ctx, *in, *in_scale, bin_cnt, quant_axis, out);
dev_ctx, *in, *in_scale, bin_cnt, round_type, quant_axis, out);
}
}
}
......@@ -87,7 +92,8 @@ class DeQuantizeLinearKernel : public framework::OpKernel<T> {
auto in_tmp = phi::Cast<T>(
static_cast<const typename paddle::framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*in, experimental::CppTypeToDataType<D>::Type());
*in,
experimental::CppTypeToDataType<D>::Type());
auto* scale = context.Input<framework::Tensor>("Scale");
auto* out = context.Output<framework::Tensor>("Y");
......@@ -97,16 +103,18 @@ class DeQuantizeLinearKernel : public framework::OpKernel<T> {
if (quant_axis < 0) {
float max_range = (std::pow(2, bit_length - 1) - 1);
DequantizeFunctor<DeviceContext, D>()(dev_ctx, &in_tmp, scale,
static_cast<D>(max_range), out);
DequantizeFunctor<DeviceContext, D>()(
dev_ctx, &in_tmp, scale, static_cast<D>(max_range), out);
} else {
PADDLE_ENFORCE_EQ(
scale->numel(), in_tmp.dims()[quant_axis],
scale->numel(),
in_tmp.dims()[quant_axis],
platform::errors::PreconditionNotMet(
"The number of first scale values must be the same with "
"quant_axis dimension value of Input(X) when the `scale` has "
"only one element, but %ld != %ld here.",
scale->numel(), in_tmp.dims()[quant_axis]));
scale->numel(),
in_tmp.dims()[quant_axis]));
int max_range = (std::pow(2, bit_length - 1) - 1);
ChannelDequantizeFunctorV2<DeviceContext, D>()(
......
......@@ -20,26 +20,31 @@ import logging
import paddle.fluid as fluid
from ....log_helper import get_logger
from .utils import load_variable_data, set_variable_data, stable_sigmoid, quant_tensor, dequant_tensor, _channelwise_quant_axis1_ops, calculate_quant_cos_error
from .utils import load_variable_data, set_variable_data, stable_sigmoid, quant_tensor, dequant_tensor, _channelwise_quant_axis1_ops, calculate_quant_cos_error, bias_correction_w
_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
_logger = get_logger(__name__,
logging.INFO,
fmt='%(asctime)s-%(levelname)s: %(message)s')
GAMMA = -0.1
ZETA = 1.1
def compute_soft_rounding(alpha_v):
return fluid.layers.clip(
fluid.layers.sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA, min=0, max=1)
return fluid.layers.clip(fluid.layers.sigmoid(alpha_v) * (ZETA - GAMMA) +
GAMMA,
min=0,
max=1)
def compute_soft_rounding_np(alpha_v):
return np.clip(
stable_sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA, a_min=0, a_max=1)
return np.clip(stable_sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA,
a_min=0,
a_max=1)
class AdaRoundLoss(object):
def __init__(self, reg_param=0.01, default_beta_range=(20, 2)):
self.default_reg_param = reg_param
self.default_beta_range = default_beta_range
......@@ -48,26 +53,29 @@ class AdaRoundLoss(object):
square_cost = fluid.layers.square_error_cost(ada_quantized_output,
orig_output)
recon_loss = fluid.layers.reduce_mean(
fluid.layers.reduce_sum(
square_cost, dim=-1))
fluid.layers.reduce_sum(square_cost, dim=-1))
return recon_loss
def compute_round_loss(self, alpha_v, warm_start, beta):
def round_loss_fn():
# compute rectified sigmoid of parameter 'alpha' which maps it between zero and one
h_v = compute_soft_rounding(alpha_v)
# calculate regularization term - which ensures parameter to converge to exactly zeros and ones
# at the end of optimization
reg_term = fluid.layers.reduce_sum(-fluid.layers.pow(
fluid.layers.abs(2 * h_v - 1), factor=beta) + 1)
reg_term = fluid.layers.reduce_sum(
-fluid.layers.pow(fluid.layers.abs(2 * h_v - 1), factor=beta) +
1)
# calculate the rounding loss
round_loss = self.default_reg_param * reg_term
return round_loss
round_loss = fluid.layers.cond(warm_start, lambda: fluid.layers.fill_constant(shape=[1], dtype='float32', value=0.0), round_loss_fn)
round_loss = fluid.layers.cond(
warm_start, lambda: fluid.layers.fill_constant(
shape=[1], dtype='float32', value=0.0), round_loss_fn)
return round_loss
......@@ -80,15 +88,16 @@ class AdaRoundLoss(object):
warm_start_end_iter = warm_start * max_iter
# compute relative iteration of current iteration
rel_iter = (cur_iter - warm_start_end_iter) / (
max_iter - warm_start_end_iter)
beta = end_beta + 0.5 * (start_beta - end_beta) * (1 + np.cos(rel_iter *
np.pi))
rel_iter = (cur_iter - warm_start_end_iter) / (max_iter -
warm_start_end_iter)
beta = end_beta + 0.5 * (start_beta -
end_beta) * (1 + np.cos(rel_iter * np.pi))
return beta
class AdaRound(object):
def __init__(self,
scale,
weight_tensor,
......@@ -145,8 +154,7 @@ class AdaRound(object):
h_alpha = compute_soft_rounding_np(np_alpha)
# Scale the tensor
tensor_scale = quant_tensor(
self.ori_weight_tensor.copy(),
tensor_scale = quant_tensor(self.ori_weight_tensor.copy(),
self.scale,
quant_axis=self.quant_axis)
......@@ -160,8 +168,8 @@ class AdaRound(object):
weight_tensor_quant = self._calculate_quant_weight()
# Dequantize the tensor
weight_tensor_dequant = dequant_tensor(
weight_tensor_quant + self.offset,
weight_tensor_dequant = dequant_tensor(weight_tensor_quant +
self.offset,
self.scale,
quant_axis=self.quant_axis)
return weight_tensor_dequant
......@@ -171,10 +179,10 @@ class AdaRound(object):
return weight_tensor_quant
def get_loss(self, beta, warm_start, adaround_out_tensor, orig_out_tensor):
round_loss = self.adaround_loss.compute_round_loss(self.alpha_v,
warm_start, beta)
recon_loss = self.adaround_loss.compute_recon_loss(adaround_out_tensor,
orig_out_tensor)
round_loss = self.adaround_loss.compute_round_loss(
self.alpha_v, warm_start, beta)
recon_loss = self.adaround_loss.compute_recon_loss(
adaround_out_tensor, orig_out_tensor)
loss = round_loss + recon_loss
losses = {
'loss': loss,
......@@ -201,6 +209,7 @@ def run_adaround(data_loader,
scale_dict,
num_iterations=1000,
lr=0.001,
bias_correction=False,
fast_mode=True):
fetch_op_name = fetch_list[0].name
final_weight_tensor_quant_dict = {}
......@@ -226,28 +235,28 @@ def run_adaround(data_loader,
with fluid.program_guard(train_program, startup_program):
with fluid.unique_name.guard():
# initialize adaround
adaround = AdaRound(
scale,
adaround = AdaRound(scale,
weight_var_tensor,
scope=scope,
weight_var_name=weight_var_name,
weight_op_type=weight_op_type,
num_iterations=num_iterations)
orig_out_tensor = fluid.data(
name='orig_out_tensor',
orig_out_tensor = fluid.data(name='orig_out_tensor',
shape=fp32_fetch_list.shape,
dtype='float32')
adaround_out_tensor = fluid.data(
name='adaround_out_tensor',
adaround_out_tensor = fluid.data(name='adaround_out_tensor',
shape=fp32_fetch_list.shape,
dtype='float32')
beta_tensor = fluid.data(
name='beta', shape=[1], dtype='float32')
warm_start_tensor = fluid.data(
name='warm_start', shape=[1], dtype='bool')
beta_tensor = fluid.data(name='beta',
shape=[1],
dtype='float32')
warm_start_tensor = fluid.data(name='warm_start',
shape=[1],
dtype='bool')
train_fetches_loss = adaround.get_loss(
beta_tensor, warm_start_tensor, adaround_out_tensor,
train_fetches_loss = adaround.get_loss(beta_tensor,
warm_start_tensor,
adaround_out_tensor,
orig_out_tensor)
optimizer = fluid.optimizer.Adam(learning_rate=lr)
loss = train_fetches_loss['loss']
......@@ -291,16 +300,23 @@ def run_adaround(data_loader,
fetch_list=[v.name for v in train_fetches_loss.values()],
return_numpy=True)
_logger.info(
"Iter {:d}, lr {:.5f}, loss {:.5f}, loss_round {:.5f}, loss_recon {:.5f}, time {:.5f}s".
format(i, lr,
np.mean(out[0]),
np.mean(out[1]),
"Iter {:d}, lr {:.5f}, loss {:.5f}, loss_round {:.5f}, loss_recon {:.5f}, time {:.5f}s"
.format(i, lr, np.mean(out[0]), np.mean(out[1]),
np.mean(out[2]), start_time - prev_start_time))
sys.stdout.flush()
if i == num_iterations:
break
final_weight_tensor_quant_dict[
weight_var_name] = adaround.update_final_weights()
if bias_correction:
final_weight_tensor_quant_dict[weight_var_name] = bias_correction_w(
weight_var_tensor,
final_weight_tensor_quant_dict[weight_var_name],
scale,
adaround.quant_axis,
weight_bits=adaround.weight_bits)
del adaround
# update adarounded calibrated weights
......
......@@ -36,8 +36,9 @@ from . import utils
__all__ = ['PostTrainingQuantization', 'WeightQuantization']
_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
_logger = get_logger(__name__,
logging.INFO,
fmt='%(asctime)s-%(levelname)s: %(message)s')
def _all_persistable_var_names(program):
......@@ -88,7 +89,8 @@ def _apply_pass(scope,
cpp_graph.set_not_owned('__param_scope__', scope)
if attrs:
assert attr_values and len(attrs) == len(
attr_values), "Different number of pass attributes and their values."
attr_values
), "Different number of pass attributes and their values."
for attr, value in zip(attrs, attr_values):
ir_pass.set(attr, value)
ir_pass.apply(cpp_graph)
......@@ -180,7 +182,8 @@ class PostTrainingQuantization(object):
"mul"].
round_type(str, optional): The method of converting the quantized weights
value float->int. Currently supports ['round', 'adaround'] methods.
Default is `round`, which is rounding nearest to the nearest whole number.
Default is `round`, which is rounding nearest to the integer.
'adaround' is refer to https://arxiv.org/abs/2004.10568.
learning_rate(float, optional): The learning rate of adaround method.
is_full_quantized(bool, optional): If set is_full_quantized as True,
apply quantization to all supported quantizable op type. If set
......@@ -364,7 +367,8 @@ class PostTrainingQuantization(object):
batch_id = 0
with tqdm(
total=self._batch_nums,
bar_format='Preparation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}',
bar_format=
'Preparation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}',
ncols=80) as t:
for data in self._data_loader():
self._executor.run(program=self._program,
......@@ -380,9 +384,9 @@ class PostTrainingQuantization(object):
self._init_sampling_act_histogram()
batch_id = 0
with tqdm(
total=self._batch_nums,
bar_format='Sampling stage, Run batch:|{bar}| {n_fmt}/{total_fmt}',
with tqdm(total=self._batch_nums,
bar_format=
'Sampling stage, Run batch:|{bar}| {n_fmt}/{total_fmt}',
ncols=80) as t:
for data in self._data_loader():
self._executor.run(program=self._program,
......@@ -446,8 +450,7 @@ class PostTrainingQuantization(object):
scale_dict = self._quantized_var_threshold
else:
scale_dict = self._quantized_threshold
run_adaround(
self._data_loader,
run_adaround(self._data_loader,
self._program,
self._fetch_list,
self._executor,
......@@ -457,6 +460,7 @@ class PostTrainingQuantization(object):
self._weight_op_pairs,
scale_dict,
num_iterations=self._batch_nums,
bias_correction=self._bias_correction,
lr=self._learning_rate)
def save_quantized_model(self,
......@@ -478,8 +482,7 @@ class PostTrainingQuantization(object):
None
'''
clip_extra = True if self._onnx_format else False
io.save_inference_model(
dirname=save_model_path,
io.save_inference_model(dirname=save_model_path,
model_filename=model_filename,
params_filename=params_filename,
feeded_var_names=self._feed_list,
......@@ -508,17 +511,18 @@ class PostTrainingQuantization(object):
if self._data_loader is not None:
return
self._data_loader = io.DataLoader.from_generator(
feed_list=feed_vars, capacity=3 * self._batch_size, iterable=True)
self._data_loader = io.DataLoader.from_generator(feed_list=feed_vars,
capacity=3 *
self._batch_size,
iterable=True)
if self._sample_generator is not None:
self._data_loader.set_sample_generator(
self._sample_generator,
self._data_loader.set_sample_generator(self._sample_generator,
batch_size=self._batch_size,
drop_last=True,
places=self._place)
elif self._batch_generator is not None:
self._data_loader.set_batch_generator(
self._batch_generator, places=self._place)
self._data_loader.set_batch_generator(self._batch_generator,
places=self._place)
def _optimize_fp32_model(self):
'''
......@@ -569,11 +573,9 @@ class PostTrainingQuantization(object):
" is not supported for quantization.")
# For quantized ops, sample inputs and outputs
if op_type in self._quantizable_op_type:
collect_var_name(
utils._get_op_input_var_names(op),
collect_var_name(utils._get_op_input_var_names(op),
persistable_var_names, op_type)
collect_var_name(
utils._get_op_output_var_names(op),
collect_var_name(utils._get_op_output_var_names(op),
persistable_var_names, op_type)
# collect quanted op output var name
for out_var_name in utils._get_op_output_var_names(op):
......@@ -583,8 +585,7 @@ class PostTrainingQuantization(object):
in_var_name] = out_var_name
# For other op, only sample output scale
elif op_type in self._out_scale_op_list:
collect_var_name(
utils._get_op_output_var_names(op),
collect_var_name(utils._get_op_output_var_names(op),
persistable_var_names, op_type)
def _set_activation_persistable(self):
......@@ -655,6 +656,11 @@ class PostTrainingQuantization(object):
scale = s * abs_max_value
s += 0.02
bins = 2**(self._activation_bits - 1) - 1
if self._onnx_format:
quant_var = np.clip(np.round(var_tensor / scale * bins),
-bins - 1, bins)
quant_dequant_var = quant_var / bins * scale
else:
quant_dequant_var = np.round(
np.clip(var_tensor, 0.0, scale) / scale *
bins) / bins * scale
......@@ -694,6 +700,11 @@ class PostTrainingQuantization(object):
scale = s * abs_max_value
s += 0.02
bins = 2**(self._activation_bits - 1) - 1
if self._onnx_format:
quant_var = np.clip(np.round(var_tensor / scale * bins),
-bins - 1, bins)
quant_dequant_var = quant_var / bins * scale
else:
quant_dequant_var = np.round(
np.clip(var_tensor, 0.0, scale) / scale *
bins) / bins * scale
......@@ -846,8 +857,9 @@ class PostTrainingQuantization(object):
if var_name not in self._sampling_act_histogram:
min_val = self._sampling_act_abs_min_max[var_name][0]
max_val = self._sampling_act_abs_min_max[var_name][1]
hist, hist_edeges = np.histogram(
[], bins=self._histogram_bins, range=(min_val, max_val))
hist, hist_edeges = np.histogram([],
bins=self._histogram_bins,
range=(min_val, max_val))
self._sampling_act_histogram[var_name] = [hist, hist_edeges]
def _calculate_kl_hist_threshold(self):
......@@ -951,18 +963,11 @@ class PostTrainingQuantization(object):
else:
scale_dict = self._quantized_threshold
for key, val in scale_dict.items():
utils.set_variable_data(
self._scope,
self._place,
key + ".scale",
np.array(
[val], dtype=np.float32))
utils.set_variable_data(
self._scope,
self._place,
utils.set_variable_data(self._scope, self._place, key + ".scale",
np.array([val], dtype=np.float32))
utils.set_variable_data(self._scope, self._place,
key + ".quant_dequant.scale",
np.array(
[val], dtype=np.float32))
np.array([val], dtype=np.float32))
if not self._onnx_format:
# apply QuantizationFreezePass, and obtain the final quant model
......@@ -1038,8 +1043,8 @@ class PostTrainingQuantization(object):
for block_id in range(len(self._program.blocks)):
for op in self._program.blocks[block_id].ops:
if op.type in (
self._quantizable_op_type + self._out_scale_op_list):
if op.type in (self._quantizable_op_type +
self._out_scale_op_list):
out_var_names = utils._get_op_output_var_names(op)
for var_name in out_var_names:
analysis_and_save_info(op, var_name)
......@@ -1175,9 +1180,10 @@ class WeightQuantization(object):
if generate_test_model:
test_model_dir = os.path.join(save_model_dir, "test_model")
self._quantize_weight_to_int(
test_model_dir, save_model_filename, save_params_filename,
quantizable_op_type, weight_bits, weight_quantize_type, True,
self._quantize_weight_to_int(test_model_dir, save_model_filename,
save_params_filename,
quantizable_op_type, weight_bits,
weight_quantize_type, True,
threshold_rate)
def convert_weight_to_fp16(self, save_model_dir):
......@@ -1216,15 +1222,16 @@ class WeightQuantization(object):
if self._params_filename is not None:
save_var_map[new_var.name] = new_var
else:
save_file_path = os.path.join(
os.path.normpath(save_model_dir), new_var.name)
save_block.append_op(
type='save',
save_file_path = os.path.join(os.path.normpath(save_model_dir),
new_var.name)
save_block.append_op(type='save',
inputs={'X': [new_var]},
outputs={},
attrs={
'file_path': os.path.normpath(save_file_path),
'save_as_fp16': True
'file_path':
os.path.normpath(save_file_path),
'save_as_fp16':
True
})
if self._params_filename is not None:
......@@ -1237,14 +1244,15 @@ class WeightQuantization(object):
name=unique_name.generate("saved_params"))
saved_params_var.desc.set_persistable(True)
save_path = os.path.join(
os.path.normpath(save_model_dir), self._params_filename)
save_block.append_op(
type='save_combine',
save_path = os.path.join(os.path.normpath(save_model_dir),
self._params_filename)
save_block.append_op(type='save_combine',
inputs={'X': save_var_list},
outputs={'Y': saved_params_var},
attrs={'file_path': save_path,
'save_as_fp16': True})
attrs={
'file_path': save_path,
'save_as_fp16': True
})
save_program._sync_with_cpp()
exe.run(save_program)
......@@ -1293,8 +1301,7 @@ class WeightQuantization(object):
self._weight_channel_wise_abs_max_quantization(
scope, place, weight_bits, op, var_name, for_test)
io.save_inference_model(
dirname=save_model_dir,
io.save_inference_model(dirname=save_model_dir,
feeded_var_names=feed_list,
target_vars=fetch_list,
executor=exe,
......@@ -1339,8 +1346,9 @@ class WeightQuantization(object):
op._set_attr(var_name + "_quant_scale", [scale]) # Save as list
op._set_attr("with_quant_attr", True)
def _weight_channel_wise_abs_max_quantization(
self, scope, place, weight_bits, op, var_name, for_test):
def _weight_channel_wise_abs_max_quantization(self, scope, place,
weight_bits, op, var_name,
for_test):
'''
Use channel_wise_abs_max method to quantize weight.
'''
......@@ -1390,8 +1398,8 @@ class WeightQuantization(object):
and quantize the weights.
'''
scales = []
quantized_weight_data = np.zeros_like(
weight_data, dtype=save_weight_dtype)
quantized_weight_data = np.zeros_like(weight_data,
dtype=save_weight_dtype)
channel_num = weight_data.shape[0]
for i in range(channel_num):
scale = np.max(np.abs(weight_data[i])) / quantize_range
......@@ -1404,8 +1412,8 @@ class WeightQuantization(object):
'''
For conv2d and depthwise_conv2d, dequantize the weights to fp32.
'''
dequantized_weight_data = np.zeros_like(
quantized_weight_data, dtype=np.float32)
dequantized_weight_data = np.zeros_like(quantized_weight_data,
dtype=np.float32)
for i in range(len(scales)):
dequantized_weight_data[i] = \
(quantized_weight_data[i] * scales[i]).astype(np.float32)
......@@ -1418,8 +1426,8 @@ class WeightQuantization(object):
and quantize the weights.
'''
scales = []
quantized_weight_data = np.zeros_like(
weight_data, dtype=save_weight_dtype)
quantized_weight_data = np.zeros_like(weight_data,
dtype=save_weight_dtype)
channel_num = weight_data.shape[-1]
for i in range(channel_num):
scale = np.max(np.abs(weight_data[:, i])) / quantize_range
......@@ -1432,8 +1440,8 @@ class WeightQuantization(object):
'''
For mul, dequantize the weights to fp32.
'''
dequantized_weight_data = np.zeros_like(
quantized_weight_data, dtype=np.float32)
dequantized_weight_data = np.zeros_like(quantized_weight_data,
dtype=np.float32)
for i in range(len(scales)):
dequantized_weight_data[:, i] = \
(quantized_weight_data[:, i] * scales[i]).astype(np.float32)
......@@ -1441,8 +1449,9 @@ class WeightQuantization(object):
def _calculate_threshold(self, input, threshold_rate, histogram_bins=5000):
input_abs = np.abs(input)
hist, hist_edeges = np.histogram(
input_abs, bins=histogram_bins, range=(0, np.max(input_abs)))
hist, hist_edeges = np.histogram(input_abs,
bins=histogram_bins,
range=(0, np.max(input_abs)))
hist = hist / float(sum(hist))
hist_sum = 0
hist_index = 0
......
......@@ -321,7 +321,7 @@ def set_variable_data(scope, place, var_name, np_value):
tensor.set(np_value, place)
def quant_tensor(x, scale, quant_axis=0, weight_bits=8):
def quant_tensor(x, scale, quant_axis=0, weight_bits=8, onnx_format=False):
# symmetry quant
def _clip(x, scale):
x[x > scale] = scale
......@@ -335,13 +335,25 @@ def quant_tensor(x, scale, quant_axis=0, weight_bits=8):
if s == 0.0:
s = 1e-8
if quant_axis == 0:
if onnx_format:
x[i] = np.round(x[i] / s * bnt)
x[i] = np.clip(x[i], -bnt - 1, bnt)
else:
x[i] = _clip(x[i], s)
x[i] = x[i] / s * bnt
else:
if onnx_format:
x[:, i] = np.round(x[:, i] / s * bnt)
x[:, i] = np.clip(x[:, i], -bnt - 1, bnt)
else:
x[:, i] = _clip(x[:, i], s)
x[:, i] = x[:, i] / s * bnt
else:
scale = 1e-8 if scale == 0.0 else scale
if onnx_format:
x = np.round(x / scale * bnt)
x = np.clip(x, -bnt - 1, bnt)
else:
x = _clip(x, scale)
x = x / scale * bnt
return x
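For reference, the effect of the new `onnx_format` branch in `quant_tensor` above can be condensed into a per-tensor sketch (an illustrative simplification that uses `np.clip` in place of the `_clip` helper and assumes a symmetric scale): with `onnx_format=True` the tensor is scaled, rounded, and then clipped to the signed integer range [-bnt - 1, bnt], whereas the original path clips to [-scale, scale] first and leaves rounding to the caller.

    import numpy as np

    def quant_per_tensor(x, scale, weight_bits=8, onnx_format=False):
        bnt = (1 << (weight_bits - 1)) - 1          # 127 for 8-bit weights
        scale = 1e-8 if scale == 0.0 else scale
        if onnx_format:
            # new path: round first, then clip to [-128, 127]
            return np.clip(np.round(x / scale * bnt), -bnt - 1, bnt)
        # original path: clip to [-scale, scale], then rescale (no rounding here)
        return np.clip(x, -scale, scale) / scale * bnt

    w = np.array([0.5, 1.2, -1.7])
    print(quant_per_tensor(w, scale=1.0))                    # [ 63.5 127.  -127. ]
    print(quant_per_tensor(w, scale=1.0, onnx_format=True))  # [  64.  127. -128.]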
......@@ -416,6 +428,7 @@ def calculate_quant_cos_error(orig_tensor, qdq_tensor):
class tqdm(object):
def __init__(self, total, bar_format='Loading|{bar}', ncols=80):
self.total = total
self.bar_format = bar_format
......
......@@ -35,8 +35,9 @@ from paddle.fluid.framework import _test_eager_guard
from imperative_test_utils import fix_model_dict, ImperativeLenet, ImperativeLinearBn
from imperative_test_utils import ImperativeLinearBn_hook
_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
_logger = get_logger(__name__,
logging.INFO,
fmt='%(asctime)s-%(levelname)s: %(message)s')
class TestFuseLinearBn(unittest.TestCase):
......@@ -55,15 +56,15 @@ class TestFuseLinearBn(unittest.TestCase):
quant_h = ptq.quantize(model_h, fuse=True, fuse_list=f_l)
for name, layer in quant_model.named_sublayers():
if name in f_l:
assert not (isinstance(layer, nn.BatchNorm1D) or
isinstance(layer, nn.BatchNorm2D))
assert not (isinstance(layer, nn.BatchNorm1D)
or isinstance(layer, nn.BatchNorm2D))
out = model(inputs)
out_h = model_h(inputs)
out_quant = quant_model(inputs)
out_quant_h = quant_h(inputs)
cos_sim_func = nn.CosineSimilarity(axis=0)
print('fuse linear+bn',
cos_sim_func(out.flatten(), out_quant.flatten()))
print('fuse linear+bn', cos_sim_func(out.flatten(),
out_quant.flatten()))
print(cos_sim_func(out_h.flatten(), out_quant_h.flatten()))
......@@ -87,8 +88,8 @@ class TestImperativePTQ(unittest.TestCase):
def cache_unzipping(self, target_folder, zip_path):
if not os.path.exists(target_folder):
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
zip_path)
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
target_folder, zip_path)
os.system(cmd)
def download_model(self, data_url, data_md5, folder_name):
......@@ -123,8 +124,8 @@ class TestImperativePTQ(unittest.TestCase):
def model_test(self, model, batch_num=-1, batch_size=8):
model.eval()
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
test_reader = paddle.batch(paddle.dataset.mnist.test(),
batch_size=batch_size)
eval_acc_top1_list = []
for batch_id, data in enumerate(test_reader()):
......@@ -157,8 +158,8 @@ class TestImperativePTQ(unittest.TestCase):
[inference_program, feed_target_names, fetch_targets
] = (paddle.static.load_inference_model(program_path, exe))
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
test_reader = paddle.batch(paddle.dataset.mnist.test(),
batch_size=batch_size)
top1_correct_num = 0.
total_num = 0.
......@@ -203,13 +204,13 @@ class TestImperativePTQ(unittest.TestCase):
self.batch_size)
input_spec = [
paddle.static.InputSpec(
shape=[None, 1, 28, 28], dtype='float32')
paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32')
]
with tempfile.TemporaryDirectory(prefix="imperative_ptq_") as tmpdir:
save_path = os.path.join(tmpdir, "model")
self.ptq.save_quantized_model(
model=quant_model, path=save_path, input_spec=input_spec)
self.ptq.save_quantized_model(model=quant_model,
path=save_path,
input_spec=input_spec)
print('Quantized model saved in {%s}' % save_path)
after_acc_top1 = self.model_test(quant_model, self.batch_num,
......@@ -225,12 +226,10 @@ class TestImperativePTQ(unittest.TestCase):
print('After converted acc_top1: %s' % after_acc_top1)
print('Infer acc_top1: %s' % infer_acc_top1)
self.assertTrue(
after_acc_top1 >= self.eval_acc_top1,
self.assertTrue(after_acc_top1 >= self.eval_acc_top1,
msg="The test acc {%f} is less than {%f}." %
(after_acc_top1, self.eval_acc_top1))
self.assertTrue(
infer_acc_top1 >= after_acc_top1,
self.assertTrue(infer_acc_top1 >= after_acc_top1,
msg='The acc is lower after converting model.')
end_time = time.time()
......@@ -243,6 +242,7 @@ class TestImperativePTQ(unittest.TestCase):
class TestImperativePTQfuse(TestImperativePTQ):
def func_ptq(self):
start_time = time.time()
......@@ -261,19 +261,19 @@ class TestImperativePTQfuse(TestImperativePTQ):
quant_model = self.ptq.quantize(model, fuse=True, fuse_list=f_l)
for name, layer in quant_model.named_sublayers():
if name in f_l:
assert not (isinstance(layer, nn.BatchNorm1D) or
isinstance(layer, nn.BatchNorm2D))
assert not (isinstance(layer, nn.BatchNorm1D)
or isinstance(layer, nn.BatchNorm2D))
before_acc_top1 = self.model_test(quant_model, self.batch_num,
self.batch_size)
input_spec = [
paddle.static.InputSpec(
shape=[None, 1, 28, 28], dtype='float32')
paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32')
]
with tempfile.TemporaryDirectory(prefix="imperative_ptq_") as tmpdir:
save_path = os.path.join(tmpdir, "model")
self.ptq.save_quantized_model(
model=quant_model, path=save_path, input_spec=input_spec)
self.ptq.save_quantized_model(model=quant_model,
path=save_path,
input_spec=input_spec)
print('Quantized model saved in {%s}' % save_path)
after_acc_top1 = self.model_test(quant_model, self.batch_num,
......@@ -291,14 +291,12 @@ class TestImperativePTQfuse(TestImperativePTQ):
#Check whether the quant_model is correct after converting.
#The acc of quantized model should be higher than 0.95.
self.assertTrue(
after_acc_top1 >= self.eval_acc_top1,
self.assertTrue(after_acc_top1 >= self.eval_acc_top1,
msg="The test acc {%f} is less than {%f}." %
(after_acc_top1, self.eval_acc_top1))
#Check the saved infer_model.The acc of infer model
#should not be lower than the one of dygraph model.
self.assertTrue(
infer_acc_top1 >= after_acc_top1,
self.assertTrue(infer_acc_top1 >= after_acc_top1,
msg='The acc is lower after converting model.')
end_time = time.time()
......@@ -311,6 +309,7 @@ class TestImperativePTQfuse(TestImperativePTQ):
class TestImperativePTQHist(TestImperativePTQ):
def set_vars(self):
config = PTQConfig(HistQuantizer(), AbsmaxQuantizer())
self.ptq = ImperativePTQ(config)
......@@ -332,13 +331,14 @@ class TestImperativePTQHist(TestImperativePTQ):
class TestImperativePTQKL(TestImperativePTQ):
def set_vars(self):
config = PTQConfig(KLQuantizer(), PerChannelAbsmaxQuantizer())
self.ptq = ImperativePTQ(config)
self.batch_num = 10
self.batch_size = 10
self.eval_acc_top1 = 1.0
self.eval_acc_top1 = 0.98
conv2d_1_wt_thresholds = [
0.18116560578346252, 0.17079241573810577, 0.1702047884464264,
......
......@@ -34,6 +34,7 @@ np.random.seed(0)
class TestPostTrainingQuantization(unittest.TestCase):
def setUp(self):
self.download_path = 'int8/download'
self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
......@@ -44,8 +45,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
try:
os.system("mkdir -p " + self.int8_model_path)
except Exception as e:
print("Failed to create {} due to {}".format(self.int8_model_path,
str(e)))
print("Failed to create {} due to {}".format(
self.int8_model_path, str(e)))
sys.exit(-1)
def tearDown(self):
......@@ -53,8 +54,8 @@ class TestPostTrainingQuantization(unittest.TestCase):
def cache_unzipping(self, target_folder, zip_path):
if not os.path.exists(target_folder):
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder,
zip_path)
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
target_folder, zip_path)
os.system(cmd)
def download_model(self, data_url, data_md5, folder_name):
......@@ -68,6 +69,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
return data_cache_folder
def get_batch_reader(self, data_path, place):
def reader():
with open(data_path, 'rb') as in_file:
while True:
......@@ -80,15 +82,14 @@ class TestPostTrainingQuantization(unittest.TestCase):
seq_len = (alllen >> 16) & 0xFFFF
label = in_file.read(4 * label_len)
label = np.frombuffer(
label, dtype=np.int32).reshape([len(label) // 4])
label = np.frombuffer(label, dtype=np.int32).reshape(
[len(label) // 4])
if label.shape[0] != 1 or label[0] > 6350:
continue
feat = in_file.read(4 * seq_len * 8)
feat = np.frombuffer(
feat,
dtype=np.float32).reshape([len(feat) // 4 // 8, 8])
feat = np.frombuffer(feat, dtype=np.float32).reshape(
[len(feat) // 4 // 8, 8])
lod_feat = [feat.shape[0]]
minputs = fluid.create_lod_tensor(feat, [lod_feat], place)
......@@ -97,6 +98,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
return reader
def get_simple_reader(self, data_path, place):
def reader():
with open(data_path, 'rb') as in_file:
while True:
......@@ -109,15 +111,14 @@ class TestPostTrainingQuantization(unittest.TestCase):
seq_len = (alllen >> 16) & 0xFFFF
label = in_file.read(4 * label_len)
label = np.frombuffer(
label, dtype=np.int32).reshape([len(label) // 4])
label = np.frombuffer(label, dtype=np.int32).reshape(
[len(label) // 4])
if label.shape[0] != 1 or label[0] > 6350:
continue
feat = in_file.read(4 * seq_len * 8)
feat = np.frombuffer(
feat,
dtype=np.float32).reshape([len(feat) // 4 // 8, 8])
feat = np.frombuffer(feat, dtype=np.float32).reshape(
[len(feat) // 4 // 8, 8])
lod_feat = [feat.shape[0]]
minputs = fluid.create_lod_tensor(feat, [lod_feat], place)
......@@ -178,8 +179,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
scope = fluid.global_scope()
batch_generator = self.get_batch_reader(data_path, place)
ptq = PostTrainingQuantization(
executor=exe,
ptq = PostTrainingQuantization(executor=exe,
model_dir=model_path,
batch_generator=batch_generator,
batch_nums=batch_nums,
......@@ -223,10 +223,11 @@ class TestPostTrainingQuantization(unittest.TestCase):
print("Start post training quantization for {0} on {1} samples ...".
format(model_name, quant_iterations))
self.generate_quantized_model(
fp32_model_path, data_path, algo, round_type, quantizable_op_type,
is_full_quantize, is_use_cache_file, is_optimize_model,
quant_iterations, onnx_format)
self.generate_quantized_model(fp32_model_path, data_path, algo,
round_type, quantizable_op_type,
is_full_quantize, is_use_cache_file,
is_optimize_model, quant_iterations,
onnx_format)
print("Start INT8 inference for {0} on {1} samples ...".format(
model_name, infer_iterations))
......@@ -245,6 +246,7 @@ class TestPostTrainingQuantization(unittest.TestCase):
class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization):
def test_post_training_avg(self):
model_name = "nlp_lstm_fp32_model"
model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz"
......@@ -268,6 +270,7 @@ class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization):
class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization):
def test_post_training_avg_onnx_format(self):
model_name = "nlp_lstm_fp32_model"
model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz"
......@@ -285,8 +288,7 @@ class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization):
infer_iterations = 100
quant_iterations = 10
onnx_format = True
self.run_test(
model_name,
self.run_test(model_name,
model_url,
model_md5,
data_name,
......
......@@ -21,6 +21,7 @@ paddle.enable_static()
class TestPostTrainingForResnet50(TestPostTrainingQuantization):
def test_post_training_resnet50(self):
model = "ResNet-50"
algo = "min_max"
......@@ -40,6 +41,7 @@ class TestPostTrainingForResnet50(TestPostTrainingQuantization):
class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingQuantization):
def test_post_training_resnet50(self):
model = "ResNet-50"
algo = "min_max"
......@@ -54,8 +56,7 @@ class TestPostTrainingForResnet50ONNXFormat(TestPostTrainingQuantization):
is_optimize_model = False
diff_threshold = 0.025
onnx_format = True
self.run_test(
model,
self.run_test(model,
algo,
round_type,
data_urls,
......