Unverified commit 7debae3a, authored by J joanna.wozna.intel, committed by GitHub

Add fusion_gru and multi_gru to PTQ (Post-Training Quantization) (#33749)

* Add calculation for gru op

* Correct the types

* Remove mkldnn only

* Correct mkldnn ifdef

* Remove mkldnn ifdef

* Separate mkldnn quantizer test

* Correct Windows test

* Check different cmake fix

* Revert cmake change

* Cmake change 2

* Cmake change 3
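For orientation, below is a minimal usage sketch of the feature this commit adds: post-training INT8 quantization of a GRU-based model through the MKL-DNN quantizer. It only uses calls that appear in this commit's test code; the model directory, warmup tensors, and batch size are placeholders supplied by the caller.

// Minimal sketch: build an INT8 predictor for a GRU-based model.
// Assumes model_dir points to an inference model and warmup_data was
// built the way WarmupData() does further down in this diff.
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

std::unique_ptr<paddle::PaddlePredictor> BuildInt8GruPredictor(
    const std::string& model_dir,
    std::shared_ptr<std::vector<paddle::PaddleTensor>> warmup_data,
    int batch_size, bool fuse_multi_gru) {
  paddle::AnalysisConfig cfg(model_dir);
  cfg.EnableMKLDNN();
  if (fuse_multi_gru)  // optionally fuse stacked fusion_gru ops into multi_gru
    cfg.pass_builder()->AppendPass("multi_gru_fuse_pass");
  cfg.EnableMkldnnQuantizer();  // turn on post-training quantization
  cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
  cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(batch_size);
  return paddle::CreatePaddlePredictor(cfg);
}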
Parent 48bf7cbf
@@ -2249,9 +2249,9 @@ PDNode *patterns::MultipleQuantize::operator()() {
 PDNode *patterns::QuantizePlacement::operator()(
     const std::unordered_set<std::string> &quantize_enabled_op_types) {
   std::unordered_set<std::string> supported_op_types =
-      std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
-                                       "fc", "matmul", "pool2d", "prior_box",
-                                       "reshape2", "transpose2", "fusion_gru"});
+      std::unordered_set<std::string>(
+          {"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d",
+           "prior_box", "reshape2", "transpose2", "fusion_gru", "multi_gru"});
   if (!quantize_enabled_op_types.empty()) {
     supported_op_types = quantize_enabled_op_types;
   }
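Note that `quantize_enabled_op_types` above comes from the quantizer config; a non-empty user-provided set replaces the default list entirely rather than extending it. A hedged sketch, reusing `cfg` from the sketch above and assuming the `SetEnabledOpTypes` setter on `MkldnnQuantizerConfig`:

// Restrict quantization to the GRU ops only; this overrides the default
// supported_op_types list in QuantizePlacement. SetEnabledOpTypes is an
// assumption here, not shown in this diff.
cfg.mkldnn_quantizer_config()->SetEnabledOpTypes({"fusion_gru", "multi_gru"});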
@@ -832,7 +832,7 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern);
     GET_IR_NODE_FROM_SUBGRAPH(out, out, pattern);
-    if (!AreScalesPresentForNodes({x, weight_h, weight_x})) {
+    if (!AreScalesPresentForNodes({x, weight_x})) {
       LogCannotQuantizeOp(op);
       return;
     }
@@ -78,7 +78,6 @@ set(SHARED_INFERENCE_SRCS
     ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/utils/io_utils.cc
-    ${mkldnn_quantizer_src_file}
     ${PADDLE_CUSTOM_OP_SRCS})
 # shared inference library deps
@@ -20,10 +20,9 @@ endif(APPLE)
 add_subdirectory(details)
 if(WITH_MKLDNN)
-  set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc)
   set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
+  set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc)
   cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
-  set(mkldnn_quantizer_src_file ${mkldnn_quantizer_src} PARENT_SCOPE)
   set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE)
 endif()
@@ -71,6 +70,16 @@ elseif (WIN32)
   cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
           ARGS --dirname=${WORD2VEC_MODEL_DIR})
 endif()
+if(WITH_TESTING AND WITH_MKLDNN)
+  if (NOT APPLE AND NOT WIN32)
+    cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR})
+  elseif (WIN32)
+    cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
+            ARGS --dirname=${WORD2VEC_MODEL_DIR})
+  endif()
+endif()
 if(WITH_TESTING AND TEST test_api_impl)
   if(NOT APPLE)
     set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120)
@@ -22,9 +22,6 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
-#endif

 DEFINE_string(dirname, "", "dirname to tests.");
@@ -254,242 +251,6 @@ TEST(AnalysisPredictor, memory_optim) {
(The entire MkldnnQuantizerTest fixture and its tests below are removed from this file by this commit; they move to the new mkldnn_quantizer_tester.cc shown later in the diff.)
}
*/
#ifdef PADDLE_WITH_MKLDNN
class MkldnnQuantizerTest : public testing::Test {
public:
MkldnnQuantizerTest() {
AnalysisConfig config(FLAGS_dirname);
predictor = std::move(CreatePaddlePredictor(config));
auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get());
auto qconfig = new MkldnnQuantizerConfig();
mkldnn_quantizer.reset(
new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig));
}
std::pair<std::vector<int>, float> Histogram(
const framework::LoDTensor& var_tensor, float min_val, float max_val,
int num_bins) const {
return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins);
}
std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned);
}
std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned, 0);
}
std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned);
}
protected:
std::unique_ptr<PaddlePredictor> predictor;
std::unique_ptr<AnalysisPredictor::MkldnnQuantizer> mkldnn_quantizer;
float abs_error = 1e-6;
static const std::array<float, 10> non_negative_values;
static const std::array<float, 10> positive_and_negative_values;
};
const std::array<float, 10> MkldnnQuantizerTest::non_negative_values = {
0.0158671, 0.026459, 0.0280772, 0.00962479, 0.0131628,
0.016704, 0.00118407, 0.00765726, 0.0123213, 0.00944741};
const std::array<float, 10> MkldnnQuantizerTest::positive_and_negative_values =
{-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586,
-0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089};
TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) {
const auto& values = non_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3),
platform::EnforceNotMet);
}
TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) {
// all non-negative values
const auto& values = non_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
std::vector<int> histogram;
float bin_width;
std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error)
<< "Improperly calculated bin_width.";
ASSERT_EQ(histogram[0], 4);
ASSERT_EQ(histogram[1], 4);
ASSERT_EQ(histogram[2], 2);
}
TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) {
const auto& values = positive_and_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
std::vector<int> histogram;
float bin_width;
std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error)
<< "Improperly calculated bin_width.";
ASSERT_EQ(histogram[0], 3);
ASSERT_EQ(histogram[1], 5);
ASSERT_EQ(histogram[2], 2);
}
TEST_F(MkldnnQuantizerTest, histogram_zero_bins) {
const auto& values = non_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0),
platform::EnforceNotMet);
}
TEST_F(MkldnnQuantizerTest, histogram_empty) {
// empty tensor
ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet);
// zero tensor
framework::LoDTensor var_tensor;
var_tensor.Resize({0});
var_tensor.mutable_data<double>(platform::CPUPlace());
ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet);
}
TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) {
const auto& values = positive_and_negative_values;
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false);
ASSERT_EQ(is_unsigned, false);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0899106152344, abs_error);
}
TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) {
const auto& values = positive_and_negative_values;
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false);
ASSERT_EQ(is_unsigned, false);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
}
TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) {
const auto& values = non_negative_values;
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true);
ASSERT_EQ(is_unsigned, true);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
}
TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) {
const auto& values = non_negative_values;
auto max_val = *std::max_element(values.begin(), values.end());
int channels = 3;
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size()));
for (int i = 0; i < channels; i++)
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()) +
i * values.size());
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true);
ASSERT_EQ(is_unsigned, true);
ASSERT_EQ(lod_tensor.numel(), channels);
for (int i = 0; i < channels; i++) {
ASSERT_NEAR(lod_tensor.data<double>()[i], 1.0 / max_val, abs_error);
}
}
TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
const auto& values = non_negative_values;
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true);
ASSERT_EQ(is_unsigned, true);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0252845321362, abs_error);
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
AnalysisConfig config;
@@ -35,6 +35,7 @@ namespace paddle {
 using platform::CPUPlace;
 using framework::LoDTensor;
 using framework::Variable;
+using framework::ir::Graph;
 using ConstEigenVectorArrayMap =
     Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
@@ -44,90 +45,126 @@ using EigenMatrixArray =
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
using ConstEigenMatrixArrayMap = Eigen::Map<const EigenMatrixArray>;
using string::PrettyLogH1;
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
static LoDTensor CreateScaleTensor(int64_t channels_num = 1);
static void check_var(const Variable* var, const std::string& var_name) {
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet(
"%s is not in the scope", var_name));
PADDLE_ENFORCE_EQ(
var->IsType<LoDTensor>(), true,
platform::errors::PreconditionNotMet("Only support lod tensor now."));
}
static void check_tensor(const LoDTensor& tensor) {
PADDLE_ENFORCE_GT(tensor.dims().size(), 0, platform::errors::InvalidArgument(
"Tensor dimension is empty."));
}
void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForGRUWeights(
const paddle::framework::OpDesc* op) {
const auto& wx_names = op->Input("WeightX");
const auto& wh_names = op->Input("WeightH");
for (size_t i = 0; i < wx_names.size(); ++i) {
const auto& wx_name = wx_names[i];
const auto& wh_name = wh_names[i];
auto* wx_var = predictor_.sub_scope_->FindVar(wx_name);
auto* wh_var = predictor_.sub_scope_->FindVar(wh_name);
check_var(wx_var, wx_name);
check_var(wh_var, wh_name);
LoDTensor* wx_tensor = wx_var->GetMutable<LoDTensor>();
LoDTensor* wh_tensor = wh_var->GetMutable<LoDTensor>();
scales_[wx_name] = GetMaxChGRUScalingFactor(*wx_tensor, *wh_tensor);
}
}
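Note that the joint GRU weight scale is stored under the WeightX variable name only; WeightH gets no scales_ entry of its own. That is why the QuantizeFusionGru pattern check earlier in this diff now requires scales for {x, weight_x} instead of {x, weight_h, weight_x}.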
void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpInputs(
const paddle::framework::OpDesc* op) {
if (op->Type() == "fusion_gru" || op->Type() == "multi_gru") {
CalculateScalesForGRUWeights(op);
}
for (auto const& input : op->Inputs()) {
for (const auto& var_name : input.second) {
// skip if scale already computed
if (scales_.find(var_name) != scales_.end()) continue;
auto* var = predictor_.sub_scope_->FindVar(var_name);
check_var(var, var_name);
LoDTensor* var_tensor = var->GetMutable<LoDTensor>();
// force unsigned type if it is already known
bool is_unsigned = false;
CalculateSingleScale(op->Type(), input.first, var_name, *var_tensor,
is_unsigned);
}
}
}
void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs(
const paddle::framework::OpDesc* op) {
for (auto const& output : op->Outputs()) {
for (const auto& var_name : output.second) {
// skip if scale already computed
if (scales_.find(var_name) != scales_.end()) continue;
auto* var = predictor_.sub_scope_->FindVar(var_name);
check_var(var, var_name);
LoDTensor* var_tensor = var->GetMutable<LoDTensor>();
// force unsigned type if it is already known
bool is_unsigned = false;
bool compute_scale = true;
if (op->Type() == "conv2d" || op->Type() == "fc") {
// output of conv2d with relu must be unsigned
std::string fuse_activation =
op->GetAttrIfExists<std::string>("fuse_activation");
is_unsigned = (fuse_activation == "relu" || fuse_activation == "relu6");
} else if (op->Type() == "relu") {
is_unsigned = true;
} else if (op->Type() == "transpose2" || op->Type() == "reshape2" ||
op->Type() == "pool2d") {
auto input_var_name = op->Input("X")[0];
PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(),
platform::errors::PreconditionNotMet(
"Input scales must be calculated before the "
"output scales to infer if output is unsigned."));
if (scales_.find(input_var_name) != scales_.end()) {
scales_[var_name] = scales_[input_var_name];
}
compute_scale = false;
} else if (op->Type() == "concat") {
// output of ops with unsigned input must be unsigned
is_unsigned = true;
double min_scale = std::numeric_limits<double>::max();
for (auto input_var_name : op->Input("X")) {
PADDLE_ENFORCE_NE(
scales_.find(input_var_name), scales_.end(),
platform::errors::PreconditionNotMet(
"Input scales must be calculated before the "
"output scales to infer if output is unsigned."));
is_unsigned = is_unsigned && scales_[input_var_name].first;
min_scale = std::min(
min_scale, scales_[input_var_name].second.data<double>()[0]);
}
auto scale_tensor = CreateScaleTensor();
scale_tensor.data<double>()[0] = min_scale;
scales_[var_name] = {is_unsigned, scale_tensor};
compute_scale = false;
}
if (compute_scale) {
CalculateSingleScale(op->Type(), output.first, var_name, *var_tensor,
is_unsigned);
}
}
}
}
 bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
   PrettyLogH1("--- Calculating scales for quantization");
-  using VariableNameMap = std::map<std::string, std::vector<std::string>>;
   std::map<std::string, std::map<std::string, LoDTensor>> gathered_data;
   for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) {
     if (platform::HasOpINT8DataType(op)) {
-      const VariableNameMap& connections_in = op->Inputs();
-      const VariableNameMap& connections_out = op->Outputs();
-      auto glambda = [&](const VariableNameMap& connections, bool is_output) {
-        for (auto const& conn : connections) {
-          for (const auto& var_name : conn.second) {
-            // skip if scale already computed
-            if (scales_.find(var_name) != scales_.end()) continue;
-            auto* var = predictor_.sub_scope_->FindVar(var_name);
-            PADDLE_ENFORCE_NOT_NULL(var,
-                                    platform::errors::PreconditionNotMet(
-                                        "%s is not in the scope", var_name));
-            PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
-                              platform::errors::PreconditionNotMet(
-                                  "Only support lod tensor now."));
-            LoDTensor* var_tensor = var->GetMutable<LoDTensor>();
-            // force unsigned type if already know it
-            bool is_unsigned = false;
-            bool compute_scale = true;
-            if (is_output) {
-              if (op->Type() == "conv2d" || op->Type() == "fc") {
-                // output of conv2d with relu must be unsigned
-                std::string fuse_activation =
-                    op->GetAttrIfExists<std::string>("fuse_activation");
-                is_unsigned =
-                    (fuse_activation == "relu" || fuse_activation == "relu6");
-              } else if (op->Type() == "relu") {
-                is_unsigned = true;
-              } else if (op->Type() == "transpose2" ||
-                         op->Type() == "reshape2" || op->Type() == "pool2d") {
-                auto input_var_name = op->Input("X")[0];
-                PADDLE_ENFORCE_NE(
-                    scales_.find(input_var_name), scales_.end(),
-                    platform::errors::PreconditionNotMet(
-                        "Input scales must be calculated before the "
-                        "output scales to infer if output is unsigned."));
-                if (scales_.find(input_var_name) != scales_.end()) {
-                  scales_[var_name] = scales_[input_var_name];
-                }
-                compute_scale = false;
-              } else if (op->Type() == "concat") {
-                // output of ops with unsigned input must be unsigned
-                is_unsigned = true;
-                double min_scale = std::numeric_limits<double>::max();
-                for (auto input_var_name : op->Input("X")) {
-                  PADDLE_ENFORCE_NE(
-                      scales_.find(input_var_name), scales_.end(),
-                      platform::errors::PreconditionNotMet(
-                          "Input scales must be calculated before the "
-                          "output scales to infer if output is unsigned."));
-                  is_unsigned = is_unsigned && scales_[input_var_name].first;
-                  min_scale = std::min(
-                      min_scale,
-                      scales_[input_var_name].second.data<double>()[0]);
-                }
-                auto scale_tensor = CreateScaleTensor();
-                scale_tensor.data<double>()[0] = min_scale;
-                scales_[var_name] = {is_unsigned, scale_tensor};
-                compute_scale = false;
-              }
-            }
-            if (compute_scale)
-              CalculateSingleScale(op->Type(), conn.first, var_name,
-                                   *var_tensor, is_unsigned);
-          }
-        }
-      };
-      // handle inputs first to let is_unsigned be inferred for the outputs
-      glambda(connections_in, false /* is_output */);
-      glambda(connections_out, true /* is_output */);
+      CalculateScalesForOpInputs(op);
+      CalculateScalesForOpOutputs(op);
     }
   }
   return true;
 }
@@ -339,9 +376,7 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
 std::pair<bool, LoDTensor>
 AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
     const LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const {
-  PADDLE_ENFORCE_GT(
-      var_tensor.dims().size(), 0,
-      platform::errors::InvalidArgument("Tensor dimension is empty."));
+  check_tensor(var_tensor);
   ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
                                         var_tensor.numel(), 1};
@@ -374,6 +409,61 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
return std::make_pair(is_unsigned, scale_tensor);
}
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetMaxChGRUScalingFactor(
const LoDTensor& wx_tensor, const LoDTensor& wh_tensor) const {
check_tensor(wx_tensor);
check_tensor(wh_tensor);
int OC = wh_tensor.dims()[0];
std::vector<float> scale_ur(2 * OC);
std::vector<float> scale_o(OC);
for (int row_id = 0; row_id < wx_tensor.dims()[0]; row_id++) {
for (int col_id = 0; col_id < 2 * OC; col_id++) {
int idx = (row_id * wx_tensor.dims()[1]) + col_id;
auto abs_value = std::abs(wx_tensor.data<float>()[idx]);
if (row_id == 0) {
scale_ur[col_id] = abs_value;
} else {
if (abs_value > scale_ur[col_id]) scale_ur[col_id] = abs_value;
}
}
}
for (int i = 0; i < 2 * OC * OC; i++) {
int col_id = i % (2 * OC);
auto abs_value = std::abs(wh_tensor.data<float>()[i]);
if (abs_value > scale_ur[col_id]) scale_ur[col_id] = abs_value;
}
for (int row_id = 0; row_id < wx_tensor.dims()[0]; row_id++) {
for (int col_id = 2 * OC; col_id < wx_tensor.dims()[1]; col_id++) {
int idx = (row_id * wx_tensor.dims()[1]) + col_id;
auto abs_value = std::abs(wx_tensor.data<float>()[idx]);
if (row_id == 0) {
scale_o[col_id % OC] = abs_value;
} else {
if (abs_value > scale_o[col_id % OC]) scale_o[col_id % OC] = abs_value;
}
}
}
for (int i = 2 * OC * OC; i < OC * wh_tensor.dims()[1]; i++) {
int col_id = i % OC;
auto abs_value = std::abs(wh_tensor.data<float>()[i]);
if (abs_value > scale_o[col_id]) scale_o[col_id] = abs_value;
}
scale_ur.insert(scale_ur.end(), scale_o.begin(), scale_o.end());
std::transform(scale_ur.begin(), scale_ur.end(), scale_ur.begin(),
               [](float& c) { return 1 / c; });
LoDTensor scale_tensor = CreateScaleTensor(scale_ur.size());
auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
std::copy(scale_ur.begin(), scale_ur.end(), scale_ptr);
bool is_unsigned = false;
return std::make_pair(is_unsigned, scale_tensor);
}
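A reading of the computation above (not an authoritative spec): WeightX is laid out as [IC, 3*OC] with the update/reset-gate columns first and the output-gate columns last, while WeightH packs an [OC, 2*OC] update/reset block followed by an [OC, OC] output-gate block, hence the flat i % OC indexing. For every gate output channel c the scale is the reciprocal of the largest absolute weight seen for that channel across both tensors:

s_c = 1 / max{ |w| : w in channel c of WeightX and WeightH },   c = 0, ..., 3*OC - 1

With the OC = 2 test data added below, channel 0 sees max(0.40393898, 0.42484227) = 0.42484227, so s_0 = 1/0.42484227 ≈ 2.35381475 — the first expected value in the new max_ch_gru_scaling_factor test.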
std::pair<std::vector<int>, float>
AnalysisPredictor::MkldnnQuantizer::Histogram(
const framework::LoDTensor& var_tensor, float min_val, float max_val,
@@ -52,7 +52,7 @@ class AnalysisPredictor::MkldnnQuantizer {
   // Execute full quantization procedure.
   bool Quantize();

-#if PADDLE_WITH_TESTING
+#ifdef PADDLE_WITH_TESTING
   friend class MkldnnQuantizerTest;
 #endif
@@ -67,6 +67,11 @@ class AnalysisPredictor::MkldnnQuantizer {
                              const std::string& var_name,
                              const framework::LoDTensor& var_tensor,
                              bool is_unsigned);
+  void CalculateSingleGRUWeightsScale(const std::string& var_name,
+                                      const framework::LoDTensor& var_tensor);
+  void CalculateScalesForGRUWeights(const paddle::framework::OpDesc* op);
+  void CalculateScalesForOpOutputs(const paddle::framework::OpDesc* op);
+  void CalculateScalesForOpInputs(const paddle::framework::OpDesc* op);
   void PrepareArgument() const;
   void ClearDeviceContext() const;
   bool RunQuantizePasses() const;
@@ -82,6 +87,10 @@ class AnalysisPredictor::MkldnnQuantizer {
       const framework::LoDTensor& var_tensor, bool is_unsigned,
       bool is_transposed) const;
+  std::pair<bool, framework::LoDTensor> GetMaxChGRUScalingFactor(
+      const framework::LoDTensor& wx_tensor,
+      const framework::LoDTensor& wh_tensor) const;
   std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
       const framework::LoDTensor& var_tensor, bool is_unsigned) const;
@@ -55,6 +55,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
   rules_["elementwise_add"]["Y"] = ScaleAlgo::KL;
   rules_["elementwise_add"]["Out"] = ScaleAlgo::KL;
+  rules_["elementwise_mul"]["X"] = ScaleAlgo::KL;
+  rules_["elementwise_mul"]["Y"] = ScaleAlgo::KL;
+  rules_["elementwise_mul"]["Out"] = ScaleAlgo::KL;

   // Reshape2 does not perform calculation on the data and shapes are not
   // changed. Scale is calculated on input data and assigned to Quantize and
   // Dequantize scale.
@@ -63,6 +67,24 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
   rules_["reshape2"]["ShapeTensor"] = ScaleAlgo::NONE;
   rules_["reshape2"]["XShape"] = ScaleAlgo::NONE;
   rules_["reshape2"]["Out"] = ScaleAlgo::NONE;
+
+  rules_["fusion_gru"]["X"] = ScaleAlgo::KL;
+  rules_["fusion_gru"]["H0"] = ScaleAlgo::NONE;
+  rules_["fusion_gru"]["Bias"] = ScaleAlgo::NONE;
+  rules_["fusion_gru"]["WeightX"] = ScaleAlgo::NONE;  // Weights will be handled
+  rules_["fusion_gru"]["WeightH"] = ScaleAlgo::NONE;  // separately
+  rules_["fusion_gru"]["ReorderedH0"] = ScaleAlgo::NONE;
+  rules_["fusion_gru"]["XX"] = ScaleAlgo::NONE;
+  rules_["fusion_gru"]["BatchedInput"] = ScaleAlgo::NONE;
+  rules_["fusion_gru"]["BatchedOut"] = ScaleAlgo::NONE;
+  rules_["fusion_gru"]["Hidden"] = ScaleAlgo::KL;
+
+  rules_["multi_gru"]["X"] = ScaleAlgo::KL;
+  rules_["multi_gru"]["Bias"] = ScaleAlgo::NONE;
+  rules_["multi_gru"]["WeightX"] = ScaleAlgo::NONE;  // Weights will be handled
+  rules_["multi_gru"]["WeightH"] = ScaleAlgo::NONE;  // separately
+  rules_["multi_gru"]["Scale_weights"] = ScaleAlgo::NONE;
+  rules_["multi_gru"]["Hidden"] = ScaleAlgo::KL;
 }
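These defaults can be overridden per op connection; a hedged sketch, reusing `cfg` from the first sketch and assuming the `SetScaleAlgo` setter that writes into `rules_`:

// Use a plain max-based scale instead of KL for fusion_gru input
// activations. SetScaleAlgo(op_type, connection, algo) is an assumption
// here, not shown in this diff.
cfg.mkldnn_quantizer_config()->SetScaleAlgo("fusion_gru", "X",
                                            paddle::ScaleAlgo::MAX);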
ScaleAlgo MkldnnQuantizerConfig::scale_algo(
(New file added by this commit: mkldnn_quantizer_tester.cc, the standalone quantizer test registered in the CMake changes above.)
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
DEFINE_string(dirname, "", "dirname to tests.");
namespace paddle {
class MkldnnQuantizerTest : public testing::Test {
public:
MkldnnQuantizerTest() {
AnalysisConfig config(FLAGS_dirname);
predictor = std::move(CreatePaddlePredictor(config));
auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get());
auto qconfig = new MkldnnQuantizerConfig();
mkldnn_quantizer.reset(
new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig));
}
std::pair<std::vector<int>, float> Histogram(
const framework::LoDTensor& var_tensor, float min_val, float max_val,
int num_bins) const {
return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins);
}
std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned);
}
std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned, 0);
}
std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned);
}
std::pair<bool, framework::LoDTensor> GetMaxChGRUScalingFactor(
const framework::LoDTensor& wx_tensor,
const framework::LoDTensor& wh_tensor) const {
return mkldnn_quantizer->GetMaxChGRUScalingFactor(wx_tensor, wh_tensor);
}
protected:
std::unique_ptr<PaddlePredictor> predictor;
std::unique_ptr<AnalysisPredictor::MkldnnQuantizer> mkldnn_quantizer;
float abs_error = 1e-6;
static const std::array<float, 10> non_negative_values;
static const std::array<float, 10> positive_and_negative_values;
};
const std::array<float, 10> MkldnnQuantizerTest::non_negative_values = {
0.0158671, 0.026459, 0.0280772, 0.00962479, 0.0131628,
0.016704, 0.00118407, 0.00765726, 0.0123213, 0.00944741};
const std::array<float, 10> MkldnnQuantizerTest::positive_and_negative_values =
{-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586,
-0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089};
TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) {
const auto& values = non_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3),
platform::EnforceNotMet);
}
TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) {
// all non-negative values
const auto& values = non_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
std::vector<int> histogram;
float bin_width;
std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error)
<< "Improperly calculated bin_width.";
ASSERT_EQ(histogram[0], 4);
ASSERT_EQ(histogram[1], 4);
ASSERT_EQ(histogram[2], 2);
}
TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) {
const auto& values = positive_and_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
std::vector<int> histogram;
float bin_width;
std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error)
<< "Improperly calculated bin_width.";
ASSERT_EQ(histogram[0], 3);
ASSERT_EQ(histogram[1], 5);
ASSERT_EQ(histogram[2], 2);
}
TEST_F(MkldnnQuantizerTest, histogram_zero_bins) {
const auto& values = non_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0),
platform::EnforceNotMet);
}
TEST_F(MkldnnQuantizerTest, histogram_empty) {
// empty tensor
ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet);
// zero tensor
framework::LoDTensor var_tensor;
var_tensor.Resize({0});
var_tensor.mutable_data<double>(platform::CPUPlace());
ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet);
}
TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) {
const auto& values = positive_and_negative_values;
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false);
ASSERT_EQ(is_unsigned, false);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0899106152344, abs_error);
}
TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) {
const auto& values = positive_and_negative_values;
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false);
ASSERT_EQ(is_unsigned, false);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
}
TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) {
const auto& values = non_negative_values;
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true);
ASSERT_EQ(is_unsigned, true);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
}
TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) {
const auto& values = non_negative_values;
auto max_val = *std::max_element(values.begin(), values.end());
int channels = 3;
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size()));
for (int i = 0; i < channels; i++)
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()) +
i * values.size());
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true);
ASSERT_EQ(is_unsigned, true);
ASSERT_EQ(lod_tensor.numel(), channels);
for (int i = 0; i < channels; i++) {
ASSERT_NEAR(lod_tensor.data<double>()[i], 1.0 / max_val, abs_error);
}
}
TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
const auto& values = non_negative_values;
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true);
ASSERT_EQ(is_unsigned, true);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0252845321362, abs_error);
}
const std::vector<std::vector<float>> wx = {
{0.04347931, -0.5643393, 0.7551297, 0.26713502, 0.8055306, 0.91144973},
{0.01707571, 0.12741385, 0.15419468, 0.66127586, 0.46821925, 0.9665961},
{0.40393898, 0.884427, -0.5853097, 0.5840954, 0.9170512, 0.98245513}};
const std::vector<std::vector<float>> wh = {
{0.42484227, -0.9025513, 0.17087583, 0.8403284, 0.03325734, 0.92331886},
{0.32630175, 0.41691914, 0.99848574, 0.3504407, 0.06707559, 0.62239844}};
TEST_F(MkldnnQuantizerTest, max_ch_gru_scaling_factor) {
framework::LoDTensor wx_tensor, wh_tensor, lod_tensor;
wx_tensor.Resize(framework::make_dim(wx.size(), wx[0].size()));
for (size_t i = 0; i < wx.size(); i++)
std::copy(
begin(wx[i]), end(wx[i]),
wx_tensor.mutable_data<float>(platform::CPUPlace()) + i * wx[0].size());
wh_tensor.Resize(framework::make_dim(wh.size(), wh[0].size()));
for (size_t i = 0; i < wh.size(); i++)
std::copy(
begin(wh[i]), end(wh[i]),
wh_tensor.mutable_data<float>(platform::CPUPlace()) + i * wh[0].size());
bool is_unsigned;
std::tie(is_unsigned, lod_tensor) =
GetMaxChGRUScalingFactor(wx_tensor, wh_tensor);
std::vector<double> scales = {2.35381475, 1.08304947, 1.32427582,
1.19001095, 1.00151656, 1.01785819};
ASSERT_EQ(is_unsigned, false);
ASSERT_EQ(lod_tensor.numel(), static_cast<int64_t>(scales.size()));
for (int64_t i = 0; i < lod_tensor.numel(); i++) {
ASSERT_NEAR(lod_tensor.data<double>()[i], scales[i], abs_error);
}
}
} // namespace paddle
@@ -45,7 +45,9 @@ enum class ScaleAlgo {
   MAX_CH,      ///< Find scale based on the max absolute value per output channel
   MAX_CH_T,    ///< Find scale based on the max absolute value per output channel
                ///< of a transposed tensor
-  KL,          ///< Find scale based on KL Divergence
+  MAX_CH_GRU,  ///< Find scale based on the max absolute value per output
+               ///< channel for fusion_gru/multi_gru operators
+  KL,          ///< Find scale based on KL Divergence
 };
///
@@ -195,6 +195,21 @@ function(inference_analysis_api_lexical_bfloat16_test_run TARGET_NAME test_binary
--iterations=2)
endfunction()
function(inference_analysis_api_lexical_int8_test_run TARGET_NAME test_binary infer_model data_path fuse_multi_gru)
inference_analysis_test_run(${TARGET_NAME}
COMMAND ${test_binary}
ARGS --infer_model=${infer_model}
--infer_data=${data_path}
--batch_size=100
--cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
--with_accuracy_layer=true
--use_analysis=true
--enable_int8=true
--quantized_accuracy=0.01
--fuse_multi_gru=${fuse_multi_gru}
--iterations=4)
endfunction()
function(preprocess_data2bin_test_run target py_script_source data_dir output_file)
py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source}
ARGS --data_dir=${data_dir}
@@ -475,6 +490,10 @@ if(WITH_MKLDNN)
   inference_analysis_api_lexical_test_run(test_analyzer_lexical_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH})
   # run bfloat16 lexical analysis test
   inference_analysis_api_lexical_bfloat16_test_run(test_analyzer_lexical_gru_bfloat16 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH})
+  # run post-training quantization lexical analysis test
+  inference_analysis_api_lexical_int8_test_run(test_analyzer_lexical_gru_int8 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH} false)
+  # run post-training quantization lexical analysis test with multi_gru fuse
+  inference_analysis_api_lexical_int8_test_run(test_analyzer_lexical_gru_int8_multi_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH} true)

 ### optimized FP32 vs. Quant INT8 tests
@@ -54,6 +54,51 @@ std::vector<size_t> ReadSentenceLod(std::ifstream &file, size_t offset,
return sentence_lod;
}
std::shared_ptr<std::vector<PaddleTensor>> WarmupData(
const std::vector<std::vector<PaddleTensor>> &test_data,
int num_images = 1) {
int data_size = test_data.size();
  PADDLE_ENFORCE_LE(num_images, data_size,
                    platform::errors::InvalidArgument(
                        "The requested quantization warmup data size must be "
                        "less than or equal to the test data size. But received "
                        "warmup size is %d and test data size is %d",
                        num_images, data_size));
int words_shape = test_data[0][0].shape[0];
PaddleTensor words;
words.name = "words";
words.shape = {words_shape, 1};
words.dtype = PaddleDType::INT64;
words.data.Resize(sizeof(int64_t) * words_shape);
int target_shape = test_data[0][1].shape[0];
PaddleTensor targets;
targets.name = "targets";
targets.shape = {target_shape, 1};
targets.dtype = PaddleDType::INT64;
targets.data.Resize(sizeof(int64_t) * target_shape);
for (int i = 0; i < num_images; i++) {
std::copy_n(
static_cast<int64_t *>(test_data[i][0].data.data()) + i * words_shape,
words_shape,
static_cast<int64_t *>(words.data.data()) + i * words_shape);
words.lod = test_data[i][0].lod;
std::copy_n(
static_cast<int64_t *>(test_data[i][1].data.data()) + i * target_shape,
target_shape,
static_cast<int64_t *>(targets.data.data()) + i * target_shape);
targets.lod = test_data[i][1].lod;
}
auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(2);
(*warmup_data)[0] = std::move(words);
(*warmup_data)[1] = std::move(targets);
return warmup_data;
}
template <typename T>
class TensorReader {
public:
@@ -210,7 +255,19 @@ TEST(Analyzer_lexical_test, Analyzer_lexical_analysis) {
   if (FLAGS_use_analysis) {
     AnalysisConfig analysis_cfg;
     SetAnalysisConfig(&analysis_cfg, FLAGS_cpu_num_threads);
-    if (FLAGS_enable_bf16) analysis_cfg.EnableMkldnnBfloat16();
+    if (FLAGS_enable_bf16) {
+      analysis_cfg.EnableMkldnnBfloat16();
+    } else if (FLAGS_enable_int8) {
+      if (FLAGS_fuse_multi_gru)
+        analysis_cfg.pass_builder()->AppendPass("multi_gru_fuse_pass");
+      std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
+          WarmupData(input_slots_all);
+      analysis_cfg.EnableMkldnnQuantizer();
+      analysis_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
+      analysis_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(
+          FLAGS_batch_size);
+    }
     std::vector<double> acc_analysis(3);
     acc_analysis = Lexical_Test(input_slots_all, &outputs, &analysis_cfg, true);
     for (size_t i = 0; i < acc_analysis.size(); i++) {
@@ -73,6 +73,8 @@ DEFINE_int32(warmup_iters, 1, "Number of batches to process during warmup.");
 DEFINE_bool(enable_profile, false, "Turn on profiler for fluid");
 DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance.");
+DEFINE_bool(fuse_multi_gru, false,
+            "Running the inference program with multi_gru_fuse_pass");
namespace paddle {
namespace inference {