diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index b4c94010e480a79b54531a7f3e1bff6723cec2cb..e59d45db8cc622f9eeb507ee7a01986bc5db1e86 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2249,9 +2249,9 @@ PDNode *patterns::MultipleQuantize::operator()() { PDNode *patterns::QuantizePlacement::operator()( const std::unordered_set &quantize_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set({"concat", "conv2d", "elementwise_add", - "fc", "matmul", "pool2d", "prior_box", - "reshape2", "transpose2", "fusion_gru"}); + std::unordered_set( + {"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d", + "prior_box", "reshape2", "transpose2", "fusion_gru", "multi_gru"}); if (!quantize_enabled_op_types.empty()) { supported_op_types = quantize_enabled_op_types; } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index f50cd0a01d204dcce4935d93e3dd45554a654b82..0abee33b2942ada95591c18110d79c0b755fe8ba 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -832,7 +832,7 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern); GET_IR_NODE_FROM_SUBGRAPH(out, out, pattern); - if (!AreScalesPresentForNodes({x, weight_h, weight_x})) { + if (!AreScalesPresentForNodes({x, weight_x})) { LogCannotQuantizeOp(op); return; } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 6567c41ee1fedce441b61eb688e6304f9fd4b1c7..e0cda77d4fe1bbe619697b4f48ca55275e1f5bff 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -78,7 +78,6 @@ set(SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc 
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc ${CMAKE_CURRENT_SOURCE_DIR}/utils/io_utils.cc - ${mkldnn_quantizer_src_file} ${PADDLE_CUSTOM_OP_SRCS}) # shared inference library deps diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9e49dea9e674f135cd31a07a113532012769286f..741d4def5c04f28b011ef5e1d80fa1325c21f563 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -20,10 +20,9 @@ endif(APPLE) add_subdirectory(details) if(WITH_MKLDNN) - set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc) set(mkldnn_quantizer_cfg mkldnn_quantizer_config) + set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc) cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) - set(mkldnn_quantizer_src_file ${mkldnn_quantizer_src} PARENT_SCOPE) set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE) endif() @@ -71,6 +70,16 @@ elseif (WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() + +if(WITH_TESTING AND WITH_MKLDNN) + if (NOT APPLE AND NOT WIN32) + cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) + elseif (WIN32) + cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS analysis_predictor benchmark ${inference_deps} + ARGS --dirname=${WORD2VEC_MODEL_DIR}) + endif() +endif() + if(WITH_TESTING AND TEST test_api_impl) if(NOT APPLE) set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 703d65a6fc688cb06ecd7fc16c228518d2fe1261..c14614d68825e4cc7c4d696ab84da38de42bd8a5 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ 
b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -22,9 +22,6 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" #include "paddle/fluid/platform/cpu_info.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/inference/api/mkldnn_quantizer.h" -#endif DEFINE_string(dirname, "", "dirname to tests."); @@ -254,242 +251,6 @@ TEST(AnalysisPredictor, memory_optim) { } */ -#ifdef PADDLE_WITH_MKLDNN -class MkldnnQuantizerTest : public testing::Test { - public: - MkldnnQuantizerTest() { - AnalysisConfig config(FLAGS_dirname); - predictor = std::move(CreatePaddlePredictor(config)); - auto* predictor_p = static_cast(predictor.get()); - - auto qconfig = new MkldnnQuantizerConfig(); - - mkldnn_quantizer.reset( - new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig)); - } - - std::pair, float> Histogram( - const framework::LoDTensor& var_tensor, float min_val, float max_val, - int num_bins) const { - return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins); - } - - std::pair GetMaxScalingFactor( - const framework::LoDTensor& var_tensor, bool is_unsigned) const { - return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned); - } - - std::pair GetMaxChScalingFactor( - const framework::LoDTensor& var_tensor, bool is_unsigned) const { - return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned, 0); - } - - std::pair GetKLScalingFactor( - const framework::LoDTensor& var_tensor, bool is_unsigned) const { - return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned); - } - - protected: - std::unique_ptr predictor; - std::unique_ptr mkldnn_quantizer; - float abs_error = 1e-6; - static const std::array non_negative_values; - static const std::array positive_and_negative_values; -}; - -const std::array MkldnnQuantizerTest::non_negative_values = { - 0.0158671, 0.026459, 0.0280772, 0.00962479, 0.0131628, - 0.016704, 0.00118407, 0.00765726, 0.0123213, 
0.00944741}; -const std::array MkldnnQuantizerTest::positive_and_negative_values = - {-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586, - -0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089}; - -TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) { - const auto& values = non_negative_values; - auto min_val = *std::min_element(values.begin(), values.end()); - auto max_val = *std::max_element(values.begin(), values.end()); - - framework::LoDTensor var_tensor; - var_tensor.Resize(framework::make_dim(values.size())); - std::copy(begin(values), end(values), - var_tensor.mutable_data(platform::CPUPlace())); - - ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3), - platform::EnforceNotMet); -} - -TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) { - // all non-negative values - const auto& values = non_negative_values; - auto min_val = *std::min_element(values.begin(), values.end()); - auto max_val = *std::max_element(values.begin(), values.end()); - - framework::LoDTensor var_tensor; - var_tensor.Resize(framework::make_dim(values.size())); - std::copy(begin(values), end(values), - var_tensor.mutable_data(platform::CPUPlace())); - - std::vector histogram; - float bin_width; - - std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); - - ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error) - << "Improperly calculated bin_width."; - - ASSERT_EQ(histogram[0], 4); - ASSERT_EQ(histogram[1], 4); - ASSERT_EQ(histogram[2], 2); -} - -TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) { - const auto& values = positive_and_negative_values; - auto min_val = *std::min_element(values.begin(), values.end()); - auto max_val = *std::max_element(values.begin(), values.end()); - - framework::LoDTensor var_tensor; - var_tensor.Resize(framework::make_dim(values.size())); - std::copy(begin(values), end(values), - var_tensor.mutable_data(platform::CPUPlace())); - - std::vector histogram; - float bin_width; - 
- std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); - - ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error) - << "Improperly calculated bin_width."; - - ASSERT_EQ(histogram[0], 3); - ASSERT_EQ(histogram[1], 5); - ASSERT_EQ(histogram[2], 2); -} - -TEST_F(MkldnnQuantizerTest, histogram_zero_bins) { - const auto& values = non_negative_values; - auto min_val = *std::min_element(values.begin(), values.end()); - auto max_val = *std::max_element(values.begin(), values.end()); - - framework::LoDTensor var_tensor; - var_tensor.Resize(framework::make_dim(values.size())); - std::copy(begin(values), end(values), - var_tensor.mutable_data(platform::CPUPlace())); - - ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0), - platform::EnforceNotMet); -} - -TEST_F(MkldnnQuantizerTest, histogram_empty) { - // empty tensor - ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet); - - // zero tensor - framework::LoDTensor var_tensor; - var_tensor.Resize({0}); - var_tensor.mutable_data(platform::CPUPlace()); - - ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet); -} - -TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) { - const auto& values = positive_and_negative_values; - - framework::LoDTensor var_tensor; - var_tensor.Resize(framework::make_dim(values.size())); - std::copy(begin(values), end(values), - var_tensor.mutable_data(platform::CPUPlace())); - - bool is_unsigned; - framework::LoDTensor lod_tensor; - - std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false); - - ASSERT_EQ(is_unsigned, false); - ASSERT_EQ(lod_tensor.numel(), 1); - ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0899106152344, abs_error); -} - -TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) { - const auto& values = positive_and_negative_values; - auto max_val = *std::max_element(values.begin(), values.end()); - - framework::LoDTensor var_tensor; - var_tensor.Resize(framework::make_dim(values.size())); - 
std::copy(begin(values), end(values), - var_tensor.mutable_data(platform::CPUPlace())); - - bool is_unsigned; - framework::LoDTensor lod_tensor; - - std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false); - - ASSERT_EQ(is_unsigned, false); - ASSERT_EQ(lod_tensor.numel(), 1); - ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); -} - -TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) { - const auto& values = non_negative_values; - auto max_val = *std::max_element(values.begin(), values.end()); - - framework::LoDTensor var_tensor; - var_tensor.Resize(framework::make_dim(values.size())); - std::copy(begin(values), end(values), - var_tensor.mutable_data(platform::CPUPlace())); - - bool is_unsigned; - framework::LoDTensor lod_tensor; - - std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true); - - ASSERT_EQ(is_unsigned, true); - ASSERT_EQ(lod_tensor.numel(), 1); - ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); -} - -TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) { - const auto& values = non_negative_values; - auto max_val = *std::max_element(values.begin(), values.end()); - int channels = 3; - - framework::LoDTensor var_tensor; - var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size())); - for (int i = 0; i < channels; i++) - std::copy(begin(values), end(values), - var_tensor.mutable_data(platform::CPUPlace()) + - i * values.size()); - - bool is_unsigned; - framework::LoDTensor lod_tensor; - - std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true); - - ASSERT_EQ(is_unsigned, true); - ASSERT_EQ(lod_tensor.numel(), channels); - for (int i = 0; i < channels; i++) { - ASSERT_NEAR(lod_tensor.data()[i], 1.0 / max_val, abs_error); - } -} - -TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { - const auto& values = non_negative_values; - - framework::LoDTensor var_tensor; - var_tensor.Resize(framework::make_dim(values.size())); - std::copy(begin(values), 
end(values), - var_tensor.mutable_data(platform::CPUPlace())); - - bool is_unsigned; - framework::LoDTensor lod_tensor; - - std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true); - - ASSERT_EQ(is_unsigned, true); - ASSERT_EQ(lod_tensor.numel(), 1); - ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0252845321362, abs_error); -} -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(AnalysisPredictor, bf16_gpu_pass_strategy) { AnalysisConfig config; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index f6cdbb00b50453d4c4ff7fc06ba82aa042dd194a..b3768dda24c07be4da4c011501dd9f940c5431a2 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -35,6 +35,7 @@ namespace paddle { using platform::CPUPlace; using framework::LoDTensor; +using framework::Variable; using framework::ir::Graph; using ConstEigenVectorArrayMap = Eigen::Map>; @@ -44,90 +45,126 @@ using EigenMatrixArray = Eigen::Array; using ConstEigenMatrixArrayMap = Eigen::Map; using string::PrettyLogH1; +using VariableNameMap = std::map>; static LoDTensor CreateScaleTensor(int64_t channels_num = 1); +static void check_var(const Variable* var, const std::string& var_name) { + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( + "%s is not in the scope", var_name)); + PADDLE_ENFORCE_EQ( + var->IsType(), true, + platform::errors::PreconditionNotMet("Only support lod tensor now.")); +} + +static void check_tensor(const LoDTensor& tensor) { + PADDLE_ENFORCE_GT(tensor.dims().size(), 0, platform::errors::InvalidArgument( + "Tensor dimension is empty.")); +} + +void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForGRUWeights( + const paddle::framework::OpDesc* op) { + const auto& wx_names = op->Input("WeightX"); + const auto& wh_names = op->Input("WeightH"); + for (size_t i = 0; i < wx_names.size(); ++i) { + const auto& wx_name = wx_names[i]; + const 
auto& wh_name = wh_names[i]; + auto* wx_var = predictor_.sub_scope_->FindVar(wx_name); + auto* wh_var = predictor_.sub_scope_->FindVar(wh_name); + check_var(wx_var, wx_name); + check_var(wh_var, wh_name); + LoDTensor* wx_tensor = wx_var->GetMutable(); + LoDTensor* wh_tensor = wh_var->GetMutable(); + scales_[wx_name] = GetMaxChGRUScalingFactor(*wx_tensor, *wh_tensor); + } +} + +void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpInputs( + const paddle::framework::OpDesc* op) { + if (op->Type() == "fusion_gru" || op->Type() == "multi_gru") { + CalculateScalesForGRUWeights(op); + } + for (auto const& input : op->Inputs()) { + for (const auto& var_name : input.second) { + // skip if scale already computed + if (scales_.find(var_name) != scales_.end()) continue; + auto* var = predictor_.sub_scope_->FindVar(var_name); + check_var(var, var_name); + LoDTensor* var_tensor = var->GetMutable(); + // force unsigned type if already know it + bool is_unsigned = false; + CalculateSingleScale(op->Type(), input.first, var_name, *var_tensor, + is_unsigned); + } + } +} + +void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( + const paddle::framework::OpDesc* op) { + for (auto const& output : op->Outputs()) { + for (const auto& var_name : output.second) { + // skip if scale already computed + if (scales_.find(var_name) != scales_.end()) continue; + auto* var = predictor_.sub_scope_->FindVar(var_name); + check_var(var, var_name); + LoDTensor* var_tensor = var->GetMutable(); + // force unsigned type if already know it + bool is_unsigned = false; + bool compute_scale = true; + if (op->Type() == "conv2d" || op->Type() == "fc") { + // output of conv2d with relu must be unsigned + std::string fuse_activation = + op->GetAttrIfExists("fuse_activation"); + is_unsigned = (fuse_activation == "relu" || fuse_activation == "relu6"); + } else if (op->Type() == "relu") { + is_unsigned = true; + } else if (op->Type() == "transpose2" || op->Type() == "reshape2" || + op->Type() 
== "pool2d") { + auto input_var_name = op->Input("X")[0]; + PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(), + platform::errors::PreconditionNotMet( + "Input scales must be calculated before the " + "output scales to infer if output is unsigned.")); + if (scales_.find(input_var_name) != scales_.end()) { + scales_[var_name] = scales_[input_var_name]; + } + compute_scale = false; + } else if (op->Type() == "concat") { + // output of ops with unsigned input must be unsigned + is_unsigned = true; + double min_scale = std::numeric_limits::max(); + for (auto input_var_name : op->Input("X")) { + PADDLE_ENFORCE_NE( + scales_.find(input_var_name), scales_.end(), + platform::errors::PreconditionNotMet( + "Input scales must be calculated before the " + "output scales to infer if output is unsigned.")); + is_unsigned = is_unsigned && scales_[input_var_name].first; + min_scale = std::min( + min_scale, scales_[input_var_name].second.data()[0]); + } + auto scale_tensor = CreateScaleTensor(); + scale_tensor.data()[0] = min_scale; + scales_[var_name] = {is_unsigned, scale_tensor}; + compute_scale = false; + } + if (compute_scale) { + CalculateSingleScale(op->Type(), output.first, var_name, *var_tensor, + is_unsigned); + } + } + } +} + bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { PrettyLogH1("--- Calculating scales for quantization"); - using VariableNameMap = std::map>; std::map> gathered_data; for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) { if (platform::HasOpINT8DataType(op)) { - const VariableNameMap& connections_in = op->Inputs(); - const VariableNameMap& connections_out = op->Outputs(); - - auto glambda = [&](const VariableNameMap& connections, bool is_output) { - for (auto const& conn : connections) { - for (const auto& var_name : conn.second) { - // skip if scale already computed - if (scales_.find(var_name) != scales_.end()) continue; - - auto* var = predictor_.sub_scope_->FindVar(var_name); - 
PADDLE_ENFORCE_NOT_NULL(var, - platform::errors::PreconditionNotMet( - "%s is not in the scope", var_name)); - PADDLE_ENFORCE_EQ(var->IsType(), true, - platform::errors::PreconditionNotMet( - "Only support lod tensor now.")); - LoDTensor* var_tensor = var->GetMutable(); - - // force unsigned type if already know it - bool is_unsigned = false; - bool compute_scale = true; - if (is_output) { - if (op->Type() == "conv2d" || op->Type() == "fc") { - // output of conv2d with relu must be unsigned - std::string fuse_activation = - op->GetAttrIfExists("fuse_activation"); - is_unsigned = - (fuse_activation == "relu" || fuse_activation == "relu6"); - } else if (op->Type() == "relu") { - is_unsigned = true; - } else if (op->Type() == "transpose2" || - op->Type() == "reshape2" || op->Type() == "pool2d") { - auto input_var_name = op->Input("X")[0]; - PADDLE_ENFORCE_NE( - scales_.find(input_var_name), scales_.end(), - platform::errors::PreconditionNotMet( - "Input scales must be calculated before the " - "output scales to infer if output is unsigned.")); - if (scales_.find(input_var_name) != scales_.end()) { - scales_[var_name] = scales_[input_var_name]; - } - compute_scale = false; - } else if (op->Type() == "concat") { - // output of ops with unsigned input must be unsigned - is_unsigned = true; - double min_scale = std::numeric_limits::max(); - for (auto input_var_name : op->Input("X")) { - PADDLE_ENFORCE_NE( - scales_.find(input_var_name), scales_.end(), - platform::errors::PreconditionNotMet( - "Input scales must be calculated before the " - "output scales to infer if output is unsigned.")); - is_unsigned = is_unsigned && scales_[input_var_name].first; - min_scale = std::min( - min_scale, - scales_[input_var_name].second.data()[0]); - } - auto scale_tensor = CreateScaleTensor(); - scale_tensor.data()[0] = min_scale; - scales_[var_name] = {is_unsigned, scale_tensor}; - compute_scale = false; - } - } - if (compute_scale) - CalculateSingleScale(op->Type(), conn.first, 
var_name, - *var_tensor, is_unsigned); - } - } - }; - // handle inputs first to let is_unsigned be inferred for the outputs - glambda(connections_in, false /* is_output */); - glambda(connections_out, true /* is_output */); + CalculateScalesForOpInputs(op); + CalculateScalesForOpOutputs(op); } } - return true; } @@ -339,9 +376,7 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor( std::pair AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( const LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const { - PADDLE_ENFORCE_GT( - var_tensor.dims().size(), 0, - platform::errors::InvalidArgument("Tensor dimension is empty.")); + check_tensor(var_tensor); ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), var_tensor.numel(), 1}; @@ -374,6 +409,61 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( return std::make_pair(is_unsigned, scale_tensor); } +std::pair +AnalysisPredictor::MkldnnQuantizer::GetMaxChGRUScalingFactor( + const LoDTensor& wx_tensor, const LoDTensor& wh_tensor) const { + check_tensor(wx_tensor); + check_tensor(wh_tensor); + + int OC = wh_tensor.dims()[0]; + std::vector scale_ur(2 * OC); + std::vector scale_o(OC); + + for (int row_id = 0; row_id < wx_tensor.dims()[0]; row_id++) { + for (int col_id = 0; col_id < 2 * OC; col_id++) { + int idx = (row_id * wx_tensor.dims()[1]) + col_id; + auto abs_value = std::abs(wx_tensor.data()[idx]); + if (row_id == 0) { + scale_ur[col_id] = abs_value; + } else { + if (abs_value > scale_ur[col_id]) scale_ur[col_id] = abs_value; + } + } + } + + for (int i = 0; i < 2 * OC * OC; i++) { + int col_id = i % (2 * OC); + auto abs_value = std::abs(wh_tensor.data()[i]); + if (abs_value > scale_ur[col_id]) scale_ur[col_id] = abs_value; + } + + for (int row_id = 0; row_id < wx_tensor.dims()[0]; row_id++) { + for (int col_id = 2 * OC; col_id < wx_tensor.dims()[1]; col_id++) { + int idx = (row_id * wx_tensor.dims()[1]) + col_id; + auto abs_value = std::abs(wx_tensor.data()[idx]); + if (row_id == 
0) { + scale_o[col_id % OC] = abs_value; + } else { + if (abs_value > scale_o[col_id % OC]) scale_o[col_id % OC] = abs_value; + } + } + } + + for (int i = 2 * OC * OC; i < OC * wh_tensor.dims()[1]; i++) { + int col_id = i % OC; + auto abs_value = std::abs(wh_tensor.data()[i]); + if (abs_value > scale_o[col_id]) scale_o[col_id] = abs_value; + } + scale_ur.insert(scale_ur.end(), scale_o.begin(), scale_o.end()); + transform(scale_ur.begin(), scale_ur.end(), scale_ur.begin(), + [](float& c) { return 1 / c; }); + LoDTensor scale_tensor = CreateScaleTensor(scale_ur.size()); + auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); + std::copy(scale_ur.begin(), scale_ur.end(), scale_ptr); + bool is_unsigned = false; + return std::make_pair(is_unsigned, scale_tensor); +} + std::pair, float> AnalysisPredictor::MkldnnQuantizer::Histogram( const framework::LoDTensor& var_tensor, float min_val, float max_val, diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h index eeaba7952902b04e83cc5b6c890ecf510b914c65..c41b9d08f676227ef8330880fc01b74a86aa05f1 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.h +++ b/paddle/fluid/inference/api/mkldnn_quantizer.h @@ -52,7 +52,7 @@ class AnalysisPredictor::MkldnnQuantizer { // Execute full quantization procedure.
bool Quantize(); -#if PADDLE_WITH_TESTING +#ifdef PADDLE_WITH_TESTING friend class MkldnnQuantizerTest; #endif @@ -67,6 +67,11 @@ class AnalysisPredictor::MkldnnQuantizer { const std::string& var_name, const framework::LoDTensor& var_tensor, bool is_unsigned); + void CalculateSingleGRUWeightsScale(const std::string& var_name, + const framework::LoDTensor& var_tensor); + void CalculateScalesForGRUWeights(const paddle::framework::OpDesc* op); + void CalculateScalesForOpOutputs(const paddle::framework::OpDesc* op); + void CalculateScalesForOpInputs(const paddle::framework::OpDesc* op); void PrepareArgument() const; void ClearDeviceContext() const; bool RunQuantizePasses() const; @@ -82,6 +87,10 @@ class AnalysisPredictor::MkldnnQuantizer { const framework::LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const; + std::pair GetMaxChGRUScalingFactor( + const framework::LoDTensor& wx_tensor, + const framework::LoDTensor& wh_tensor) const; + std::pair GetMaxScalingFactor( const framework::LoDTensor& var_tensor, bool is_unsigned) const; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index 30c6c21ec87f36cdfe0f4ef7950236763c13191a..245bee57c98fc3a15db4a86f193e955b0009c40e 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -55,6 +55,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["elementwise_add"]["Y"] = ScaleAlgo::KL; rules_["elementwise_add"]["Out"] = ScaleAlgo::KL; + rules_["elementwise_mul"]["X"] = ScaleAlgo::KL; + rules_["elementwise_mul"]["Y"] = ScaleAlgo::KL; + rules_["elementwise_mul"]["Out"] = ScaleAlgo::KL; + // Reshape2 does not perform calculation on the data and shapes are not // changed. Scale is calculated on input data and assign to Quantize and // Dequantize scale. 
@@ -63,6 +67,24 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["reshape2"]["ShapeTensor"] = ScaleAlgo::NONE; rules_["reshape2"]["XShape"] = ScaleAlgo::NONE; rules_["reshape2"]["Out"] = ScaleAlgo::NONE; + + rules_["fusion_gru"]["X"] = ScaleAlgo::KL; + rules_["fusion_gru"]["H0"] = ScaleAlgo::NONE; + rules_["fusion_gru"]["Bias"] = ScaleAlgo::NONE; + rules_["fusion_gru"]["WeightX"] = ScaleAlgo::NONE; // Weights will be handled + rules_["fusion_gru"]["WeightH"] = ScaleAlgo::NONE; // separately + rules_["fusion_gru"]["ReorderedH0"] = ScaleAlgo::NONE; + rules_["fusion_gru"]["XX"] = ScaleAlgo::NONE; + rules_["fusion_gru"]["BatchedInput"] = ScaleAlgo::NONE; + rules_["fusion_gru"]["BatchedOut"] = ScaleAlgo::NONE; + rules_["fusion_gru"]["Hidden"] = ScaleAlgo::KL; + + rules_["multi_gru"]["X"] = ScaleAlgo::KL; + rules_["multi_gru"]["Bias"] = ScaleAlgo::NONE; + rules_["multi_gru"]["WeightX"] = ScaleAlgo::NONE; // Weights will be handled + rules_["multi_gru"]["WeightH"] = ScaleAlgo::NONE; // separately + rules_["multi_gru"]["Scale_weights"] = ScaleAlgo::NONE; + rules_["multi_gru"]["Hidden"] = ScaleAlgo::KL; } ScaleAlgo MkldnnQuantizerConfig::scale_algo( diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc b/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..954a9806bec8c383db4a106091857da281dd8695 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer_tester.cc @@ -0,0 +1,300 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(dirname, "", "dirname to tests."); + +namespace paddle { + +class MkldnnQuantizerTest : public testing::Test { + public: + MkldnnQuantizerTest() { + AnalysisConfig config(FLAGS_dirname); + predictor = std::move(CreatePaddlePredictor(config)); + auto* predictor_p = static_cast(predictor.get()); + + auto qconfig = new MkldnnQuantizerConfig(); + + mkldnn_quantizer.reset( + new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig)); + } + + std::pair, float> Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + int num_bins) const { + return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins); + } + + std::pair GetMaxScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned); + } + + std::pair GetMaxChScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned, 0); + } + + std::pair GetKLScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned); + } + + std::pair GetMaxChGRUScalingFactor( + const framework::LoDTensor& wx_tensor, + const framework::LoDTensor& wh_tensor) 
const { + return mkldnn_quantizer->GetMaxChGRUScalingFactor(wx_tensor, wh_tensor); + } + + protected: + std::unique_ptr predictor; + std::unique_ptr mkldnn_quantizer; + float abs_error = 1e-6; + static const std::array non_negative_values; + static const std::array positive_and_negative_values; +}; + +const std::array MkldnnQuantizerTest::non_negative_values = { + 0.0158671, 0.026459, 0.0280772, 0.00962479, 0.0131628, + 0.016704, 0.00118407, 0.00765726, 0.0123213, 0.00944741}; +const std::array MkldnnQuantizerTest::positive_and_negative_values = + {-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586, + -0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089}; + +TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) { + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) { + // all non-negative values + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + std::vector histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 4); + ASSERT_EQ(histogram[1], 4); + ASSERT_EQ(histogram[2], 2); +} + 
+TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) { + const auto& values = positive_and_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + std::vector histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 3); + ASSERT_EQ(histogram[1], 5); + ASSERT_EQ(histogram[2], 2); +} + +TEST_F(MkldnnQuantizerTest, histogram_zero_bins) { + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, histogram_empty) { + // empty tensor + ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet); + + // zero tensor + framework::LoDTensor var_tensor; + var_tensor.Resize({0}); + var_tensor.mutable_data(platform::CPUPlace()); + + ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) { + const auto& values = positive_and_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + 
std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0899106152344, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) { + const auto& values = positive_and_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + int channels = 3; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size())); + for (int i = 0; i < channels; i++) + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace()) + + i * values.size()); + + bool is_unsigned; + framework::LoDTensor lod_tensor; 
+ + std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), channels); + for (int i = 0; i < channels; i++) { + ASSERT_NEAR(lod_tensor.data()[i], 1.0 / max_val, abs_error); + } +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { + const auto& values = non_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0252845321362, abs_error); +} + +const std::vector> wx = { + {0.04347931, -0.5643393, 0.7551297, 0.26713502, 0.8055306, 0.91144973}, + {0.01707571, 0.12741385, 0.15419468, 0.66127586, 0.46821925, 0.9665961}, + {0.40393898, 0.884427, -0.5853097, 0.5840954, 0.9170512, 0.98245513}}; +const std::vector> wh = { + {0.42484227, -0.9025513, 0.17087583, 0.8403284, 0.03325734, 0.92331886}, + {0.32630175, 0.41691914, 0.99848574, 0.3504407, 0.06707559, 0.62239844}}; + +TEST_F(MkldnnQuantizerTest, max_ch_gru_scaling_factor) { + framework::LoDTensor wx_tensor, wh_tensor, lod_tensor; + + wx_tensor.Resize(framework::make_dim(wx.size(), wx[0].size())); + for (size_t i = 0; i < wx.size(); i++) + std::copy( + begin(wx[i]), end(wx[i]), + wx_tensor.mutable_data(platform::CPUPlace()) + i * wx[0].size()); + + wh_tensor.Resize(framework::make_dim(wh.size(), wh[0].size())); + for (size_t i = 0; i < wh.size(); i++) + std::copy( + begin(wh[i]), end(wh[i]), + wh_tensor.mutable_data(platform::CPUPlace()) + i * wh[0].size()); + + bool is_unsigned; + std::tie(is_unsigned, lod_tensor) = + GetMaxChGRUScalingFactor(wx_tensor, wh_tensor); + + std::vector scales = {2.35381475, 1.08304947, 1.32427582, + 
1.19001095, 1.00151656, 1.01785819}; + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), static_cast(scales.size())); + for (int64_t i = 0; i < lod_tensor.numel(); i++) { + ASSERT_NEAR(lod_tensor.data()[i], scales[i], abs_error); + } +} +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h index 8bad8f39f7e4c7f19f7acea9233fa1df6002fd2b..a9083a7895e4d92e86339274382bd892a1c7e926 100644 --- a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h +++ b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h @@ -45,7 +45,9 @@ enum class ScaleAlgo { MAX_CH, ///< Find scale based on the max absolute value per output channel MAX_CH_T, ///< Find scale based on the max absolute value per output channel ///< of a transposed tensor - KL, ///< Find scale based on KL Divergence + MAX_CH_GRU, ///< Find scale based on the max absolute value per output + ///< channel for fusion_gru/multi_gru operators + KL, ///< Find scale based on KL Divergence }; /// diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index f0eb0d1fa675b7e88aae44acd79e425a2bc70e47..747cf137464f9e58fc2101bdf6edf9748cb783e1 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -195,6 +195,21 @@ function(inference_analysis_api_lexical_bfloat16_test_run TARGET_NAME test_binar --iterations=2) endfunction() +function(inference_analysis_api_lexical_int8_test_run TARGET_NAME test_binary infer_model data_path fuse_multi_gru) + inference_analysis_test_run(${TARGET_NAME} + COMMAND ${test_binary} + ARGS --infer_model=${infer_model} + --infer_data=${data_path} + --batch_size=100 + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --with_accuracy_layer=true + --use_analysis=true + --enable_int8=true + --quantized_accuracy=0.01 + --fuse_multi_gru=${fuse_multi_gru} + --iterations=4) 
+endfunction() + function(preprocess_data2bin_test_run target py_script_source data_dir output_file) py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${py_script_source} ARGS --data_dir=${data_dir} @@ -475,6 +490,10 @@ if(WITH_MKLDNN) inference_analysis_api_lexical_test_run(test_analyzer_lexical_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH}) # run bfloat16 lexical analysis test inference_analysis_api_lexical_bfloat16_test_run(test_analyzer_lexical_gru_bfloat16 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH}) + # run post-training quantization lexical analysis test + inference_analysis_api_lexical_int8_test_run(test_analyzer_lexical_gru_int8 ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH} false) + # run post-training quantization lexical analysis test with multi_gru fuse + inference_analysis_api_lexical_int8_test_run(test_analyzer_lexical_gru_int8_multi_gru ${LEXICAL_TEST_APP} ${GRU_MODEL_PATH} ${GRU_DATA_PATH} true) ### optimized FP32 vs. Quant INT8 tests diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc index 720c90090cf746121ee79b44bd3c9ab35b736dba..cca8ac2634c6cce278c0148bf76ccb5df985b67f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc @@ -54,6 +54,51 @@ std::vector ReadSentenceLod(std::ifstream &file, size_t offset, return sentence_lod; } +std::shared_ptr> WarmupData( + const std::vector> &test_data, + int num_images = 1) { + int data_size = test_data.size(); + + PADDLE_ENFORCE_LE(static_cast(num_images), data_size, + platform::errors::InvalidArgument( + "The requested quantization warmup data size must be " + "lower or equal to the test data size. 
But received " + "warmup size is %d and test data size is %d", + num_images, data_size)); + int words_shape = test_data[0][0].shape[0]; + PaddleTensor words; + words.name = "words"; + words.shape = {words_shape, 1}; + words.dtype = PaddleDType::INT64; + words.data.Resize(sizeof(int64_t) * words_shape); + + int target_shape = test_data[0][1].shape[0]; + PaddleTensor targets; + targets.name = "targets"; + targets.shape = {target_shape, 1}; + targets.dtype = PaddleDType::INT64; + targets.data.Resize(sizeof(int64_t) * target_shape); + + for (int i = 0; i < num_images; i++) { /* NOTE(review): both tensors are sized from sample 0 only, while the source and destination pointers below advance by i * shape; this looks safe only for num_images == 1 (the default) - confirm before passing a larger value. */ + std::copy_n( + static_cast(test_data[i][0].data.data()) + i * words_shape, + words_shape, + static_cast(words.data.data()) + i * words_shape); + words.lod = test_data[i][0].lod; + + std::copy_n( + static_cast(test_data[i][1].data.data()) + i * target_shape, + target_shape, + static_cast(targets.data.data()) + i * target_shape); + targets.lod = test_data[i][1].lod; + } + + auto warmup_data = std::make_shared>(2); + (*warmup_data)[0] = std::move(words); + (*warmup_data)[1] = std::move(targets); + return warmup_data; +} + template class TensorReader { public: @@ -210,7 +255,19 @@ TEST(Analyzer_lexical_test, Analyzer_lexical_analysis) { if (FLAGS_use_analysis) { AnalysisConfig analysis_cfg; SetAnalysisConfig(&analysis_cfg, FLAGS_cpu_num_threads); - if (FLAGS_enable_bf16) analysis_cfg.EnableMkldnnBfloat16(); + if (FLAGS_enable_bf16) { + analysis_cfg.EnableMkldnnBfloat16(); + } else if (FLAGS_enable_int8) { + if (FLAGS_fuse_multi_gru) + analysis_cfg.pass_builder()->AppendPass("multi_gru_fuse_pass"); + + std::shared_ptr> warmup_data = + WarmupData(input_slots_all); + analysis_cfg.EnableMkldnnQuantizer(); + analysis_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); + analysis_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize( + FLAGS_batch_size); + } std::vector acc_analysis(3); acc_analysis = Lexical_Test(input_slots_all, &outputs, &analysis_cfg, true); for (size_t i = 0; i <
acc_analysis.size(); i++) { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index dbc2acbed8367a949857bb56fb83fd592bffaa3f..8e6b8b197d7f24a76401a04afa02ba4c36712e5d 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -73,6 +73,8 @@ DEFINE_int32(warmup_iters, 1, "Number of batches to process during warmup."); DEFINE_bool(enable_profile, false, "Turn on profiler for fluid"); DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance."); +DEFINE_bool(fuse_multi_gru, false, + "Running the inference program with multi_gru_fuse_pass"); namespace paddle { namespace inference {