diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index c7eb058701430df93a75f63cdfcf44164659339a..89f51bfa2a4b4edd796ae5db777a01a5f3a64fba 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -77,7 +77,9 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
                                      VarQuantScale* scales, bool are_unsigned,
                                      std::string scale_attr_name) const {
   auto inputs = op->inputs;
+  auto output = op->outputs[0];
   PADDLE_ENFORCE_GE(inputs.size(), 1);
+  PADDLE_ENFORCE_EQ(op->outputs.size(), 1);
 
   // create a quantize op desc prototype
   OpDesc q_desc;
@@ -86,13 +88,9 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
   std::vector<Node*> quantize_out_nodes(inputs.size());
   std::vector<std::string> quantize_out_node_names(inputs.size());
 
-  double scale_min = std::numeric_limits<double>::max();
-  for (const auto& input : inputs) {
-    double scale = (*scales)[input->Name()].second.data<double>()[0];
-    if (scale < scale_min) scale_min = scale;
-  }
+  double scale_out = (*scales)[output->Name()].second.data<double>()[0];
   unsigned max = are_unsigned ? U8_MAX : S8_MAX;
-  float scale = scale_min * max;
+  float scale = scale_out * max;
 
   for (size_t i = 0; i < inputs.size(); i++) {
     // Create quantize output variable
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
index d3c1fa7117f6fead0d24840a08d462e79251fac9..fea56f01cb5e665b7fab1c0c2068a0d9b91e89b3 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/inference/api/mkldnn_quantizer.h"
 #include <algorithm>
+#include <limits>
 #include <map>
 #include <memory>
 #include <numeric>
@@ -37,6 +38,7 @@ using framework::ir::Graph;
 using ConstEigenVectorArrayMap =
     Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
 using string::PrettyLogH1;
+static LoDTensor CreateScaleTensor(int64_t channels_num = 1);
 
 bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
   PrettyLogH1("--- Calculating scales for quantization");
@@ -52,7 +54,7 @@
     for (auto const& conn : connections) {
       for (const auto& var_name : conn.second) {
         // skip if scale already computed
-        if (scales_.find(var_name) != scales_.end()) return;
+        if (scales_.find(var_name) != scales_.end()) continue;
 
         auto* var = predictor_.sub_scope_->FindVar(var_name);
         PADDLE_ENFORCE(var, "%s is not in the scope", var_name);
@@ -62,29 +64,49 @@
 
         // force unsigned type if already know it
         bool is_unsigned = false;
-        if (is_output && op->Type() == "conv2d") {
-          // output of conv2d with relu must be unsigned
-          is_unsigned = (op->HasAttr("fuse_relu") &&
-                         boost::get<bool>(op->GetAttr("fuse_relu"))) ||
-                        (op->HasAttr("fuse_brelu") &&
-                         boost::get<bool>(op->GetAttr("fuse_brelu")));
-        } else if (is_output && op->Type() == "relu") {
-          is_unsigned = true;
-        } else if (is_output &&
-                   (op->Type() == "pool2d" || op->Type() == "transpose2" ||
-                    op->Type() == "reshape2" || op->Type() == "concat")) {
-          // output of ops with unsigned input must be unsigned
-          is_unsigned = true;
-          for (auto input_var_name : op->Input("X")) {
+        bool compute_scale = true;
+        if (is_output) {
+          if (op->Type() == "conv2d") {
+            // output of conv2d with relu must be unsigned
+            is_unsigned = (op->HasAttr("fuse_relu") &&
+                           boost::get<bool>(op->GetAttr("fuse_relu"))) ||
+                          (op->HasAttr("fuse_brelu") &&
+                           boost::get<bool>(op->GetAttr("fuse_brelu")));
+          } else if (op->Type() == "relu") {
+            is_unsigned = true;
+          } else if (op->Type() == "transpose2" ||
+                     op->Type() == "reshape2" || op->Type() == "pool2d") {
+            auto input_var_name = op->Input("X")[0];
             PADDLE_ENFORCE(scales_.find(input_var_name) != scales_.end(),
                            "Input scales must be calculated before the "
                            "output scales to infer if output is unsigned.");
-            is_unsigned = is_unsigned && scales_[input_var_name].first;
-          }
+            if (scales_.find(input_var_name) != scales_.end()) {
+              scales_[var_name] = scales_[input_var_name];
+            }
+            compute_scale = false;
+          } else if (op->Type() == "concat") {
+            // output of ops with unsigned input must be unsigned
+            is_unsigned = true;
+            double min_scale = std::numeric_limits<double>::max();
+            for (auto input_var_name : op->Input("X")) {
+              PADDLE_ENFORCE(
+                  scales_.find(input_var_name) != scales_.end(),
+                  "Input scales must be calculated before the "
+                  "output scales to infer if output is unsigned.");
+              is_unsigned = is_unsigned && scales_[input_var_name].first;
+              min_scale = std::min(
+                  min_scale,
+                  scales_[input_var_name].second.data<double>()[0]);
+            }
+            auto scale_tensor = CreateScaleTensor();
+            scale_tensor.data<double>()[0] = min_scale;
+            scales_[var_name] = {is_unsigned, scale_tensor};
+            compute_scale = false;
+          }
         }
-
-        CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor,
-                             is_unsigned);
+        if (compute_scale)
+          CalculateSingleScale(op->Type(), conn.first, var_name,
                               *var_tensor, is_unsigned);
       }
     }
   };
@@ -127,6 +149,13 @@
   }
 }
 
+static LoDTensor CreateScaleTensor(int64_t channels_num) {
+  LoDTensor scale_tensor;
+  scale_tensor.Resize({channels_num});
+  scale_tensor.mutable_data<double>(CPUPlace());
+  return scale_tensor;
+}
+
 std::vector<int> AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins(
     std::vector<int> quantized_bins, std::vector<int> reference_bins) const {
   std::vector<int> expanded_quantized_bins(reference_bins.size(), 0);
@@ -263,11 +292,8 @@
     min_kl_index = starting_iter;
   }
 
-  LoDTensor scale_tensor;
-  scale_tensor.Resize({1});
-  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
-
-  scale_ptr[0] = 1.0 / ((min_kl_index + 0.5) * bin_width);
+  LoDTensor scale_tensor = CreateScaleTensor();
+  scale_tensor.data<double>()[0] = 1.0 / ((min_kl_index + 0.5) * bin_width);
 
   return std::make_pair(is_unsigned, scale_tensor);
 }
@@ -285,10 +311,8 @@
       "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
       min_val);
 
-  LoDTensor scale_tensor;
-  scale_tensor.Resize({1});
-  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
-  scale_ptr[0] = 1.0 / max_abs;
+  LoDTensor scale_tensor = CreateScaleTensor();
+  scale_tensor.data<double>()[0] = 1.0 / max_abs;
 
   return std::make_pair(is_unsigned, scale_tensor);
 }
@@ -308,8 +332,7 @@
       min_val);
 
   int channels = var_tensor.dims()[0];
-  LoDTensor scale_tensor;
-  scale_tensor.Resize({channels});
+  LoDTensor scale_tensor = CreateScaleTensor(channels);
   auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
 
   for (int i = 0; i < channels; ++i) {