From fed0ed34e9ad221289e24d80f03834cbcd6ec16e Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Mon, 21 Nov 2022 12:08:53 +0100 Subject: [PATCH] add fc-residual quantization (#46917) * add fc-residual quantization * revert removal of check for use_mkldnn * fix bug * add disable_logs * review fix call twice AreScalesPresntForNodes instead of if-else * rewrite residual input to output * revert fc mkldnn taking residual data * format fix * fix LoDTensor->DenseTensor * LoDTensor->DenseTensor * output->input * revert changes to unsupported script revert changes to unsupported script * remove fc residualdata from output blocklist in cpu_bfloat16_pass.cc --- .../framework/ir/graph_pattern_detector.cc | 15 +-- .../framework/ir/mkldnn/cpu_bfloat16_pass.cc | 1 - .../framework/ir/mkldnn/cpu_quantize_pass.cc | 35 +++++- .../framework/ir/mkldnn/cpu_quantize_pass.h | 4 +- .../ir/mkldnn/cpu_quantize_squash_pass.cc | 7 +- .../fc_elementwise_add_mkldnn_fuse_pass.cc | 2 +- .../framework/ir/mkldnn/fc_mkldnn_pass.cc | 35 ++---- .../framework/ir/mkldnn/fc_mkldnn_pass.h | 1 - .../inference/api/paddle_pass_builder.cc | 1 + paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 113 ++++++++++-------- 10 files changed, 117 insertions(+), 97 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index cb131f8ec1..7f509d64b5 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1163,21 +1163,12 @@ PDNode *patterns::FCMKLDNN::operator()(bool with_residual_data) { if (with_residual_data) { auto res_fc_var = pattern->NewNode(residual_data_repr()) ->AsInput() - ->assert_is_op_input("fc") - // assert_is_op_input with two arguments doesn't work - // because ResidualData in FC is set as output with - // SetOutput so we do custom assert output - ->assert_more([&](Node *x) { - for (auto *op : x->outputs) - if (IsNthOutput(x, op, "ResidualData", 0)) - return true; - return false; - }); + ->assert_is_op_input("fc", "ResidualData"); links_from.push_back(res_fc_var); } else { fc_op->assert_more([&](Node *x) { - if (!HasOutput(x, "ResidualData") || - x->Op()->Output("ResidualData").size() == 0) + if (!HasInput(x, "ResidualData") || + x->Op()->Input("ResidualData").size() == 0) return true; return false; }); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index d64fbe16a3..ba8bacd200 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -200,7 +200,6 @@ class DeQuantizer final : public Quanter { std::unordered_map> block_list{ {"layer_norm", {"Mean", "Variance"}}, // not used in inference in MKLDNN - {"fc", {"ResidualData"}}, // artifical output, already dequantized {"matmul", {"ResidualData"}}, // artifical output, already dequantized {"matmul_v2", {"ResidualData"}}}; // artifical output, already dequantized diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index efdba4f44f..ac509aa604 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -515,16 +515,17 @@ void CPUQuantizePass::QuantizeConv(Graph* graph, ((with_residual_data) ? "with residual connection" : "")); } -void CPUQuantizePass::QuantizeFc(Graph* graph) const { +void CPUQuantizePass::QuantizeFc(Graph* graph, bool with_residual_data) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::FCMKLDNN fc_pattern{pattern, name_scope_}; - fc_pattern(false /* with_residual */); + fc_pattern(with_residual_data); int quantize_fc_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize fc op"; + VLOG(4) << "Quantize fc op " << (with_residual_data ? "with" : "without") + << " residual data"; GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern); // skip if should not be quantized @@ -532,6 +533,7 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const { LogQuantizationDisabled(fc); return; } + if (!fc->Op()->GetAttrIfExists("use_mkldnn")) { MarkAndLogCannotQuantizeOp(fc, "use_mkldnn attribute set to false"); return; @@ -546,6 +548,26 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const { return; } + if (with_residual_data) { + GET_IR_NODE_FROM_SUBGRAPH(residual_data, residual_data, fc_pattern); + if (!AreScalesPresentForNodes({residual_data})) { + MarkAndLogCannotQuantizeOp(fc, "No scale available for the operator"); + return; + } + + bool is_residual_unsigned{false}; + auto residual_scale = + GetScaleValueForNode(residual_data, &is_residual_unsigned); + + QuantizeInput(g, + fc, + residual_data, + "ResidualData", + residual_scale, + is_residual_unsigned, + "Scale_in_eltwise"); + } + bool is_input_unsigned{false}; auto input_scale = GetScaleValueForNode(input, &is_input_unsigned); QuantizeInput( @@ -576,7 +598,9 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const { gpd(graph, handler); AddStatis(quantize_fc_count); - LogQuantizedOpsCounter("fc", quantize_fc_count); + LogQuantizedOpsCounter("fc", + quantize_fc_count, + with_residual_data ? "with residual connection" : ""); } void CPUQuantizePass::QuantizePool(Graph* graph) const { @@ -1228,7 +1252,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizePool(graph); QuantizeConcat(graph); QuantizePriorBox(graph); - QuantizeFc(graph); + QuantizeFc(graph, false /* with_residual_data */); + QuantizeFc(graph, true /* with_residual_data */); QuantizeMatmul(graph, false /* with_residual_data */); QuantizeMatmul(graph, true /* with_residual_data */); QuantizeImmutable(graph, "reshape2", "X"); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 64f9b11ee9..b3c5312197 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -49,8 +49,8 @@ class CPUQuantizePass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const override; - void QuantizeConv(Graph* graph, bool with_residual_data = false) const; - void QuantizeFc(Graph* graph) const; + void QuantizeConv(Graph* graph, bool with_residual_data) const; + void QuantizeFc(Graph* graph, bool with_residual_data) const; void QuantizePool(Graph* graph) const; void QuantizeConcat(Graph* graph) const; void QuantizePriorBox(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index e0a64b2036..b0ccbb8aa9 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -337,7 +337,8 @@ void CPUQuantizeSquashPass::OpDequantSquash(Graph* graph) const { if (dequant_in->outputs.size() == 1) { if (any_op->Op()->Type() == "conv2d" || - any_op->Op()->Type() == "conv2d_transpose") { + any_op->Op()->Type() == "conv2d_transpose" || + any_op->Op()->Type() == "fc") { // do not squash if fuse residual connection is true // because residual fusion does not support force output with fp32 if (any_op->Op()->GetAttrIfExists("fuse_residual_connection")) @@ -418,8 +419,8 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { last_op_names.begin(), last_op_names.end(), quant_out->Name()), last_op_names.end()); last_op_names.push_back(first_quant_out->Name()); - last_op->Op()->SetInput(last_op_input_name, - std::vector(last_op_names)); + last_op_op->SetInput(last_op_input_name, + std::vector(last_op_names)); IR_NODE_LINK_TO(first_quant_out, last_op); GraphSafeRemoveNodes(graph, {quant_op, quant_out}); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc index 7b0951b9c7..9ddf9e161d 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc @@ -119,7 +119,7 @@ GraphWithStats FCResidualConnectionMKLDNNFusePass::FuseFC( return; } - fc_op->Op()->SetOutput("ResidualData", {residual_data->Name()}); + fc_op->Op()->SetInput("ResidualData", {residual_data->Name()}); fc_op->Op()->SetOutput("Out", {elementwise_out->Name()}); fc_op->Op()->SetAttr("fuse_residual_connection", true); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc index a2f8c14d1a..ceb73b0911 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc @@ -29,18 +29,16 @@ namespace ir { class Graph; -namespace { -void LogEnabledOps(const int counter, const std::string& details) { - std::string msg_ss{"--- enabled FC MKL-DNN for "}; - msg_ss += counter + " fc ops " + details; - string::PrettyLogDetail(msg_ss.c_str()); -} -} // namespace +void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::InvalidArgument( + "Pointer to graph argument should not be NULL.")); + Init("fc_mkldnn_pass", graph); -void FCMKLDNNPass::ApplyPass(ir::Graph* graph, bool with_residual) const { GraphPatternDetector gpd; patterns::FCMKLDNN fc_pattern(gpd.mutable_pattern(), "fc_mkldnn_pass"); - fc_pattern(with_residual); + // searching for fc+residual doesn't make sense at this stage + fc_pattern(false /*with_residual*/); int found_fc_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -79,19 +77,12 @@ void FCMKLDNNPass::ApplyPass(ir::Graph* graph, bool with_residual) const { AddStatis(found_fc_count); - LogEnabledOps(found_fc_count, - (with_residual ? "with residual connection" - : "without residual connection")); -} - -void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL(graph, - platform::errors::InvalidArgument( - "Pointer to graph argument should not be NULL.")); - Init("fc_mkldnn_pass", graph); - - ApplyPass(graph, true); - ApplyPass(graph, false); + if ((!Has("disable_logs") || !Get("disable_logs")) && + (found_fc_count > 0)) { + std::string msg_ss = "--- enabled FC MKL-DNN for " + + std::to_string(found_fc_count) + " fc ops "; + string::PrettyLogDetail(msg_ss.c_str()); + } } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h index 9367e08e7c..df02250394 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h @@ -34,7 +34,6 @@ class FCMKLDNNPass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const; - void ApplyPass(ir::Graph* graph, bool with_residual) const; }; } // namespace ir diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 19fd7279b9..062264222b 100755 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -439,6 +439,7 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.push_back("repeated_fc_relu_fuse_pass"); passes_.push_back("fc_mkldnn_pass"); passes_.push_back("fc_act_mkldnn_fuse_pass"); + passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass"); passes_.push_back("matmul_transpose_reshape_mkldnn_fuse_pass"); passes_.push_back("batch_norm_act_fuse_pass"); passes_.push_back("softplus_activation_mkldnn_fuse_pass"); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index a9d1e6e9d5..6a6704c094 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -103,15 +103,16 @@ class FCMKLDNNHandler dnnl::primitive_attr attributes; dnnl::post_ops post_operations; - std::vector output_shift_scale; - float scale = 1.0f; + float sum_scale = 1.0f; + float activation_scale = 1.0f; if (phi::funcs::is_int8()) { - std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx); + std::vector output_shift_scale; + std::tie(output_shift_scale, sum_scale, activation_scale) = + GetOutputScales(ctx); int mask = CreateMask(1, output_shift_scale.size() > 1); attributes.set_output_scales(mask, output_shift_scale); } - float sum_scale = 1.0f; if (ctx.HasAttr("fuse_residual_connection") && ctx.Attr("fuse_residual_connection")) { post_operations.append_sum(sum_scale); @@ -120,9 +121,9 @@ class FCMKLDNNHandler // ReLU from "fc_fuse_pass" if (ctx.Attr("activation_type") == "relu") { post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f); + activation_scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f); } - platform::AppendActivation(ctx, post_operations, scale); + platform::AppendActivation(ctx, post_operations, activation_scale); if (ctx.HasAttr("fused_output_scale")) { float scale_alpha = ctx.Attr("fused_output_scale"); @@ -136,18 +137,22 @@ class FCMKLDNNHandler // Compute the bias scales so that its values correspond to the // scale of data being an output of weights and input multiplication - std::vector ComputeBiasScales( - const float scale_in, const std::vector& scale_weights) { - std::vector bias_scales(scale_weights.size()); - - for (size_t i = 0; i < bias_scales.size(); ++i) { - if (scale_weights[i] == 0.0) - bias_scales[i] = 1.0f; - else - bias_scales[i] = scale_in * scale_weights[i]; + std::vector GetBiasScales(const framework::ExecutionContext& ctx) { + if (ctx.HasAttr("Bias_scales")) { + return ctx.Attr>("Bias_scales"); + } else { + const float scale_in = ctx.Attr("Scale_in"); + const auto& scale_weights = ctx.Attr>("Scale_weights"); + std::vector bias_scales(scale_weights.size()); + + for (size_t i = 0; i < bias_scales.size(); ++i) { + if (scale_weights[i] == 0.0) + bias_scales[i] = 1.0f; + else + bias_scales[i] = scale_in * scale_weights[i]; + } + return bias_scales; } - - return bias_scales; } // Correct output scale, to take into account scaling of input and weights @@ -155,32 +160,44 @@ class FCMKLDNNHandler // scaled with its own scales, this data needs to be divided by // those scales to normalise them back to what their floating-point range // was. Then we multiply them by desired output scale we want on the output. - std::tuple, float> ComputeOutputShiftScale( + std::tuple, float, float> GetOutputScales( const ExecutionContext& ctx) { - auto scale_in_data = ctx.Attr("Scale_in"); - auto scale_weights_data = ctx.Attr>("Scale_weights"); - bool has_activation = !ctx.Attr("activation_type").empty(); - bool force_fp32_output = ctx.Attr("force_fp32_output"); - - // If the output will be in floats, we don't multiply by scale_out. - - float scale = (!force_fp32_output && has_activation) - ? ctx.Attr("Scale_out") - : 1.0f; - float inner_scale = (force_fp32_output || has_activation) - ? 1.0f - : ctx.Attr("Scale_out"); - const size_t weight_scales_num = scale_weights_data.size(); - - for (size_t i = 0; i < weight_scales_num; ++i) { - if (scale_weights_data[i] == 0.0) - scale_weights_data[i] = inner_scale; - else - scale_weights_data[i] = - inner_scale / (scale_in_data * scale_weights_data[i]); + if (ctx.HasAttr("Sum_scale")) { + return std::make_tuple(ctx.Attr>("Output_shift_scale"), + ctx.Attr("Sum_scale"), + ctx.Attr("Activation_scale")); + } else { + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + bool has_activation = !ctx.Attr("activation_type").empty(); + bool force_fp32_output = ctx.Attr("force_fp32_output"); + bool fuse_residual_conn = ctx.HasAttr("fuse_residual_connection") && + ctx.Attr("fuse_residual_connection"); + auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise") + ? ctx.Attr("Scale_in_eltwise") + : 1.0f; + + // If the output will be in floats, we don't multiply by scale_out. + + float activation_scale = (!force_fp32_output && has_activation) + ? ctx.Attr("Scale_out") + : 1.0f; + float scale_out_data = (force_fp32_output || has_activation) + ? 1.0f + : ctx.Attr("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + const size_t weight_scales_num = scale_weights_data.size(); + + for (size_t i = 0; i < weight_scales_num; ++i) { + if (scale_weights_data[i] == 0.0) + scale_weights_data[i] = scale_out_data; + else + scale_weights_data[i] = + scale_out_data / (scale_in_data * scale_weights_data[i]); + } + return std::make_tuple(scale_weights_data, sum_scale, activation_scale); } - - return make_tuple(scale_weights_data, scale); } // Computing MKL-DNN's scaling mask which determines along which dimension @@ -240,9 +257,7 @@ class FCMKLDNNHandler } std::shared_ptr AcquireBiasMemoryWithReorder( - const phi::DenseTensor* bias, - const float scale_in, - const std::vector& scale_weights) { + const framework::ExecutionContext& ctx, const phi::DenseTensor* bias) { const float* bias_data = bias->data(); if (phi::funcs::is_int8() == false) { @@ -255,7 +270,7 @@ class FCMKLDNNHandler this->dev_ctx_.GetBlob(bias_key)); if (!memory_p) { - const auto& scale_data = ComputeBiasScales(scale_in, scale_weights); + const auto& scale_data = GetBiasScales(ctx); dnnl::primitive_attr attrs; int mask = CreateMask(0, scale_data.size() > 1); @@ -316,7 +331,7 @@ class FCMKLDNNHandler const ExecutionContext& ctx, phi::DenseTensor* out) { if (ctx.HasAttr("fuse_residual_connection") && ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Output("ResidualData"); + auto* residual_param = ctx.Input("ResidualData"); PADDLE_ENFORCE_EQ( out->dims(), @@ -393,7 +408,6 @@ class FCMKLDNNKernel : public framework::OpKernel { const auto* bias = ctx.Input("Bias"); auto out = ctx.Output("Out"); - const float scale_in = ctx.Attr("Scale_in"); const auto& scale_weights = ctx.Attr>("Scale_weights"); std::shared_ptr fc_p; @@ -430,7 +444,7 @@ class FCMKLDNNKernel : public framework::OpKernel { std::make_shared(inner_product_cache->dst_mem); if (ctx.HasAttr("fuse_residual_connection") && ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Output("ResidualData"); + auto* residual_param = ctx.Input("ResidualData"); out->ShareDataWith(*residual_param); } auto out_ptr = out->mutable_data( @@ -460,8 +474,7 @@ class FCMKLDNNKernel : public framework::OpKernel { dst_memory_p = handler.AcquireCustomDstMemory(ctx, out); if (bias) { - bias_memory_p = - handler.AcquireBiasMemoryWithReorder(bias, scale_in, scale_weights); + bias_memory_p = handler.AcquireBiasMemoryWithReorder(ctx, bias); } fc_p = handler.AcquireForwardPrimitive(); -- GitLab