From 7c8c9b7d87b51838ae6ed379ac9bc3d5685b7bee Mon Sep 17 00:00:00 2001
From: leolishaohao <138780481+leolishaohao@users.noreply.github.com>
Date: Wed, 6 Sep 2023 16:13:28 +0800
Subject: [PATCH] [XPU] add squeeze_excitation_block_xpu op&pass to optimize
 ppocr_v3_det model (#56773)

* [XPU] add squeeze_excitation_block_xpu op&pass to optimize ppocr_v3_det
  model test=kunlun

* fix

* fix Codestyle

* remove xpu name
---
 paddle/fluid/framework/ir/CMakeLists.txt      |   6 +
 .../ir/xpu/squeeze_excitation_fuse_pass.cc    | 567 ++++++++++++++++++
 .../ir/xpu/squeeze_excitation_fuse_pass.h     |  82 +++
 .../xpu/squeeze_excitation_fuse_pass_test.cc  |  63 ++
 .../inference/api/paddle_pass_builder.cc      |   1 +
 paddle/phi/api/yaml/fused_ops.yaml            |  10 +
 paddle/phi/backends/xpu/xpu2_op_list.cc       |   1 +
 paddle/phi/infermeta/fusion.cc                |  25 +
 paddle/phi/infermeta/fusion.h                 |  10 +
 .../squeeze_excitation_block_xpu_kernel.cc    | 126 ++++
 .../test_xpu_squeeze_excitation_fuse_pass.py  | 163 +++++
 11 files changed, 1054 insertions(+)
 create mode 100644 paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.cc
 create mode 100644 paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.h
 create mode 100644 paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass_test.cc
 create mode 100644 paddle/phi/kernels/fusion/xpu/squeeze_excitation_block_xpu_kernel.cc
 create mode 100644 test/ir/inference/test_xpu_squeeze_excitation_fuse_pass.py

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index f856eca2aa5..b6143f335d1 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -290,6 +290,8 @@ if(WITH_XPU)
   pass_library(fast_where_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
   pass_library(fast_layernorm_xpu_fuse_pass inference DIR xpu DEPS
                ${XPU_PASS_DEPS})
+  pass_library(squeeze_excitation_fuse_pass inference DIR xpu DEPS
+               ${XPU_PASS_DEPS})
   pass_library(elementwise_mul_add_fuse_pass inference DIR xpu DEPS
                ${XPU_PASS_DEPS})
 endif()
@@ -615,4 +617,8 @@ if(WITH_XPU)
     test_fast_where_xpu_fuse_pass
     SRCS xpu/fast_where_xpu_fuse_pass_test.cc
     DEPS fast_where_xpu_fuse_pass)
+  cc_test(
+    test_squeeze_excitation_fuse_pass
+    SRCS xpu/squeeze_excitation_fuse_pass_test.cc
+    DEPS squeeze_excitation_fuse_pass)
 endif()
diff --git a/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.cc
new file mode 100644
index 00000000000..60bb47156a6
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.cc
@@ -0,0 +1,567 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.h"
+#include <string>
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+namespace math {
+
+template <typename T>
+static inline void Transpose(const T* in, T* out, int h, int w) {
+  for (int h1 = 0; h1 < w; ++h1) {
+    for (int w1 = 0; w1 < h; ++w1) {
+      out[h1 * h + w1] = in[w1 * w + h1];
+    }
+  }
+}
+
+}  // namespace math
+
+namespace patterns {
+
+struct SqueezeExcitationFusePattern : public PatternBase {
+  SqueezeExcitationFusePattern(PDPattern* pattern,
+                               const std::string& name_scope,
+                               const std::string& op_type,
+                               const std::string& act_type,
+                               bool with_branch,
+                               bool with_bias);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(pool2d);
+  PATTERN_DECL_NODE(mul_1);
+  PATTERN_DECL_NODE(mul_2);
+  PATTERN_DECL_NODE(ew_mul);
+  PATTERN_DECL_NODE(ew_branch_add);
+  PATTERN_DECL_NODE(block_act);
+
+  // declare variable node's name
+  PATTERN_DECL_NODE(x);
+  PATTERN_DECL_NODE(pool2d_out);
+  PATTERN_DECL_NODE(mul_1_w);
+  PATTERN_DECL_NODE(mul_1_w_max);
+  PATTERN_DECL_NODE(mul_1_bias);
+  PATTERN_DECL_NODE(mul_1_out);
+  PATTERN_DECL_NODE(mul_1_out_max);
+  PATTERN_DECL_NODE(mul_2_w);
+  PATTERN_DECL_NODE(mul_2_w_max);
+  PATTERN_DECL_NODE(mul_2_bias);
+  PATTERN_DECL_NODE(mul_2_out);
+  PATTERN_DECL_NODE(mul_2_out_max);
+  PATTERN_DECL_NODE(ew_mul_out);
+  PATTERN_DECL_NODE(ew_branch_add_in);
+  PATTERN_DECL_NODE(ew_branch_add_out);
+  PATTERN_DECL_NODE(block_act_out);
+};
+
+SqueezeExcitationFusePattern::SqueezeExcitationFusePattern(
+    PDPattern* pattern,
+    const std::string& name_scope,
+    const std::string& op_type,
+    const std::string& act_type,
+    bool with_branch,
+    bool with_bias)
+    : PatternBase(pattern, name_scope, name_scope) {
+  auto* x = pattern->NewNode(x_repr())
+                ->assert_is_op_input("pool2d", "X")
+                ->assert_is_op_input("elementwise_mul", "X")
+                ->AsInput();
+
+  auto pool2d_teller = [](const Node* x) {
+    auto* op_desc = x->Op();
+    bool has_adap = op_desc->HasAttr("adaptive");
+    if (has_adap) {
+      auto ksize =
+          PADDLE_GET_CONST(std::vector<int>, op_desc->GetAttr("ksize"));
+      if (ksize[0] != 1 || ksize[1] != 1) {
+        return false;
+      }
+    } else if (PADDLE_GET_CONST(bool, op_desc->GetAttr("global_pooling")) ==
+               false) {
+      return false;
+    }
+    return true;
+  };
+
+  auto* pool2d = pattern->NewNode(pool2d_repr())
+                     ->assert_is_op("pool2d")
+                     ->assert_op_attr<std::string>("pooling_type", "avg")
+                     ->assert_more(pool2d_teller);
+
+  auto* pool2d_out = pattern->NewNode(pool2d_out_repr())
+                         ->assert_is_op_output("pool2d", "Out")
+                         ->assert_is_op_input(op_type, "x");
+
+  auto mul_w_teller = [](const Node* x) {
+    auto* var_desc = x->Var();
+    auto filter_dims = var_desc->GetShape();
+    auto in_c = filter_dims[0];
+    auto out_c = filter_dims[1];
+    auto bigger = std::max(in_c, out_c);
+    auto smaller = std::min(in_c, out_c);
+    if (bigger % smaller != 0) {
+      return false;
+    }
+    return true;
+  };
+
+  auto* mul_1 = pattern->NewNode(mul_1_repr())->assert_is_op(op_type);
+  auto* mul_1_w = pattern->NewNode(mul_1_w_repr())
+                      ->assert_is_op_input(op_type, "filter")
+                      ->assert_more(mul_w_teller);
+  auto* mul_1_w_max = pattern->NewNode(mul_1_w_max_repr())
+                          ->assert_is_op_input(op_type, "filter_max");
+  auto* mul_1_out = pattern->NewNode(mul_1_out_repr())
+                        ->assert_is_op_output(op_type, "out")
+                        ->assert_is_op_input(op_type, "x");
+  auto* mul_1_out_max = pattern->NewNode(mul_1_out_max_repr())
+                            ->assert_is_op_output(op_type, "out_max");
+  auto* mul_2 = pattern->NewNode(mul_2_repr())->assert_is_op(op_type);
+  auto* mul_2_w = pattern->NewNode(mul_2_w_repr())
+                      ->assert_is_op_input(op_type, "filter")
+                      ->assert_more(mul_w_teller);
+  auto* mul_2_w_max = pattern->NewNode(mul_2_w_max_repr())
+                          ->assert_is_op_input(op_type, "filter_max");
+  auto* mul_2_out = pattern->NewNode(mul_2_out_repr())
+                        ->assert_is_op_output(op_type, "out")
+                        ->assert_is_op_input("elementwise_mul", "Y");
+  auto* mul_2_out_max = pattern->NewNode(mul_2_out_max_repr())
+                            ->assert_is_op_output(op_type, "out_max");
+
+  PDNode* mul_1_bias = nullptr;
+  PDNode* mul_2_bias = nullptr;
+  if (with_bias) {
+    mul_1_bias = pattern->NewNode(mul_1_bias_repr())
+                     ->assert_is_op_input(op_type, "bias");
+    mul_2_bias = pattern->NewNode(mul_2_bias_repr())
+                     ->assert_is_op_input(op_type, "bias");
+  }
+  auto* ew_mul =
+      pattern->NewNode(ew_mul_repr())->assert_is_op("elementwise_mul");
+  auto* ew_mul_out = pattern->NewNode(ew_mul_out_repr())
+                         ->assert_is_op_output("elementwise_mul", "Out");
+
+  // branch
+  PDNode* ew_branch_add_in = nullptr;
+  PDNode* ew_branch_add = nullptr;
+  PDNode* ew_branch_add_out = nullptr;
+  if (with_branch) {
+    ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr())
+                           ->assert_is_op_input("elementwise_add", "X")
+                           ->AsInput();
+    ew_branch_add =
+        pattern->NewNode(ew_branch_add_repr())->assert_is_op("elementwise_add");
+    ew_branch_add_out = pattern->NewNode(ew_branch_add_out_repr())
+                            ->assert_is_op_output("elementwise_add", "Out");
+  }
+  // act
+  PDNode* block_act = nullptr;
+  PDNode* block_act_out = nullptr;
+  if (act_type != "linear") {
+    block_act = pattern->NewNode(block_act_repr())->assert_is_op(act_type);
+    block_act_out = pattern->NewNode(block_act_out_repr())
+                        ->assert_is_op_output(act_type, "Out");
+  }
+
+  // pass
+  pool2d->LinksFrom({x}).LinksTo({pool2d_out});
+  mul_1->LinksFrom({mul_1_w, mul_1_w_max, pool2d_out})
+      .LinksTo({mul_1_out, mul_1_out_max});
+  mul_2->LinksFrom({mul_2_w, mul_2_w_max, mul_1_out})
+      .LinksTo({mul_2_out, mul_2_out_max});
+  ew_mul->LinksFrom({x, mul_2_out}).LinksTo({ew_mul_out});
+
+  if (with_branch) {
+    ew_mul_out->assert_is_op_input("elementwise_add", "Y");
+    ew_branch_add->LinksFrom({ew_mul_out, ew_branch_add_in})
+        .LinksTo({ew_branch_add_out});
+  } else {
+    ew_branch_add_out = ew_mul_out;
+  }
+  if (act_type != "linear") {
+    ew_branch_add_out->assert_is_op_input(act_type, "X");
+    block_act->LinksFrom({ew_branch_add_out}).LinksTo({block_act_out});
+  } else {
+    block_act_out = ew_branch_add_out;
+  }
+  if (with_bias) {
+    mul_1->LinksFrom({mul_1_bias});
+    mul_2->LinksFrom({mul_2_bias});
+  }
+}
+
+}  // namespace patterns
+
+void SqueezeExcitationFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph,
+      platform::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+
+  int found_subgraph_count = 0;
+  for (auto with_branch : {true, false}) {
+    for (auto with_bias : {true, false}) {
+      for (auto op_type : {"conv2d_xpu"}) {
+        for (auto act_type : {"relu",
+                              "sigmoid",
+                              "tanh",
+                              "leaky_relu",
+                              "hard_swish",
+                              "hard_sigmoid",
+                              "relu6",
+                              "linear"}) {
+          found_subgraph_count +=
+              ApplyImpl(graph, op_type, act_type, with_branch, with_bias);
+        }
+      }
+    }
+  }
+  AddStatis(found_subgraph_count);
+}
+
+int SqueezeExcitationFusePass::ApplyImpl(ir::Graph* graph,
+                                         const std::string& op_type,
+                                         const std::string& act_type,
+                                         bool with_branch,
+                                         bool with_bias) const {
+  GraphPatternDetector gpd;
+  patterns::SqueezeExcitationFusePattern pattern(gpd.mutable_pattern(),
+                                                 name_scope_,
+                                                 op_type,
+                                                 act_type,
+                                                 with_branch,
+                                                 with_bias);
+
+  int found_subgraph_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* graph) {
+    VLOG(4) << "handle SqueezeExcitationFusePass";
+    /* declare operator node's name */
+    GET_IR_NODE(pool2d);
+    GET_IR_NODE(mul_1);
+    GET_IR_NODE(mul_2);
+    GET_IR_NODE(ew_mul);
+    GET_IR_NODE(ew_branch_add);
+    GET_IR_NODE(block_act);
+    /* declare variable node's name */
+    GET_IR_NODE(x);
+    GET_IR_NODE(mul_1_w);
+    GET_IR_NODE(mul_1_w_max);
+    GET_IR_NODE(mul_1_bias);
+    GET_IR_NODE(mul_1_out);
+    GET_IR_NODE(mul_2_w);
+    GET_IR_NODE(mul_2_w_max);
+    GET_IR_NODE(mul_2_bias);
+    GET_IR_NODE(mul_2_out);
+    GET_IR_NODE(ew_mul_out);
+    GET_IR_NODE(ew_branch_add_in);
+    GET_IR_NODE(ew_branch_add_out);
+    GET_IR_NODE(block_act_out);
+
+    auto* block = pool2d->Op()->Block();
+    auto* scope = param_scope();
+    PADDLE_ENFORCE_NOT_NULL(
+        scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
+
+    framework::OpDesc fused_op_desc(block);
+    fused_op_desc.SetType("squeeze_excitation_block");
+    fused_op_desc.SetInput("x", {x->Name()});
+    if (with_branch) {
+      fused_op_desc.SetInput("branch", {ew_branch_add_in->Name()});
+    }
+    // filter
+    auto mul_1_w_name = mul_1_w->Name();
+    auto* mul_1_w_t =
+        scope->FindVar(mul_1_w_name)->GetMutable<phi::DenseTensor>();
+    auto mul_1_w_dims = mul_1_w_t->dims();
+    auto mul_1_w_len = mul_1_w_t->numel();
+    int16_t* mul_1_w_ptr = mul_1_w_t->data<int16_t>();
+    auto* mul_2_w_t =
+        scope->FindVar(mul_2_w->Name())->GetMutable<phi::DenseTensor>();
+    auto mul_2_w_dims = mul_2_w_t->dims();
+    auto mul_2_w_len = mul_2_w_t->numel();
+    int16_t* mul_2_w_ptr = mul_2_w_t->data<int16_t>();
+    if (mul_1_w_dims[0] != mul_2_w_dims[1] ||
+        mul_1_w_dims[1] != mul_2_w_dims[0] ||
+        mul_1_w_len != mul_1_w_dims[0] * mul_1_w_dims[1]) {
+      LOG(FATAL) << "Error: dims of excitation mul1 weight are: "
+                 << mul_1_w_dims
+                 << ", but dims of excitation mul2 weight are: "
+                 << mul_2_w_dims;
+    }
+    std::vector<int16_t> encode_filter_int16;
+    encode_filter_int16.resize(mul_1_w_len + mul_2_w_len);
+
+    PADDLE_ENFORCE_EQ(mul_1_w_dims[1] % mul_1_w_dims[0] == 0,
+                      true,
+                      platform::errors::InvalidArgument(
+                          "Reduction ratio of excitation is not an integer. "
+                          "Received mul_1_w_dims[1]: %d, mul_1_w_dims[0]: %d",
+                          mul_1_w_dims[1],
+                          mul_1_w_dims[0]));
+    fused_op_desc.SetAttr(
+        "filter_dims",
+        std::vector<int>{static_cast<int>(mul_1_w_dims[1] / mul_1_w_dims[0]),
+                         static_cast<int>(mul_1_w_dims[1])});
+
+    paddle::framework::ir::math::Transpose<int16_t>(mul_1_w_ptr,
+                                                    encode_filter_int16.data(),
+                                                    mul_1_w_dims[0],
+                                                    mul_1_w_dims[1]);
+    paddle::framework::ir::math::Transpose<int16_t>(
+        mul_2_w_ptr,
+        encode_filter_int16.data() + mul_1_w_len,
+        mul_2_w_dims[0],
+        mul_2_w_dims[1]);
+
+    std::string new_filter_name = "se_" + mul_1_w_name;
+    Node* new_filter_node = nullptr;
+    VarDesc dst_desc(new_filter_name);
+    dst_desc.SetPersistable(true);
+    dst_desc.SetShape({mul_1_w_len + mul_2_w_len});
+    dst_desc.SetDataType(framework::TransToProtoVarType(mul_1_w_t->dtype()));
+    new_filter_node = graph->CreateVarNode(&dst_desc);
+    auto* block_dst_desc = block->Var(new_filter_name);
+    block_dst_desc->SetPersistable(dst_desc.Persistable());
+    block_dst_desc->SetShape(dst_desc.GetShape());
+    block_dst_desc->SetDataType(dst_desc.GetDataType());
+
+    phi::DenseTensor new_filter_t;
+    new_filter_t.Resize(DDim({mul_1_w_len + mul_2_w_len}));
+    new_filter_t.set_type(phi::DataType::INT16);
+    auto* cpu_ctx = static_cast<phi::CPUContext*>(
+        platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+    auto* new_filter_data = cpu_ctx->Alloc<int16_t>(&new_filter_t);
+
+    memcpy(new_filter_data,
+           encode_filter_int16.data(),
+           (mul_1_w_len + mul_2_w_len) * sizeof(int16_t));
+
+    Assign(new_filter_t,
+           scope->Var(new_filter_name)->GetMutable<phi::DenseTensor>());
+    fused_op_desc.SetInput("filter", {new_filter_name});
+
+    // filter max
+    std::vector<float> encode_filter_max;
+    int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1);
+    int filter_max_size = max_ptr_size + max_ptr_size;
+    encode_filter_max.resize(filter_max_size);
+
+    auto mul_1_w_max_name = mul_1_w_max->Name();
+    auto mul_2_w_max_name = mul_2_w_max->Name();
+    auto* mul_1_w_max_t =
+        scope->FindVar(mul_1_w_max_name)->GetMutable<phi::DenseTensor>();
+    auto* mul_2_w_max_t =
+        scope->FindVar(mul_2_w_max_name)->GetMutable<phi::DenseTensor>();
+
+    float* mul_1_w_max_ptr = mul_1_w_max_t->data<float>();
+    float* mul_2_w_max_ptr = mul_2_w_max_t->data<float>();
+    memcpy(encode_filter_max.data(),
+           mul_1_w_max_ptr,
+           max_ptr_size * sizeof(float));
+    memcpy(encode_filter_max.data() + max_ptr_size,
+           mul_2_w_max_ptr,
+           max_ptr_size * sizeof(float));
+
+    std::string new_filter_max_name = new_filter_name + "_max";
+    Node* new_filter_max_node = nullptr;
+    VarDesc filter_max_desc(new_filter_max_name);
+    filter_max_desc.SetPersistable(true);
+    filter_max_desc.SetShape({filter_max_size});
+    filter_max_desc.SetDataType(
+        framework::TransToProtoVarType(mul_1_w_max_t->dtype()));
+    new_filter_max_node = graph->CreateVarNode(&filter_max_desc);
+    auto* block_filter_max_desc = block->Var(new_filter_max_name);
+    block_filter_max_desc->SetPersistable(filter_max_desc.Persistable());
+    block_filter_max_desc->SetShape(filter_max_desc.GetShape());
+    block_filter_max_desc->SetDataType(filter_max_desc.GetDataType());
+
+    phi::DenseTensor new_filter_max_t;
+    new_filter_max_t.Resize(DDim({filter_max_size}));
+    new_filter_max_t.set_type(phi::DataType::FLOAT32);
+    auto* new_filter_max_data = cpu_ctx->Alloc<float>(&new_filter_max_t);
+
+    memcpy(new_filter_max_data,
+           encode_filter_max.data(),
+           (filter_max_size) * sizeof(float));
+
+    Assign(new_filter_max_t,
+           scope->Var(new_filter_max_name)->GetMutable<phi::DenseTensor>());
+
+    fused_op_desc.SetInput("filter_max", {new_filter_max_name});
+
+    // bias
+    std::string new_bias_name = new_filter_name + "_bias";
+    VarDesc new_bias_desc(new_bias_name);
+    new_bias_desc.SetPersistable(true);
+    new_bias_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32);
+    Node* new_bias_node = graph->CreateVarNode(&new_bias_desc);
+    if (with_bias) {
+      auto mul_1_bias_name = mul_1_bias->Name();
+      auto mul_2_bias_name = mul_2_bias->Name();
+      auto* mul_1_bias_t =
+          scope->FindVar(mul_1_bias_name)->GetMutable<phi::DenseTensor>();
+      auto* mul_2_bias_t =
+          scope->FindVar(mul_2_bias_name)->GetMutable<phi::DenseTensor>();
+      int mul_1_bias_numel = mul_1_bias_t->numel();
+      int mul_2_bias_numel = mul_2_bias_t->numel();
+
+      std::vector<float> encode_bias;
+      encode_bias.resize(mul_1_bias_numel + mul_2_bias_numel);
+      float* mul_1_bias_ptr = mul_1_bias_t->data<float>();
+      float* mul_2_bias_ptr = mul_2_bias_t->data<float>();
+
+      memcpy(
+          encode_bias.data(), mul_1_bias_ptr, mul_1_bias_numel * sizeof(float));
+      memcpy(encode_bias.data() + mul_1_bias_numel,
+             mul_2_bias_ptr,
+             mul_2_bias_numel * sizeof(float));
+
+      new_bias_desc.SetShape({mul_1_bias_numel + mul_2_bias_numel});
+      auto* block_new_bias_dst_desc = block->Var(new_bias_name);
+      block_new_bias_dst_desc->SetPersistable(new_bias_desc.Persistable());
+      block_new_bias_dst_desc->SetShape(new_bias_desc.GetShape());
+      block_new_bias_dst_desc->SetDataType(new_bias_desc.GetDataType());
+
+      phi::DenseTensor new_bias_t;
+      new_bias_t.Resize(DDim({mul_1_bias_numel + mul_2_bias_numel}));
+      new_bias_t.set_type(phi::DataType::FLOAT32);
+      auto* cpu_ctx = static_cast<phi::CPUContext*>(
+          platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+      auto* new_bias_data = cpu_ctx->Alloc<float>(&new_bias_t);
+
+      memcpy(new_bias_data,
+             encode_bias.data(),
+             (mul_1_bias_numel + mul_2_bias_numel) * sizeof(float));
+      Assign(new_bias_t,
+             scope->Var(new_bias_name)->GetMutable<phi::DenseTensor>());
+      fused_op_desc.SetInput("bias", {new_bias_name});
+    }
+    fused_op_desc.SetAttr("has_bias", with_bias);
+    fused_op_desc.SetAttr("has_branch", with_branch);
+    std::string output_name;
+    if (act_type != "linear") {
+      output_name = block_act_out->Name();
+    } else if (with_branch) {
+      output_name = ew_branch_add_out->Name();
+    } else {
+      output_name = ew_mul_out->Name();
+    }
+    fused_op_desc.SetOutput("out", {output_name});
+    std::string max_output_name = output_name + "_max";
+    VarDesc max_out_desc(max_output_name);
+    auto* max_output_node = graph->CreateVarNode(&max_out_desc);
+
+    fused_op_desc.SetOutput("out_max", {max_output_name});
+    fused_op_desc.SetAttr("op_type", std::vector<int>{4});
+    fused_op_desc.SetAttr("place_x", std::vector<int>{0});
+    fused_op_desc.SetAttr("place_y", std::vector<int>{9});
+    fused_op_desc.SetAttr("place_z", std::vector<int>{10});
+    fused_op_desc.SetAttr("strides", std::vector<int>{});
+    fused_op_desc.SetAttr("paddings", std::vector<int>{});
+    fused_op_desc.SetAttr("dilations", std::vector<int>{});
+    fused_op_desc.SetAttr("groups", std::vector<int>{});
+    fused_op_desc.SetAttr("block_lod", std::vector<int>{1});
+    fused_op_desc.SetAttr("conv_bias", std::vector<int>{with_bias});
+
+    std::map<std::string, int> act_map{{"linear", 0},
+                                       {"relu", 1},
+                                       {"sigmoid", 2},
+                                       {"tanh", 3},
+                                       {"leaky_relu", 5},
+                                       {"hard_swish", 14},
+                                       {"hard_sigmoid", 15},
+                                       {"relu6", 17}};
+
+    float block_act_param_ = 0.f;
+    if (act_type == "leaky_relu") {
+      block_act_param_ =
+          PADDLE_GET_CONST(float, block_act->Op()->GetAttr("alpha"));
+    } else if (act_type == "hard_sigmoid") {
+      block_act_param_ =
+          PADDLE_GET_CONST(float, block_act->Op()->GetAttr("slope"));
+    }
+    fused_op_desc.SetAttr(
+        "act_type",
+        std::vector<int>{
+            PADDLE_GET_CONST(int, mul_1->Op()->GetAttr("act_type")),
+            PADDLE_GET_CONST(int, mul_2->Op()->GetAttr("act_type")),
+            act_map[act_type]});
+
+    fused_op_desc.SetAttr(
+        "act_param",
+        std::vector<float>{
+            PADDLE_GET_CONST(float, mul_1->Op()->GetAttr("act_param")),
+            PADDLE_GET_CONST(float, mul_2->Op()->GetAttr("act_param")),
+            block_act_param_});
+
+    auto* new_op_node = graph->CreateOpNode(&fused_op_desc);
+    IR_NODE_LINK_TO(x, new_op_node);
+    if (with_branch) {
+      IR_NODE_LINK_TO(ew_branch_add_in, new_op_node);
+    }
+    IR_NODE_LINK_TO(new_filter_node, new_op_node);
+    IR_NODE_LINK_TO(new_filter_max_node, new_op_node);
+
+    if (with_bias) {
+      IR_NODE_LINK_TO(new_bias_node, new_op_node);
+    }
+
+    if (act_type != "linear") {
+      IR_NODE_LINK_TO(new_op_node, block_act_out);
+    } else if (with_branch) {
+      IR_NODE_LINK_TO(new_op_node, ew_branch_add_out);
+    } else {
+      IR_NODE_LINK_TO(new_op_node, ew_mul_out);
+    }
+    IR_NODE_LINK_TO(new_op_node, max_output_node);
+    // delete useless nodes
+    std::unordered_set<const Node*> delete_nodes = {
+        pool2d, mul_1, mul_1_out, mul_2, mul_2_out, ew_mul};
+    if (with_bias) {
+      delete_nodes.insert(mul_1_bias);
+      delete_nodes.insert(mul_2_bias);
+    }
+    if (with_branch) {
+      delete_nodes.insert(ew_branch_add);
+    }
+    GraphSafeRemoveNodes(graph, delete_nodes);
+    found_subgraph_count++;
+  };
+
+  gpd(graph, handler);
+  return found_subgraph_count;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(squeeze_excitation_fuse_pass,
+              paddle::framework::ir::SqueezeExcitationFusePass);
+
+REGISTER_PASS_CAPABILITY(squeeze_excitation_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "squeeze_excitation_block", 0));
diff --git a/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.h b/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.h
new file mode 100644
index 00000000000..72249f6fdc0
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+Squeeze-and-Excitation Block Fusion for SE-ResNet
+Origin subgraph
+        Input
+          |    \
+          |     \
+          |      \
+          |       |
+          |  Global Pooling
+          |       |
+          |   conv2d_xpu
+          |       |
+          |       |
+          |   conv2d_xpu
+          \       |
+           \      |
+         elementwise_mul
+                |
+             Output
+------------------------------------------------------
+After the pass is applied:
+
+                      in_Input
+        in_Filter     |      in_FilterMax
+                  \   |     /
+                   \  |    /
+  in_Branch ------- squeeze_excitation_block ------ in_Bias
+                      |
+                      |
+                      |
+                  out_Output
+*/
+class SqueezeExcitationFusePass : public FusePassBase {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+
+ private:
+  int ApplyImpl(ir::Graph* graph,
+                const std::string& op_type,
+                const std::string& act_type,
+                bool with_branch,
+                bool with_bias) const;
+
+  const std::string name_scope_{"squeeze_excitation_fuse_pass"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass_test.cc b/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass_test.cc
new file mode 100644
index 00000000000..9442049220d
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/squeeze_excitation_fuse_pass_test.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/pass_tester_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+TEST(SqueezeExcitationFusePass, V1) {
+  Layers layers;
+  auto* block = layers.Block();
+
+  auto* pool2d_inp = layers.data("pool2d_inp", {1, 24, 14, 14});
+  auto* pool2d_out = layers.pool2d(pool2d_inp, false);
+
+  auto* conv2d_xpu_op1_out = layers.data("conv2d_xpu_op1_out");
+  OpDesc* conv2d_xpu_op1 = block->AppendOp();
+  conv2d_xpu_op1->SetType("conv2d_xpu");
+  conv2d_xpu_op1->SetInput("x", {pool2d_out->Name()});
+  conv2d_xpu_op1->SetOutput("out", {conv2d_xpu_op1_out->Name()});
+
+  auto* conv2d_xpu_op2_out = layers.data("conv2d_xpu_op2_out");
+  OpDesc* conv2d_xpu_op2 = block->AppendOp();
+  conv2d_xpu_op2->SetType("conv2d_xpu");
+  conv2d_xpu_op2->SetInput("x", {conv2d_xpu_op1_out->Name()});
+  conv2d_xpu_op2->SetOutput("out", {conv2d_xpu_op2_out->Name()});
+
+  layers.elementwise_mul(pool2d_inp, conv2d_xpu_op2_out);
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
+  auto pass = PassRegistry::Instance().Get("squeeze_excitation_fuse_pass");
+  pass->Apply(graph.get());
+  auto num = GetNumOpNodes(graph, "pool2d") +
+             GetNumOpNodes(graph, "conv2d_xpu") +
+             GetNumOpNodes(graph, "elementwise_mul");
+  PADDLE_ENFORCE_EQ(num,
+                    0,
+                    platform::errors::PreconditionNotMet(
+                        "pool2d/conv2d_xpu/elementwise_mul ops should be "
+                        "removed from the graph, but the graph "
+                        "still has %d such ops.",
+                        num));
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(squeeze_excitation_fuse_pass);
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 0c5423fe4d9..ba71eff1738 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -547,6 +547,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
       "fc_xpu_fuse_pass",
       "conv2d_xpu_fuse_pass",
       "conv2d_transpose_xpu_fuse_pass",
+      "squeeze_excitation_fuse_pass",
       "add_activation_xpu_fuse_pass",
       "add_layernorm_xpu_fuse_pass",
       "fast_layernorm_xpu_fuse_pass",
diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml
index 53da3a65975..ac19987119d 100644
--- a/paddle/phi/api/yaml/fused_ops.yaml
+++ b/paddle/phi/api/yaml/fused_ops.yaml
@@ -208,6 +208,16 @@
     data_type : input
   optional : bias_qk

+- op : squeeze_excitation_block
+  args : (Tensor x, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] act_type, float[] act_param, int[] filter_dims)
+  output : Tensor(out)
+  infer_meta :
+    func : SqueezeExcitationInferMeta
+  kernel :
+    func : squeeze_excitation_block
+    data_type : x
+  optional : bias, branch
+
 - op : yolo_box_xpu
   args : (Tensor x, Tensor x_max, Tensor grid, Tensor stride, Tensor anchor_grid, float offset)
   output : Tensor(out), Tensor(out_max)
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index d52769723e3..41e69df801e 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -1005,6 +1005,7 @@ XPUOpMap& get_kl2_ops() {
      {"sequence_conv_grad", XPUKernelSet({phi::DataType::FLOAT32})},
      {"sequence_unpad", XPUKernelSet({phi::DataType::FLOAT32})},
      // Fused op
+     {"squeeze_excitation_block", XPUKernelSet({phi::DataType::FLOAT32})},
      {"resnet_basic_block_grad", XPUKernelSet({phi::DataType::FLOAT32})},
      {"resnet_basic_block", XPUKernelSet({phi::DataType::FLOAT32})},
      {"fused_gemm_epilogue",
diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc
index 7a10fd752ac..49ec2405051 100644
--- a/paddle/phi/infermeta/fusion.cc
+++ b/paddle/phi/infermeta/fusion.cc
@@ -964,4 +964,29 @@ void FusedScaleBiasReluConvBnstatsInferMeta(
   eq_bias->set_dims(c_dims);
 }

+void SqueezeExcitationInferMeta(const MetaTensor& x,
+                                const MetaTensor& filter,
+                                const MetaTensor& filter_max,
+                                const MetaTensor& bias,
+                                const MetaTensor& branch,
+                                const std::vector<int>& act_type,
+                                const std::vector<float>& act_param,
+                                const std::vector<int>& filter_dims,
+                                MetaTensor* out) {
+  auto in_dims = x.dims();
+  // do some checks
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(),
+      4,
+      phi::errors::InvalidArgument(
+          "The input should be a 4-D Tensor. But "
+          "received: input's dimension is %u, input's shape is [%s].",
+          in_dims.size(),
+          in_dims));
+  std::vector<int64_t> out_shape(
+      {in_dims[0], filter_dims[1], in_dims[2], in_dims[3]});
+  // set output dims
+  out->set_dims(DDim(out_shape.data(), out_shape.size()));
+}
+
 }  // namespace phi
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
index ee41d55ca55..dd5fcfcbf85 100644
--- a/paddle/phi/infermeta/fusion.h
+++ b/paddle/phi/infermeta/fusion.h
@@ -234,4 +234,14 @@ void FusedScaleBiasReluConvBnstatsInferMeta(
     MetaTensor* eq_scale,
     MetaTensor* eq_bias);

+void SqueezeExcitationInferMeta(const MetaTensor& x,
+                                const MetaTensor& filter,
+                                const MetaTensor& filter_max,
+                                const MetaTensor& bias,
+                                const MetaTensor& branch,
+                                const std::vector<int>& act_type,
+                                const std::vector<float>& act_param,
+                                const std::vector<int>& filter_dims,
+                                MetaTensor* out);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/fusion/xpu/squeeze_excitation_block_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/squeeze_excitation_block_xpu_kernel.cc
new file mode 100644
index 00000000000..c3ded28ecad
--- /dev/null
+++ b/paddle/phi/kernels/fusion/xpu/squeeze_excitation_block_xpu_kernel.cc
@@ -0,0 +1,126 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "glog/logging.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace fusion {
+
+template <typename T_X, typename T_W, typename Context>
+void SqueezeExcitationKernelImpl(const Context& ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& filter,
+                                 const DenseTensor& filter_max,
+                                 const paddle::optional<DenseTensor>& bias,
+                                 const paddle::optional<DenseTensor>& branch,
+                                 const std::vector<int>& act_type,
+                                 const std::vector<float>& act_param,
+                                 const std::vector<int>& filter_dims,
+                                 DenseTensor* out) {
+  using XPUTypeX = typename XPUTypeTrait<T_X>::Type;
+  using XPUTypeW = typename XPUTypeTrait<T_W>::Type;
+
+  auto* weight1_ptr = filter.data<T_W>();
+  auto weight_len = filter.numel();
+  auto weight1_len = weight_len / 2;
+  auto* weight2_ptr = weight1_ptr + weight1_len;
+
+  auto input_dims = x.dims();
+
+  int batch = static_cast<int>(input_dims[0]);
+  int channel = static_cast<int>(input_dims[1]);
+  int h = static_cast<int>(input_dims[2]);
+  int w = static_cast<int>(input_dims[3]);
+  auto* input_data = reinterpret_cast<const XPUTypeX*>(x.data<T_X>());
+  const XPUTypeX* branch_data = nullptr;
+  auto* branch_tensor = branch.get_ptr();
+  if (branch_tensor != nullptr) {
+    branch_data =
+        reinterpret_cast<const XPUTypeX*>(branch_tensor->data<T_X>());
+  }
+  const float* bias1_ptr =
+      bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data<float>();
+  const float* bias2_ptr = (bias1_ptr != nullptr)
+                               ? (bias1_ptr + filter_dims[1] / filter_dims[0])
+                               : nullptr;
+  int max_ptr_size = 6;
+  const float* w1_maxptr = filter_max.data<float>();
+  const float* w2_maxptr = w1_maxptr + max_ptr_size;
+  auto* out_data =
+      reinterpret_cast<XPUTypeX*>(ctx.template Alloc<T_X>(out));
+
+  std::vector<xpu::Activation_t> act;
+  for (size_t i = 0; i < 3; i++) {
+    xpu::Activation_t cur_act = (xpu::Activation_t::act_enum)act_type[i];
+    if (act_type[i] == 5) {
+      cur_act.leaky_alpha = act_param[i];
+    } else if (act_type[i] == 15) {
+      cur_act.hard_sigmoid_slope = act_param[i];
+    }
+    act.push_back(cur_act);
+  }
+  int r = xpu::squeeze_excitation_block(
+      /* baidu::xpu::api::Context* ctx */ ctx.x_context(),
+      /* const T* x */ input_data,
+      /* const TW* weight1 */ reinterpret_cast<const XPUTypeW*>(weight1_ptr),
+      /* const TW* weight2 */ reinterpret_cast<const XPUTypeW*>(weight2_ptr),
+      /* T* y */ out_data,
+      /* int64_t n */ batch,
+      /* int64_t c */ channel,
+      /* int64_t h */ h,
+      /* int64_t w */ w,
+      /* int64_t r */ filter_dims[0],
+      /* const float* w1_maxptr */ w1_maxptr,
+      /* const float* w2_maxptr */ w2_maxptr,
+      /* const float* bias1 */ bias1_ptr,
+      /* const float* bias2 */ bias2_ptr,
+      /* const T* branch */ branch_data,
+      /* const Activation_t& excitation_act1 */ act[0],
+      /* const Activation_t& excitation_act2 */ act[1],
+      /* const Activation_t& block_act */ act[2]);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "squeeze_excitation_block");
+}
+
+template <typename T, typename Context>
+void SqueezeExcitationKernel(const Context& ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& filter,
+                             const DenseTensor& filter_max,
+                             const paddle::optional<DenseTensor>& bias,
+                             const paddle::optional<DenseTensor>& branch,
+                             const std::vector<int>& act_type,
+                             const std::vector<float>& act_param,
+                             const std::vector<int>& filter_dims,
+                             DenseTensor* out) {
+  SqueezeExcitationKernelImpl<T, int16_t, Context>(ctx,
+                                                   x,
+                                                   filter,
+                                                   filter_max,
+                                                   bias,
+                                                   branch,
+                                                   act_type,
+                                                   act_param,
+                                                   filter_dims,
+                                                   out);
+}
+
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(squeeze_excitation_block,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::fusion::SqueezeExcitationKernel,
+                   float) {}
diff --git a/test/ir/inference/test_xpu_squeeze_excitation_fuse_pass.py b/test/ir/inference/test_xpu_squeeze_excitation_fuse_pass.py
new file mode 100644
index 00000000000..de32b373078
--- /dev/null
+++ b/test/ir/inference/test_xpu_squeeze_excitation_fuse_pass.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from functools import partial
+
+import hypothesis.strategies as st
+import numpy as np
+from auto_scan_test import PassAutoScanTest
+from program_config import OpConfig, ProgramConfig, TensorConfig
+
+
+class TestSqueezeExcitationFusePass(PassAutoScanTest):
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_xpu=True)
+        yield config, ["squeeze_excitation_block"], (1e-3, 1e-3)
+
+    def sample_program_config(self, draw):
+        def generate_data(shape):
+            return np.random.random(shape).astype(np.float32)
+
+        x_shape = draw(
+            st.lists(
+                st.integers(min_value=1, max_value=12), min_size=4, max_size=4
+            )
+        )
+        x_shape[1] = 24
+
+        oc = 6
+        conv2d_op1_w_shape = [oc, x_shape[1], 1, 1]
+        conv2d_op1_b_shape = [oc]
+        conv2d_op2_w_shape = [x_shape[1], oc, 1, 1]
+        conv2d_op2_b_shape = [x_shape[1]]
+
+        # randomly choose whether to append a relu after each conv2d
+        has_relu = draw(st.sampled_from([True, False]))
+
+        pool2d_op = OpConfig(
+            type="pool2d",
+            inputs={"X": ["pool2d_x"]},
+            outputs={"Out": ["pool2d_out"]},
+            adaptive=True,
+            data_format="NCHW",
+            global_pooling=False,
+            ksize=[1, 1],
+            pooling_type="avg",
+        )
+        ops = [pool2d_op]
+
+        conv2d_op = OpConfig(
+            "conv2d",
+            inputs={
+                "Input": ["pool2d_out"],
+                "Filter": ["conv2d_weight"],
+            },
+            outputs={"Output": ["conv2d_out"]},
+            data_format="NCHW",
+            dilations=[1, 1],
+            padding_algorithm="EXPLICIT",
+            groups=1,
+            paddings=[0, 0, 0, 0],
+            strides=[1, 1],
+            has_bias=False,
+        )
+
+        ew_bias_op = OpConfig(
+            "elementwise_add",
+            inputs={"X": ["conv2d_out"], "Y": ["ew_bias"]},
+            outputs={"Out": ["add_out"]},
+            axis=1,
+        )
+        ops.extend([conv2d_op, ew_bias_op])
+        conv2d_input = "add_out"
+        # optional activation after the first conv2d
+        if has_relu:
+            relu_op = OpConfig(
+                "relu", inputs={"X": ["add_out"]}, outputs={"Out": ["relu_out"]}
+            )
+            conv2d_input = "relu_out"
+            ops.append(relu_op)
+
+        conv2d_op2 = OpConfig(
+            "conv2d",
+            inputs={
+                "Input": [conv2d_input],
+                "Filter": ["conv2d_weight2"],
+            },
+            outputs={"Output": ["conv2d_out2"]},
+            data_format="NCHW",
+            dilations=[1, 1],
+            padding_algorithm="EXPLICIT",
+            groups=1,
+            paddings=[0, 0, 0, 0],
+            strides=[1, 1],
+            has_bias=False,
+        )
+
+        ew_bias_op2 = OpConfig(
+            "elementwise_add",
+            inputs={"X": ["conv2d_out2"], "Y": ["ew_bias2"]},
+            outputs={"Out": ["add_out2"]},
+            axis=1,
+        )
+        ops.extend([conv2d_op2, ew_bias_op2])
+        ele_mul_input = "add_out2"
+        # optional activation after the second conv2d
+        if has_relu:
+            relu_op2 = OpConfig(
+                "relu",
+                inputs={"X": ["add_out2"]},
+                outputs={"Out": ["relu_out2"]},
+            )
+            ele_mul_input = "relu_out2"
+            ops.append(relu_op2)
+
+        ew_mul_op = OpConfig(
+            "elementwise_mul",
+            inputs={"X": ["pool2d_x"], "Y": [ele_mul_input]},
+            outputs={"Out": ["ew_mul_out"]},
+            axis=-1,
+        )
+        ops.append(ew_mul_op)
+
+        program_config = ProgramConfig(
+            ops=ops,
+            weights={
+                "conv2d_weight": TensorConfig(
+                    data_gen=partial(generate_data, conv2d_op1_w_shape)
+                ),
+                "ew_bias": TensorConfig(shape=conv2d_op1_b_shape),
+                "conv2d_weight2": TensorConfig(
+                    data_gen=partial(generate_data, conv2d_op2_w_shape)
+                ),
+                "ew_bias2": TensorConfig(shape=conv2d_op2_b_shape),
+            },
+            inputs={
+                "pool2d_x": TensorConfig(shape=x_shape),
+            },
+            outputs=ops[-1].outputs["Out"],
+        )
+        return program_config
+
+    def test(self):
+        self.run_and_statis(
+            quant=False,
+            max_examples=25,
+            passes=["squeeze_excitation_fuse_pass"],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
--
GitLab
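
Reviewer aid (appended after the patch signature, not part of the applied diff):
a minimal NumPy sketch of the float computation that the fused
squeeze_excitation_block op stands in for: global average pool -> 1x1 conv
(squeeze) -> act -> 1x1 conv (excitation) -> act -> channel-wise rescale,
plus the optional branch add and final block activation. The weight layout
mirrors what the pass encodes (w1 transposed to (c, c/r), w2 as (c/r, c)).
The relu/sigmoid choices and every name below are illustrative assumptions;
the real kernel reads its activations from the act_type attribute and runs
int16-quantized weights through xpu::squeeze_excitation_block.

import numpy as np


def se_block_reference(x, w1, w2, b1=None, b2=None, branch=None,
                       block_act=lambda v: v):
    """Hypothetical reference for squeeze_excitation_block (float, NCHW).

    x: (n, c, h, w); w1: (c, c // r); w2: (c // r, c); b1/b2: optional
    biases; branch: optional residual of x's shape; block_act: the final
    activation ("linear" corresponds to the identity).
    """
    pooled = x.mean(axis=(2, 3))        # global average pool -> (n, c)
    z = pooled @ w1                     # squeeze 1x1 conv -> (n, c // r)
    if b1 is not None:
        z = z + b1
    z = np.maximum(z, 0.0)              # excitation_act1 (relu assumed)
    s = z @ w2                          # excitation 1x1 conv -> (n, c)
    if b2 is not None:
        s = s + b2
    s = 1.0 / (1.0 + np.exp(-s))        # excitation_act2 (sigmoid assumed)
    y = x * s[:, :, None, None]         # channel-wise rescale of the input
    if branch is not None:
        y = y + branch                  # optional elementwise_add branch
    return block_act(y)


# Shapes matching the unit test above: c = 24, reduction ratio r = 4.
y = se_block_reference(np.random.rand(1, 24, 14, 14),
                       np.random.rand(24, 6), np.random.rand(6, 24))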