Unverified commit eddf1ad6, authored by wz1qqx and committed by GitHub

[XPU]add conv_fuse pass && kernel (#52247)

Parent d8081f22
...@@ -224,6 +224,7 @@ if(WITH_XPU)
SRCS xpu/pass_utils.cc
DEPS pass xpu_quant_utils)
set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils)
pass_library(conv2d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
pass_library(embedding_with_eltwise_add_xpu_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
pass_library(fc_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <cstring>
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
namespace patterns {
struct Conv2dXPUPattern : public PatternBase {
Conv2dXPUPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& conv_type,
const std::string& act_type,
bool with_conv_bias,
bool with_bn,
bool with_branch_x,
bool with_branch_y);
// declare operator node's name
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(ew_bias_add);
PATTERN_DECL_NODE(bn);
PATTERN_DECL_NODE(ew_branch_add);
PATTERN_DECL_NODE(act);
// declare variable node's name
PATTERN_DECL_NODE(input);
PATTERN_DECL_NODE(conv_filter);
PATTERN_DECL_NODE(conv_out);
PATTERN_DECL_NODE(ew_bias_add_y);
PATTERN_DECL_NODE(ew_bias_add_out);
PATTERN_DECL_NODE(bn_bias);
PATTERN_DECL_NODE(bn_mean);
PATTERN_DECL_NODE(bn_scale);
PATTERN_DECL_NODE(bn_var);
PATTERN_DECL_NODE(bn_out);
PATTERN_DECL_NODE(bn_var_out);
PATTERN_DECL_NODE(bn_mean_out);
PATTERN_DECL_NODE(bn_saved_var);
PATTERN_DECL_NODE(bn_saved_mean);
PATTERN_DECL_NODE(ew_branch_add_in);
PATTERN_DECL_NODE(ew_branch_add_out);
PATTERN_DECL_NODE(act_out);
private:
std::string conv_type_;
std::string act_type_;
bool with_conv_bias_{false};
bool with_bn_{false};
bool with_branch_{false};
bool with_branch_x_{false};
bool with_branch_y_{false};
};
Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& conv_type,
const std::string& act_type,
bool with_conv_bias,
bool with_bn,
bool with_branch_x,
bool with_branch_y)
: PatternBase(pattern, name_scope, name_scope),
conv_type_(conv_type),
act_type_(act_type),
with_conv_bias_(with_conv_bias),
with_bn_(with_bn),
with_branch_(with_branch_x || with_branch_y),
with_branch_x_(with_branch_x),
with_branch_y_(with_branch_y) {
auto conv = pattern->NewNode(conv_repr())->assert_is_op(conv_type_);
auto input = pattern->NewNode(input_repr())
->assert_is_op_input(conv_type_, "Input")
->AsInput();
auto conv_filter = pattern->NewNode(conv_filter_repr())
->assert_is_op_input(conv_type_, "Filter")
->AsInput();
auto conv_out = pattern->NewNode(conv_out_repr())
->assert_is_op_output(conv_type_, "Output")
->assert_var_not_persistable();
conv->LinksFrom({input, conv_filter}).LinksTo({conv_out});
// ew_bias_add op
PDNode* ew_bias_add = nullptr;
PDNode* ew_bias_add_y = nullptr;
PDNode* ew_bias_add_out = nullptr;
if (with_conv_bias_) {
conv_out->assert_is_op_input("elementwise_add", "X");
ew_bias_add_y = pattern->NewNode(ew_bias_add_y_repr())
->assert_is_op_input("elementwise_add", "Y")
->assert_is_persistable_var()
->assert_has_n_outputs(1);
ew_bias_add =
pattern->NewNode(ew_bias_add_repr())->assert_is_op("elementwise_add");
ew_bias_add_out = pattern->NewNode(ew_bias_add_out_repr())
->assert_is_op_output("elementwise_add", "Out");
ew_bias_add->LinksFrom({conv_out, ew_bias_add_y})
.LinksTo({ew_bias_add_out});
} else {
ew_bias_add_out = conv_out;
}
PDNode* bn = nullptr;
PDNode* bn_bias = nullptr;
PDNode* bn_mean = nullptr;
PDNode* bn_scale = nullptr;
PDNode* bn_var = nullptr;
PDNode* bn_out = nullptr;
PDNode* bn_mean_out = nullptr;
PDNode* bn_saved_mean = nullptr;
PDNode* bn_var_out = nullptr;
PDNode* bn_saved_var = nullptr;
PDNode* ew_branch_add = nullptr;
PDNode* ew_branch_add_in = nullptr;
PDNode* ew_branch_add_out = nullptr;
PDNode* act = nullptr;
PDNode* act_out = nullptr;
// batch_norm op
if (with_bn_) {
ew_bias_add_out->assert_is_op_input("batch_norm", "X");
bn_bias = pattern->NewNode(bn_bias_repr())
->assert_is_op_input("batch_norm", "Bias")
->assert_has_n_outputs(1);
bn_mean = pattern->NewNode(bn_mean_repr())
->assert_is_op_input("batch_norm", "Mean")
->assert_has_n_outputs(1);
bn_scale = pattern->NewNode(bn_scale_repr())
->assert_is_op_input("batch_norm", "Scale")
->assert_has_n_outputs(1);
bn_var = pattern->NewNode(bn_var_repr())
->assert_is_op_input("batch_norm", "Variance")
->assert_has_n_outputs(1);
bn = pattern->NewNode(bn_repr())->assert_is_op("batch_norm");
bn_out =
pattern->NewNode(bn_out_repr())->assert_is_op_output("batch_norm", "Y");
bn_mean_out = pattern->NewNode(bn_mean_out_repr())
->assert_is_op_output("batch_norm", "MeanOut");
bn_saved_mean = pattern->NewNode(bn_saved_mean_repr())
->assert_is_op_output("batch_norm", "SavedMean");
bn_var_out = pattern->NewNode(bn_var_out_repr())
->assert_is_op_output("batch_norm", "VarianceOut");
bn_saved_var = pattern->NewNode(bn_saved_var_repr())
->assert_is_op_output("batch_norm", "SavedVariance");
bn->LinksFrom({ew_bias_add_out, bn_bias, bn_mean, bn_scale, bn_var})
.LinksTo(
{bn_out, bn_mean_out, bn_var_out, bn_saved_mean, bn_saved_var});
} else {
bn_out = ew_bias_add_out;
}
// ew_branch_add op
if (with_branch_) {
if (with_branch_x_) {
bn_out->assert_is_op_input("elementwise_add", "Y")->AsIntermediate();
ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr())
->assert_is_op_input("elementwise_add", "X")
->AsInput()
->assert_more([](Node* node) {
return node->Var()->GetShape().size() == 4;
});
} else if (with_branch_y_) {
bn_out->assert_is_op_input("elementwise_add", "X")->AsIntermediate();
ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr())
->assert_is_op_input("elementwise_add", "Y")
->AsInput()
->assert_more([](Node* node) {
return node->Var()->GetShape().size() == 4;
});
}
ew_branch_add =
pattern->NewNode(ew_branch_add_repr())->assert_is_op("elementwise_add");
ew_branch_add_out = pattern->NewNode(ew_branch_add_out_repr())
->assert_is_op_output("elementwise_add", "Out");
ew_branch_add->LinksFrom({bn_out, ew_branch_add_in})
.LinksTo({ew_branch_add_out});
} else {
ew_branch_add_out = bn_out;
}
// act op
if (!act_type_.empty()) {
ew_branch_add_out->assert_is_op_input(act_type_, "X")->AsIntermediate();
act = pattern->NewNode(act_repr())->assert_is_op(act_type_);
act_out = pattern->NewNode(act_out_repr())
->assert_is_op_output(act_type_, "Out")
->assert_var_not_persistable();
act->LinksFrom({ew_branch_add_out}).LinksTo({act_out});
}
}
} // namespace patterns
/*
fuse conv2d blocks in ResNet50-like models into the conv2d_xpu op
For example:
graph[1]: sub block
in_Input
|
|
conv2d----in_Filter
|
|
elementwise_add -----conv_Bias
|
|
batch_norm ------in_Bias
|
|
act
|
|
out_Out
------------------------------------------------------
graph[2]: sub block
in_Input
|
|
conv2d----in_Filter
|
|
batch_norm ------in_Bias
|
|
out_Out
------------------------------------------------------
graph[3]: sub block
in_Input
|
|
conv2d----in_Filter
|
|
in_X batch_norm ------in_Bias
\ |
\ |
elementwise_add
|
|
act
|
|
out_Out
------------------------------------------------------
graph[4]: sub block
in_Input
|
|
conv2d----in_Filter
|
|
elementwise_add ------in_Bias
|
|
act
|
|
out_Out
------------------------------------------------------
After the pass is applied:
in_Input
in_Filter | in_FilterMax
\ | /
\ | /
in_Branch ------- conv2d_xpu ------ in_Bias
| \
| \
| out_OutputMax
out_Output
*/
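// Note: as part of the fusion the conv filter is re-quantized to int16 via
// PrepareWeight<int16_t> below; in_FilterMax and out_OutputMax are the
// per-tensor float max values that accompany the quantized filter and output.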
class Conv2dXPUFusePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
private:
int ApplyImpl(ir::Graph* graph,
const std::string& conv_type,
const std::string& act_type,
bool with_conv_bias,
bool with_bn,
bool with_branch_x,
bool with_branch_y) const;
const std::string name_scope_{"conv2d_xpu_fuse_pass"};
};
void Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
Init(name_scope_, graph);
int found_subgraph_count = 0;
for (auto conv_type : {"conv2d", "depthwise_conv2d"}) {
for (auto with_conv_bias : {true, false}) {
for (auto with_bn : {true, false}) {
for (auto with_branch_x : {true, false}) {
for (auto with_branch_y : {true, false}) {
for (auto act_type : {
"relu",
"sigmoid",
"tanh",
"gelu",
"leaky_relu",
"hard_swish",
"hard_sigmoid",
"relu6",
"swish",
"",
}) {
if (with_branch_x && with_branch_y) continue;
found_subgraph_count += ApplyImpl(graph,
conv_type,
act_type,
with_conv_bias,
with_bn,
with_branch_x,
with_branch_y);
}
}
}
}
}
}
AddStatis(found_subgraph_count);
}
int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph,
const std::string& conv_type,
const std::string& act_type,
bool with_conv_bias,
bool with_bn,
bool with_branch_x,
bool with_branch_y) const {
GraphPatternDetector gpd;
patterns::Conv2dXPUPattern pattern(gpd.mutable_pattern(),
name_scope_,
conv_type,
act_type,
with_conv_bias,
with_bn,
with_branch_x,
with_branch_y);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle Conv2dXPUFusePass fuse";
/* declare operator node's name */
GET_IR_NODE(conv);
GET_IR_NODE(ew_bias_add);
GET_IR_NODE(bn);
GET_IR_NODE(ew_branch_add);
GET_IR_NODE(act);
/* declare variable node's name*/
GET_IR_NODE(input);
GET_IR_NODE(conv_filter);
GET_IR_NODE(conv_out);
GET_IR_NODE(ew_bias_add_y);
GET_IR_NODE(ew_bias_add_out);
GET_IR_NODE(bn_bias);
GET_IR_NODE(bn_mean);
GET_IR_NODE(bn_scale);
GET_IR_NODE(bn_var);
GET_IR_NODE(bn_out);
GET_IR_NODE(bn_var_out);
GET_IR_NODE(bn_mean_out);
GET_IR_NODE(bn_saved_var);
GET_IR_NODE(bn_saved_mean);
GET_IR_NODE(ew_branch_add_in);
GET_IR_NODE(ew_branch_add_out);
GET_IR_NODE(act_out);
auto* block = conv->Op()->Block();
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
// recompute bias and weight for conv2d_xpu op
auto* filter_t =
scope->FindVar(conv_filter->Name())->GetMutable<phi::DenseTensor>();
auto filter_dims = filter_t->dims();
bool has_bias = with_bn || with_conv_bias;
// Create conv_fusion_bias (conv bias) variable
Node* fusion_bias_node = nullptr;
if (has_bias) {
if (ew_bias_add != nullptr) {
auto* ew_bias_add_y_t = scope->FindVar(ew_bias_add_y->Name())
->GetMutable<phi::DenseTensor>();
auto ew_bias_add_y_dims = ew_bias_add_y_t->dims();
PADDLE_ENFORCE_EQ(filter_dims[0],
ew_bias_add_y_dims[0],
platform::errors::InvalidArgument(
"the shape[%d] of elewise bias tensor "
"must equal out_channel[%d] of conv",
ew_bias_add_y_dims[0],
filter_dims[0]));
PrepareBias(graph, scope, block, ew_bias_add_y, &fusion_bias_node);
}
if (bn != nullptr) {
auto bn_bias_t =
scope->Var(bn_bias->Name())->GetMutable<phi::DenseTensor>();
PADDLE_ENFORCE_EQ(filter_dims[0],
bn_bias_t->dims()[0],
platform::errors::InvalidArgument(
"the shape[%d] of bn bias tensor "
"must equal out_channel[%d] of conv",
bn_bias_t->dims()[0],
filter_dims[0]));
auto bn_scale_t =
scope->Var(bn_scale->Name())->GetMutable<phi::DenseTensor>();
auto bn_mean_t =
scope->Var(bn_mean->Name())->GetMutable<phi::DenseTensor>();
auto bn_var_t =
scope->Var(bn_var->Name())->GetMutable<phi::DenseTensor>();
float* filter_ptr =
filter_t->mutable_data<float>(paddle::platform::CPUPlace());
float* bn_scale_ptr =
bn_scale_t->mutable_data<float>(paddle::platform::CPUPlace());
float* bn_bias_ptr =
bn_bias_t->mutable_data<float>(paddle::platform::CPUPlace());
float* bn_mean_ptr =
bn_mean_t->mutable_data<float>(paddle::platform::CPUPlace());
float* bn_var_ptr =
bn_var_t->mutable_data<float>(paddle::platform::CPUPlace());
auto mean_len = bn_mean_t->numel();
auto filter_len = filter_t->numel();
auto filter_stride = filter_len / mean_len;
float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon"));
if (fusion_bias_node == nullptr) { // prev node is conv
PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node);
}
auto fusion_bias_t = scope->Var(fusion_bias_node->Name())
->GetMutable<phi::DenseTensor>();
float* fusion_bias_ptr =
fusion_bias_t->mutable_data<float>(paddle::platform::CPUPlace());
// recompute bias and weights
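// Batch-norm folding, per output channel i (see the two loops below):
//   scale_i  = bn_scale_i / sqrt(bn_var_i + epsilon)
//   filter_i *= scale_i          (applied to all filter_stride weights)
//   bias_i   = (conv_bias_i - bn_mean_i) * scale_i + bn_bias_i
// When there is no conv bias, conv_bias_i is 0 and fusion_bias was
// initialized from bn_bias, so the "+ bn_bias_i" term is already present.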
if (ew_bias_add == nullptr) {
for (int i = 0; i < mean_len; ++i) {
bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon);
fusion_bias_ptr[i] += (0.f - bn_mean_ptr[i]) * bn_scale_ptr[i];
for (int j = 0; j < filter_stride; j++) {
filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i];
}
}
} else {
for (int i = 0; i < mean_len; ++i) {
bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon);
bn_bias_ptr[i] +=
(fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i];
for (int j = 0; j < filter_stride; j++) {
filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i];
}
}
memcpy(fusion_bias_ptr, bn_bias_ptr, mean_len * sizeof(float));
}
}
}
// filter max
Node* filter_int16 = nullptr;
Node* filter_max = nullptr;
PrepareWeight<int16_t>(
graph, scope, block, conv_filter, &filter_int16, &filter_max, false);
// output && output max
std::string conv2d_xpu_out_name;
if (!act_type.empty()) {
conv2d_xpu_out_name = act_out->Name();
} else if (ew_branch_add) {
conv2d_xpu_out_name = ew_branch_add_out->Name();
} else if (bn) {
conv2d_xpu_out_name = bn_out->Name();
} else if (ew_bias_add) {
conv2d_xpu_out_name = ew_bias_add_out->Name();
} else {
conv2d_xpu_out_name = conv_out->Name();
}
std::string conv_out_max_name = conv2d_xpu_out_name + "_max";
VarDesc conv_out_max_desc(conv_out_max_name);
Node* conv2d_xpu_out_max = graph->CreateVarNode(&conv_out_max_desc);
// Generate conv2d_xpu op
framework::OpDesc conv2d_xpu_op_desc(block);
// set input&output var
conv2d_xpu_op_desc.SetType("conv2d_xpu");
conv2d_xpu_op_desc.SetInput("input", {input->Name()});
conv2d_xpu_op_desc.SetInput("filter", {filter_int16->Name()});
conv2d_xpu_op_desc.SetInput("filter_max", {filter_max->Name()});
conv2d_xpu_op_desc.SetOutput("output", {conv2d_xpu_out_name});
conv2d_xpu_op_desc.SetOutput("output_max", {conv_out_max_name});
// set fusion_bias input node
if (has_bias) {
conv2d_xpu_op_desc.SetInput("bias", {fusion_bias_node->Name()});
conv2d_xpu_op_desc.SetAttr("has_bias", has_bias);
}
// set ew_branch_add input node
if (ew_branch_add_in != nullptr) {
conv2d_xpu_op_desc.SetInput("branch", {ew_branch_add_in->Name()});
}
// set attrs of conv2d_xpu
float act_param_ = 0.0f;
if (!act_type.empty()) {
if (act_type == "leaky_relu") {
act_param_ = PADDLE_GET_CONST(float, act->Op()->GetAttr("alpha"));
} else if (act_type == "hard_sigmoid") {
act_param_ = PADDLE_GET_CONST(float, act->Op()->GetAttr("slope"));
}
}
conv2d_xpu_op_desc.SetAttr("act_type", ConvertActivationType(act_type));
conv2d_xpu_op_desc.SetAttr("act_param", act_param_);
std::vector<int> conv_bias;
if (has_bias) {
conv_bias.push_back(1);
} else {
conv_bias.push_back(0);
}
if (conv->Op()->HasAttr("padding_algorithm")) {
conv2d_xpu_op_desc.SetAttr(
"padding_algorithm",
PADDLE_GET_CONST(std::string,
conv->Op()->GetAttr("padding_algorithm")));
}
auto conv_paddings =
PADDLE_GET_CONST(std::vector<int>, conv->Op()->GetAttr("paddings"));
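// A 2-element "paddings" attr [pad_h, pad_w] is expanded in place to the
// 4-element form [pad_h, pad_h, pad_w, pad_w] expected by conv2d_xpu.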
if (conv_paddings.size() == 2) {
for (int i = 0; i < 2; i++) {
int copy_pad = *(conv_paddings.begin() + 2 * i);
conv_paddings.insert(conv_paddings.begin() + 2 * i + 1, copy_pad);
}
}
PADDLE_ENFORCE_EQ(conv_paddings.size(),
4UL,
platform::errors::InvalidArgument(
"padding length should be 4, but received %d, ",
conv_paddings.size()));
conv2d_xpu_op_desc.SetAttr(
"dilations",
PADDLE_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations")));
conv2d_xpu_op_desc.SetAttr(
"groups", PADDLE_GET_CONST(int, conv->Op()->GetAttr("groups")));
conv2d_xpu_op_desc.SetAttr(
"strides",
PADDLE_GET_CONST(std::vector<int>, conv->Op()->GetAttr("strides")));
conv2d_xpu_op_desc.SetAttr("conv_bias", conv_bias);
conv2d_xpu_op_desc.SetAttr("op_type", std::vector<int>{0});
conv2d_xpu_op_desc.SetAttr("place_x", std::vector<int>{0});
conv2d_xpu_op_desc.SetAttr("place_y", std::vector<int>{9});
conv2d_xpu_op_desc.SetAttr("place_z", std::vector<int>{10});
conv2d_xpu_op_desc.SetAttr("paddings", conv_paddings);
conv2d_xpu_op_desc.SetAttr("block_lod", std::vector<int>{1});
conv2d_xpu_op_desc.SetAttr("has_branch", with_branch_x || with_branch_y);
auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc);
IR_NODE_LINK_TO(input, conv2d_xpu);
IR_NODE_LINK_TO(filter_int16, conv2d_xpu);
IR_NODE_LINK_TO(filter_max, conv2d_xpu);
if (ew_bias_add || bn) {
SAFE_IR_NODE_LINK_TO(fusion_bias_node, conv2d_xpu);
}
if (ew_branch_add_in) {
IR_NODE_LINK_TO(ew_branch_add_in, conv2d_xpu);
}
if (act_out) {
IR_NODE_LINK_TO(conv2d_xpu, act_out);
} else if (ew_branch_add_out) {
IR_NODE_LINK_TO(conv2d_xpu, ew_branch_add_out);
} else if (bn_out) {
IR_NODE_LINK_TO(conv2d_xpu, bn_out);
} else if (ew_bias_add_out) {
IR_NODE_LINK_TO(conv2d_xpu, ew_bias_add_out);
} else {
IR_NODE_LINK_TO(conv2d_xpu, conv_out);
}
IR_NODE_LINK_TO(conv2d_xpu, conv2d_xpu_out_max);
// delete useless node
std::unordered_set<const Node*> delete_nodes = {conv};
if (act != nullptr) {
delete_nodes.insert(act);
}
if (ew_branch_add != nullptr) {
delete_nodes.insert(ew_branch_add);
}
if (bn != nullptr) {
delete_nodes.insert(bn);
delete_nodes.insert(bn_bias);
delete_nodes.insert(bn_var);
delete_nodes.insert(bn_mean);
delete_nodes.insert(bn_scale);
delete_nodes.insert(bn_var_out);
delete_nodes.insert(bn_mean_out);
delete_nodes.insert(bn_saved_var);
delete_nodes.insert(bn_saved_mean);
}
if (ew_bias_add) {
delete_nodes.insert(ew_bias_add);
delete_nodes.insert(ew_bias_add_y);
}
GraphSafeRemoveNodes(graph, delete_nodes);
found_subgraph_count++;
};
gpd(graph, handler);
return found_subgraph_count;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv2d_xpu_fuse_pass, paddle::framework::ir::Conv2dXPUFusePass);
REGISTER_PASS_CAPABILITY(conv2d_xpu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"conv2d_xpu", 0));
...@@ -532,6 +532,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
"stack_fuse_pass",
"fused_multi_transformer_xpu_quant_pass",
"fc_xpu_fuse_pass",
"conv2d_xpu_fuse_pass",
"link_xpu_op_max_pass", "link_xpu_op_max_pass",
"inplace_op_var_pass", "inplace_op_var_pass",
"delete_isolated_node_pass", "delete_isolated_node_pass",
......
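Because conv2d_xpu_fuse_pass is now registered in XpuPassStrategy (above), it runs automatically during graph optimization when a model is loaded for XPU inference. A minimal sketch of triggering it through the Paddle Inference C++ API; the model directory is a hypothetical placeholder, not part of this commit:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  // Hypothetical model directory; any inference model with conv2d blocks works.
  paddle_infer::Config config("./resnet50_infer_model");
  config.EnableXpu();  // selects XpuPassStrategy, which now runs
                       // conv2d_xpu_fuse_pass during graph optimization
  auto predictor = paddle_infer::CreatePredictor(config);
  return 0;
}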
...@@ -4,6 +4,16 @@
# if an operator has "support_dygraph_mode : true", it supports dygraph mode,
# otherwise the operator can only be used in static mode.
- op : conv2d_xpu
args : (Tensor input, Tensor input_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, bool has_bias, bool has_branch, int act_type, float act_param)
output : Tensor(output), Tensor(output_max)
infer_meta :
func : Conv2dXPUInferMeta
kernel :
func : conv2d_xpu
data_type : input
optional : bias, branch, input_max
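# Note: bias, branch and input_max are optional because conv2d_xpu_fuse_pass
# only wires them when the matched subgraph actually provides them.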
- op : embedding_with_eltwise_add_xpu
args : (Tensor[] ids, Tensor[] tables, int64_t padding_idx)
output: Tensor
......
...@@ -58,6 +58,7 @@ XPUOpMap& get_kl1_ops() {
{"concat_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"conv2d", XPUKernelSet({phi::DataType::FLOAT32})},
{"conv2d_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"conv2d_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
{"deformable_conv", XPUKernelSet({phi::DataType::FLOAT32})}, {"deformable_conv", XPUKernelSet({phi::DataType::FLOAT32})},
{"deformable_conv_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"deformable_conv_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"depthwise_conv2d", XPUKernelSet({phi::DataType::FLOAT32})}, {"depthwise_conv2d", XPUKernelSet({phi::DataType::FLOAT32})},
......
...@@ -151,6 +151,8 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"conv2d",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"conv2d_xpu",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"conv3d_grad", {"conv3d_grad",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"conv3d", {"conv3d",
......
...@@ -18,9 +18,149 @@ limitations under the License. */
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace phi {
inline int ConvOutSize(int input_size,
int filter_size,
int dilation,
int pad_left,
int pad_right,
int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size =
(input_size + (pad_left + pad_right) - dkernel) / stride + 1;
return output_size;
}
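// Worked example: input_size = 12, filter_size = 3, dilation = 1,
// pad_left = pad_right = 1, stride = 1:
//   dkernel = 1 * (3 - 1) + 1 = 3
//   output_size = (12 + 1 + 1 - 3) / 1 + 1 = 12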
void Conv2dXPUInferMeta(const MetaTensor& input,
const MetaTensor& input_max,
const MetaTensor& filter,
const MetaTensor& filter_max,
const MetaTensor& bias,
const MetaTensor& branch,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::string& padding_algorithm,
int groups,
bool has_bias,
bool has_branch,
int act_type,
float act_param,
MetaTensor* output,
MetaTensor* output_max) {
auto in_dims = input.dims();
auto filter_dims = filter.dims();
// do some checks
PADDLE_ENFORCE_EQ(
in_dims.size(),
4,
phi::errors::InvalidArgument(
"The input of Op(Conv_xpu) should be a 4-D Tensor. But "
"received: input's dimension is %u, input's shape is [%s].",
in_dims.size(),
in_dims));
PADDLE_ENFORCE_EQ(
in_dims.size(),
filter_dims.size(),
phi::errors::InvalidArgument(
"The input's dimension and filter's dimension of "
"Op(Conv_xpu) should be equal. But received: the input's shape is "
"[%s], "
"the input's dimension is %d; the filter's shape is [%s], "
"the filter's dimension is %d.",
in_dims,
in_dims.size(),
filter_dims,
filter_dims.size()));
const auto input_channels = in_dims[1];
int stride_size = strides.size();
int in_sub_stride_size = in_dims.size() - stride_size;
int dilation_size = dilations.size();
PADDLE_ENFORCE_EQ(
in_dims.size(),
strides.size() + 2U,
phi::errors::InvalidArgument(
"The difference of input's dimension and Attr(strides)'s "
"length must be euqal to 2 for Op(Conv_xpu). "
"But received: input's dimension is %d, input's shape is [%s]; "
"Attr(stride)'s length is %d, Attr(stride) is [%s]; "
"difference of input's dimention and Attr(strides)'s length = %u.",
in_dims.size(),
in_dims,
strides.size(),
phi::make_ddim(strides),
in_sub_stride_size));
for (int i = 0; i < dilation_size; ++i) {
PADDLE_ENFORCE_GT(
dilations[i],
0,
phi::errors::InvalidArgument(
"The dilation of Op(Conv) should be larget than 0, but received "
"dilation is %d.",
dilations[i]));
}
PADDLE_ENFORCE_EQ(
input_channels,
filter_dims[1] * groups,
phi::errors::InvalidArgument(
"The number of input's channels should be equal to filter's channels "
"* groups for Op(Conv_xpu). But received: the input's channels is "
"%d, "
"the input's shape is [%s]; the filter's channels is %d, the "
"filter's shape is [%s]; the groups is %d. ",
input_channels,
in_dims,
filter_dims[1],
filter_dims,
groups));
PADDLE_ENFORCE_EQ(
filter_dims[0] % groups,
0,
phi::errors::InvalidArgument(
"The number of output's channels (filter's first dimension) of "
"Op(Conv) should be divided by groups. But received: "
"the output channels is %d, the filter's shape is [%s], "
"the groups is %d.",
filter_dims[0],
filter_dims,
groups));
// update paddings and dilations according to padding_algorithm
std::vector<int> paddings_vec = paddings;
std::vector<int> dilations_vec = dilations;
DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(&paddings_vec,
&dilations_vec,
padding_algorithm,
in_data_dims,
strides,
ksize);
std::vector<int64_t> out_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
out_shape.push_back(ConvOutSize(in_dims[i + 2],
filter_dims[i + 2],
dilations_vec[i],
paddings_vec[i * 2],
paddings_vec[i * 2 + 1],
strides[i]));
}
// set output and output max dims
output->set_dims(DDim(out_shape.data(), out_shape.size()));
output_max->set_dims(phi::make_ddim({4}));
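// output_max carries the per-tensor max of the output for downstream
// quantized XPU ops; it is modeled as a fixed 4-element float buffer here.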
}
void EmbeddingWithEltwiseAddXPUInferMeta(
const std::vector<const MetaTensor*>& ids,
const std::vector<const MetaTensor*>& tables,
......
...@@ -22,6 +22,24 @@ namespace phi {
// Common InferMeta Functions for fusion operators.
// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
void Conv2dXPUInferMeta(const MetaTensor& input,
const MetaTensor& input_max,
const MetaTensor& filter,
const MetaTensor& filter_max,
const MetaTensor& bias,
const MetaTensor& branch,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::string& padding_algorithm,
int groups,
bool has_bias,
bool has_branch,
int act_type,
float act_param,
MetaTensor* output,
MetaTensor* output_max);
void EmbeddingWithEltwiseAddXPUInferMeta(
const std::vector<const MetaTensor*>& ids,
const std::vector<const MetaTensor*>& tables,
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace phi {
namespace fusion {
template <typename T, typename Context>
void Conv2dXPUKernel(const Context& ctx,
const DenseTensor& input,
const paddle::optional<DenseTensor>& input_max,
const DenseTensor& filter,
const DenseTensor& filter_max,
const paddle::optional<DenseTensor>& bias,
const paddle::optional<DenseTensor>& branch,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::string& padding_algorithm,
int groups,
bool has_bias,
bool has_branch,
int act_type,
float act_param,
DenseTensor* output,
DenseTensor* output_max) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto input_dims = input.dims();
auto filter_dims = filter.dims();
// update paddings and dilations according to padding_algorithm
std::vector<int> paddings_vec = paddings;
std::vector<int> dilations_vec = dilations;
DDim in_data_dims = phi::slice_ddim(input_dims, 2, input_dims.size());
DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(&paddings_vec,
&dilations_vec,
padding_algorithm,
in_data_dims,
strides,
ksize);
int batch = static_cast<int>(input_dims[0]);
int in_c = static_cast<int>(input_dims[1]);
int in_h = static_cast<int>(input_dims[2]);
int in_w = static_cast<int>(input_dims[3]);
int out_c = static_cast<int>(filter_dims[0]);
int win_h = static_cast<int>(filter_dims[2]);
int win_w = static_cast<int>(filter_dims[3]);
auto* input_data = reinterpret_cast<const XPUType*>(input.data<T>());
const float* input_max_data = input_max.get_ptr() == nullptr
? nullptr
: input_max.get_ptr()->data<float>();
auto* branch_data =
branch.get_ptr() == nullptr
? nullptr
: reinterpret_cast<const XPUType*>(branch.get_ptr()->data<T>());
const float* bias_data =
bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data<float>();
auto* out_data = reinterpret_cast<XPUType*>(ctx.template Alloc<T>(output));
xpu::Activation_t act(static_cast<xpu::Activation_t::act_enum>(act_type));
if (act_type == xpu::Activation_t::LEAKY_RELU) {
act.leaky_alpha = act_param;
} else if (act_type == xpu::Activation_t::HARD_SIGMOID) {
act.hard_sigmoid_slope = act_param;
}
int r =
xpu::conv2d_fusion<XPUType, int16_t, XPUType, int16_t>( // TX/TW/TY/TGEMM
/* baidu::xpu::api::Context* ctx */ ctx.x_context(),
/* const TX* input */ input_data,
/* const TW* filter */ filter.data<int16_t>(),
/* TY* output */ out_data,
/* int64_t n */ batch,
/* int64_t ic */ in_c,
/* int64_t h */ in_h,
/* int64_t w */ in_w,
/* int64_t oc */ out_c,
/* const std::vector<int>& ksize */ std::vector<int>{win_h, win_w},
/* const std::vector<int>& strides */ strides,
/* const std::vector<int>& paddings */ paddings_vec,
/* const std::vector<int>& dilations */ dilations_vec,
/* int64_t groups */ groups,
/* const float* in_maxptr */ input_max_data,
/* const float* filter_maxptr */ filter_max.data<float>(),
/* float* out_maxptr */ ctx.template Alloc<float>(output_max),
/* bool is_nchw */ true,
/* const float* bias */ bias_data,
/* const TY* branch */ branch_data,
/* const baidu::xpu::api::Activation_t& act */ act,
/* const float* branch_maxptr */ nullptr);
// /* const float* scale */ nullptr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_xpu");
}
} // namespace fusion
} // namespace phi
PD_REGISTER_KERNEL(conv2d_xpu,
XPU,
ALL_LAYOUT,
phi::fusion::Conv2dXPUKernel,
float,
phi::dtype::float16) {}
...@@ -45,9 +45,9 @@ void FcXPUKernel(const Context& ctx,
bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data<float>();
auto* out_data = reinterpret_cast<XPUType*>(ctx.template Alloc<T>(out));
xpu::Activation_t act(static_cast<xpu::Activation_t::act_enum>(act_type));
if (act_type == xpu::Activation_t::LEAKY_RELU) {  // was: act_type == 5
act.leaky_alpha = act_alpha;
} else if (act_type == xpu::Activation_t::HARD_SIGMOID) {  // was: act_type == 15
act.hard_sigmoid_slope = act_alpha;
}
int r =
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from functools import partial
import hypothesis.strategies as st
import numpy as np
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestConv2dXPUFusePass(PassAutoScanTest):
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_xpu=True)
yield config, ["conv2d_xpu"], (1e-3, 1e-3)
def is_program_valid(self, prog_config):
paddings = prog_config.ops[0].attrs["paddings"]
strides = prog_config.ops[0].attrs["strides"]
groups = prog_config.ops[0].attrs["groups"]
padding_algorithm = prog_config.ops[0].attrs["padding_algorithm"]
dilations = prog_config.ops[0].attrs["dilations"]
data_format = prog_config.ops[0].attrs["data_format"]
filter_shape = prog_config.weights["conv2d_weight"].shape
input_shape = prog_config.inputs["conv2d_input"].shape
if data_format != "NCHW":
return False
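# Reject shapes whose conv output spatial size would be <= 1; the checks
# below mirror the ConvOutSize computation used by Conv2dXPUInferMeta.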
if padding_algorithm == "VALID":
if (
(input_shape[2] - (dilations[0] * (filter_shape[2] - 1) + 1))
/ strides[0]
+ 1
) <= 1 or (
(input_shape[3] - (dilations[1] * (filter_shape[3] - 1) + 1))
/ strides[1]
+ 1
) <= 1:
return False
if padding_algorithm == "EXPLICIT":
if (
(
input_shape[2]
+ paddings[0]
+ paddings[1]
- (dilations[0] * (filter_shape[2] - 1) + 1)
)
/ strides[0]
+ 1
) <= 1 or (
(
input_shape[3]
+ paddings[2]
+ paddings[3]
- (dilations[1] * (filter_shape[3] - 1) + 1)
)
/ strides[1]
+ 1
) <= 1:
return False
if data_format == "NCHW":
if input_shape[1] != filter_shape[1] * groups:
return False
if filter_shape[0] % groups != 0:
return False
return True
def sample_program_config(self, draw):
data_format = draw(st.sampled_from(["NCHW"]))
x_shape = draw(
st.lists(
st.integers(min_value=12, max_value=12), min_size=4, max_size=4
)
)
x_shape[1] = draw(st.integers(min_value=1, max_value=10))
# Generate a legal shape for the conv2d filter
w_shape = draw(
st.lists(
st.integers(min_value=3, max_value=3), min_size=4, max_size=4
)
)
if data_format == "NCHW":
w_shape[1] = x_shape[1]
padding_algorithm = draw(st.sampled_from(["SAME", "VALID"]))
groups = draw(st.integers(min_value=1, max_value=1))
dilations = draw(
st.lists(
st.integers(min_value=1, max_value=1), min_size=2, max_size=2
)
)
paddings = draw(
st.lists(
st.integers(min_value=1, max_value=1), min_size=2, max_size=2
)
)
strides = draw(
st.lists(
st.integers(min_value=1, max_value=1), min_size=2, max_size=2
)
)
axis = 1
ew_bias_shape = [w_shape[0]]
# Append a relu activation (kept fixed to True in this test)
has_relu = True
def generate_data(shape):
return np.random.random(shape).astype(np.float32)
# Here we will compose a program.
# There is still some risk that the program is invalid or causes bugs at runtime.
# Use `is_program_valid` to filter out invalid programs before running.
# Use `add_skip_pass_case` to skip programs that are known to fail while running.
conv2d_op = OpConfig(
"conv2d",
inputs={
"Input": ["conv2d_input"],
"Filter": ["conv2d_weight"],
},
outputs={"Output": ["conv2d_out"]},
data_format=data_format,
dilations=dilations,
padding_algorithm=padding_algorithm,
groups=groups,
paddings=paddings,
strides=strides,
has_bias=False,
)
ew_bias_op = OpConfig(
"elementwise_add",
inputs={"X": ["conv2d_out"], "Y": ["ew_bias"]},
outputs={"Out": ["add_out"]},
axis=axis,
)
ops = [conv2d_op, ew_bias_op]
# Optionally append a relu activation
if has_relu:
relu_op = OpConfig(
"relu", inputs={"X": ["add_out"]}, outputs={"Out": ["relu_out"]}
)
ops.append(relu_op)
program_config = ProgramConfig(
ops=ops,
inputs={
"conv2d_input": TensorConfig(
data_gen=partial(generate_data, x_shape)
),
},
weights={
"conv2d_weight": TensorConfig(
data_gen=partial(generate_data, w_shape)
),
"ew_bias": TensorConfig(
data_gen=partial(generate_data, ew_bias_shape)
),
},
outputs=ops[-1].outputs["Out"],
)
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=25,
passes=["conv2d_xpu_fuse_pass"],
)
if __name__ == "__main__":
unittest.main()