未验证 提交 12c15b89 编写于 作者: M mjp9527 提交者: GitHub

[XPU] Add conv2d transpose fuse pass (#54904)

上级 25726b94
...@@ -240,6 +240,8 @@ if(WITH_XPU) ...@@ -240,6 +240,8 @@ if(WITH_XPU)
pass_library(conv2d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
pass_library(redundant_squeeze_unsqueeze_elimination_pass inference DIR xpu pass_library(redundant_squeeze_unsqueeze_elimination_pass inference DIR xpu
DEPS ${XPU_PASS_DEPS}) DEPS ${XPU_PASS_DEPS})
pass_library(conv2d_transpose_xpu_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
pass_library(embedding_with_eltwise_add_xpu_fuse_pass inference DIR xpu DEPS pass_library(embedding_with_eltwise_add_xpu_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS}) ${XPU_PASS_DEPS})
pass_library(fc_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(fc_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
......
...@@ -188,8 +188,10 @@ void AutoMixedPrecisionPass::SetDefaultBlacklist() const { ...@@ -188,8 +188,10 @@ void AutoMixedPrecisionPass::SetDefaultBlacklist() const {
"c_softmax_with_cross_entropy", "c_softmax_with_cross_entropy",
"cross_entropy", "cross_entropy",
"cross_entropy2", "cross_entropy2",
#ifndef PADDLE_WITH_XPU
// slower than fp32 // slower than fp32
"conv2d_transpose", "conv2d_transpose",
#endif
// default fp32 can avoid return inf when the sum value large than 65504 // default fp32 can avoid return inf when the sum value large than 65504
"reduce_sum", "reduce_sum",
}); });
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
namespace patterns {
struct Conv2dTransposeXPUPattern : public PatternBase {
Conv2dTransposeXPUPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& act_type,
bool with_ew_bias,
bool with_bn);
// operator
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(ew_bias_add);
PATTERN_DECL_NODE(bn);
PATTERN_DECL_NODE(act);
// conv param
PATTERN_DECL_NODE(input);
PATTERN_DECL_NODE(conv_filter);
PATTERN_DECL_NODE(conv_out);
// ew param
PATTERN_DECL_NODE(ew_bias_add_y);
PATTERN_DECL_NODE(ew_bias_add_out);
// bn param
PATTERN_DECL_NODE(bn_bias);
PATTERN_DECL_NODE(bn_mean);
PATTERN_DECL_NODE(bn_scale);
PATTERN_DECL_NODE(bn_var);
PATTERN_DECL_NODE(bn_out);
PATTERN_DECL_NODE(bn_var_out);
PATTERN_DECL_NODE(bn_mean_out);
PATTERN_DECL_NODE(bn_saved_var);
PATTERN_DECL_NODE(bn_saved_mean);
// act param
PATTERN_DECL_NODE(act_out);
private:
std::string act_type_;
bool with_bn_;
bool with_ew_bias_;
};
Conv2dTransposeXPUPattern::Conv2dTransposeXPUPattern(
PDPattern* pattern,
const std::string& name_scope,
const std::string& act_type,
bool with_ew_bias,
bool with_bn)
: PatternBase(pattern, name_scope, name_scope),
act_type_(act_type),
with_bn_(with_bn),
with_ew_bias_(with_ew_bias) {
// deconv op
auto conv = pattern->NewNode(conv_repr())->assert_is_op("conv2d_transpose");
auto input = pattern->NewNode(input_repr())
->assert_is_op_input("conv2d_transpose", "Input")
->AsInput()
->assert_more([](Node* node) {
return node->Var()->GetShape().size() == 4;
});
auto conv_filter = pattern->NewNode(conv_filter_repr())
->assert_is_op_input("conv2d_transpose", "Filter")
->AsInput();
auto conv_out = pattern->NewNode(conv_out_repr())
->assert_is_op_output("conv2d_transpose", "Output")
->assert_has_n_outputs(1);
conv->LinksFrom({input, conv_filter}).LinksTo({conv_out});
// elementwise op
PDNode* ew_bias_add = nullptr;
PDNode* ew_bias_add_y = nullptr;
PDNode* ew_bias_add_out = nullptr;
if (with_ew_bias_) {
conv_out->assert_is_op_input("elementwise_add", "X");
ew_bias_add_y = pattern->NewNode(ew_bias_add_y_repr())
->assert_is_op_input("elementwise_add", "Y")
->assert_is_persistable_var()
->assert_has_n_outputs(1)
->assert_more([](Node* node) {
return node->Var()->GetShape().size() == 1;
});
ew_bias_add =
pattern->NewNode(ew_bias_add_repr())->assert_is_op("elementwise_add");
ew_bias_add_out = pattern->NewNode(ew_bias_add_out_repr())
->assert_is_op_output("elementwise_add", "Out");
if (with_bn_ || !act_type_.empty()) {
ew_bias_add_out->assert_has_n_outputs(1);
}
ew_bias_add->LinksFrom({conv_out, ew_bias_add_y})
.LinksTo({ew_bias_add_out});
} else {
ew_bias_add_out = conv_out;
}
// batch_norm op
PDNode* bn = nullptr;
PDNode* bn_bias = nullptr;
PDNode* bn_mean = nullptr;
PDNode* bn_scale = nullptr;
PDNode* bn_var = nullptr;
PDNode* bn_out = nullptr;
PDNode* bn_mean_out = nullptr;
PDNode* bn_saved_mean = nullptr;
PDNode* bn_var_out = nullptr;
PDNode* bn_saved_var = nullptr;
if (with_bn_) {
ew_bias_add_out->assert_is_op_input("batch_norm", "X");
bn_bias = pattern->NewNode(bn_bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("batch_norm", "Bias")
->assert_has_n_outputs(1);
bn_mean = pattern->NewNode(bn_mean_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("batch_norm", "Mean")
->assert_has_n_outputs(1);
bn_scale = pattern->NewNode(bn_scale_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("batch_norm", "Scale")
->assert_has_n_outputs(1);
bn_var = pattern->NewNode(bn_var_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("batch_norm", "Variance")
->assert_has_n_outputs(1);
bn = pattern->NewNode(bn_repr())->assert_is_op("batch_norm");
bn_out =
pattern->NewNode(bn_out_repr())->assert_is_op_output("batch_norm", "Y");
if (!act_type_.empty()) {
bn_out->assert_has_n_outputs(1);
}
bn_mean_out = pattern->NewNode(bn_mean_out_repr())
->assert_is_op_output("batch_norm", "MeanOut");
bn_saved_mean = pattern->NewNode(bn_saved_mean_repr())
->assert_is_op_output("batch_norm", "SavedMean");
bn_var_out = pattern->NewNode(bn_var_out_repr())
->assert_is_op_output("batch_norm", "VarianceOut");
bn_saved_var = pattern->NewNode(bn_saved_var_repr())
->assert_is_op_output("batch_norm", "SavedVariance");
bn->LinksFrom({ew_bias_add_out, bn_bias, bn_mean, bn_scale, bn_var})
.LinksTo(
{bn_out, bn_mean_out, bn_var_out, bn_saved_mean, bn_saved_var});
} else {
bn_out = ew_bias_add_out;
}
// act
PDNode* act = nullptr;
PDNode* act_out = nullptr;
if (!act_type_.empty()) {
bn_out->assert_is_op_input(act_type_, "X");
act = pattern->NewNode(act_repr())->assert_is_op(act_type_);
act_out =
pattern->NewNode(act_out_repr())->assert_is_op_output(act_type_, "Out");
act->LinksFrom({bn_out}).LinksTo({act_out});
} else {
act_out = bn_out;
}
act_out->AsOutput();
}
} // namespace patterns
/* fuse conv2d block in resnet50-like model to xpu_conv2d op */
/* For example: */
/* graph[1]: sub block */
/* in_Input */
/* | */
/* | */
/* conv2d_transpose----in_Filter */
/* | */
/* | */
/* elementwise_add -----ew_add */
/* | */
/* | */
/* batch_norm ------in_Bias */
/* | */
/* | */
/* act */
/* | */
/* | */
/* out_Out */
/* */
class Conv2dTransposeXPUFusePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
private:
int ApplyImpl(ir::Graph* graph,
const std::string& act_type,
bool with_ew_bias,
bool with_bn) const;
const std::string name_scope_{"conv2d_transpose_xpu_fuse_pass"};
};
void Conv2dTransposeXPUFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
Init(name_scope_, graph);
int found_subgraph_count = 0;
for (auto with_bn : {true, false}) {
for (auto with_ew_bias : {true, false}) {
for (auto act_type : {"relu", ""}) {
found_subgraph_count +=
ApplyImpl(graph, act_type, with_ew_bias, with_bn);
}
}
}
AddStatis(found_subgraph_count);
}
int Conv2dTransposeXPUFusePass::ApplyImpl(ir::Graph* graph,
const std::string& act_type,
bool with_ew_bias,
bool with_bn) const {
GraphPatternDetector gpd;
patterns::Conv2dTransposeXPUPattern pattern(
gpd.mutable_pattern(), name_scope_, act_type, with_ew_bias, with_bn);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle Conv2dTransposeXPUFusePass fuse";
/* declare operator node's name */
GET_IR_NODE(conv);
GET_IR_NODE(ew_bias_add);
GET_IR_NODE(bn);
GET_IR_NODE(act);
/* declare variable node's name*/
GET_IR_NODE(input);
GET_IR_NODE(conv_filter);
GET_IR_NODE(conv_out);
GET_IR_NODE(ew_bias_add_y);
GET_IR_NODE(ew_bias_add_out);
GET_IR_NODE(bn_bias);
GET_IR_NODE(bn_mean);
GET_IR_NODE(bn_scale);
GET_IR_NODE(bn_var);
GET_IR_NODE(bn_out);
GET_IR_NODE(bn_var_out);
GET_IR_NODE(bn_mean_out);
GET_IR_NODE(bn_saved_var);
GET_IR_NODE(bn_saved_mean);
GET_IR_NODE(act_out);
auto* block = conv->Op()->Block();
auto* scope = param_scope();
PADDLE_ENFORCE_NOT_NULL(
scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
// recompute bias and weight for conv2d_transpose_xpu op
auto* filter_t =
scope->FindVar(conv_filter->Name())->GetMutable<phi::DenseTensor>();
// conv_filter fp16 --> fp32
auto tensor_type = filter_t->dtype();
if (tensor_type == phi::DataType::FLOAT16) {
CastToFp32(filter_t, nullptr);
}
auto filter_dims = filter_t->dims();
bool has_bias = with_bn || with_ew_bias;
Node* fusion_bias_node = nullptr;
int groups = PADDLE_GET_CONST(int, conv->Op()->GetAttr("groups"));
int out_c = filter_dims[1] * groups;
// ew bias
if (with_ew_bias) {
auto* ew_bias_add_y_t =
scope->FindVar(ew_bias_add_y->Name())->GetMutable<phi::DenseTensor>();
auto ew_bias_add_y_dims = ew_bias_add_y_t->dims();
PADDLE_ENFORCE_EQ(out_c,
ew_bias_add_y_dims[0],
platform::errors::InvalidArgument(
"the shape[%d] of elewise bias tensor "
"must equal out_channel[%d] of conv",
ew_bias_add_y_dims[0],
out_c));
PrepareBias(graph, scope, block, ew_bias_add_y, &fusion_bias_node);
}
// bn
if (with_bn) {
auto bn_bias_t =
scope->Var(bn_bias->Name())->GetMutable<phi::DenseTensor>();
PADDLE_ENFORCE_EQ(out_c,
bn_bias_t->dims()[0],
platform::errors::InvalidArgument(
"the shape[%d] of bn bias tensor "
"must equal out_channel[%d] of conv",
bn_bias_t->dims()[0],
out_c));
auto bn_scale_t =
scope->Var(bn_scale->Name())->GetMutable<phi::DenseTensor>();
auto bn_mean_t =
scope->Var(bn_mean->Name())->GetMutable<phi::DenseTensor>();
auto bn_var_t =
scope->Var(bn_var->Name())->GetMutable<phi::DenseTensor>();
float* filter_ptr = filter_t->data<float>();
float* bn_scale_ptr = bn_scale_t->data<float>();
float* bn_bias_ptr = bn_bias_t->data<float>();
float* bn_mean_ptr = bn_mean_t->data<float>();
float* bn_var_ptr = bn_var_t->data<float>();
auto mean_len = bn_mean_t->numel(); // oc
float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon"));
// bias
if (fusion_bias_node) {
auto fusion_bias_t = scope->Var(fusion_bias_node->Name())
->GetMutable<phi::DenseTensor>();
float* fusion_bias_ptr = fusion_bias_t->data<float>();
for (int i = 0; i < mean_len; ++i) {
bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon);
fusion_bias_ptr[i] =
bn_bias_ptr[i] +
(fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i];
}
} else {
PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node);
auto fusion_bias_t = scope->Var(fusion_bias_node->Name())
->GetMutable<phi::DenseTensor>();
float* fusion_bias_ptr = fusion_bias_t->data<float>();
for (int i = 0; i < mean_len; ++i) {
bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon);
fusion_bias_ptr[i] += (0.0f - bn_mean_ptr[i]) * bn_scale_ptr[i];
}
}
// compute new conv_weight, weight is ic-oc/g-h-w
int cout_group = filter_dims[1];
int cin_group = filter_dims[0] / groups;
int c_size = cout_group * filter_dims[2] * filter_dims[3];
int hw = filter_dims[2] * filter_dims[3];
for (int g = 0; g < groups; g++) {
for (int k = 0; k < cin_group; ++k) {
for (int i = 0; i < cout_group; ++i) {
auto ptr_row =
filter_ptr + g * cin_group * c_size + k * c_size + i * hw;
for (int j = 0; j < hw; ++j) {
ptr_row[j] *= bn_scale_ptr[g * cout_group + i];
}
}
}
}
}
// filter max
Node* filter_int16 = nullptr;
Node* filter_max = nullptr;
PrepareWeight<int16_t>(
graph, scope, block, conv_filter, &filter_int16, &filter_max, false);
// output && output max
std::string conv2d_xpu_out_name;
if (!act_type.empty()) {
conv2d_xpu_out_name = act_out->Name();
} else if (with_bn) {
conv2d_xpu_out_name = bn_out->Name();
} else if (with_ew_bias) {
conv2d_xpu_out_name = ew_bias_add_out->Name();
} else {
conv2d_xpu_out_name = conv_out->Name();
}
std::string conv_out_max_name = conv2d_xpu_out_name + "_max";
VarDesc conv_out_max_desc(conv_out_max_name);
Node* conv2d_xpu_out_max = graph->CreateVarNode(&conv_out_max_desc);
// Generate conv2d_xpu op
framework::OpDesc conv2d_xpu_op_desc(block);
// set input&output var
conv2d_xpu_op_desc.SetType("conv2d_transpose_xpu");
conv2d_xpu_op_desc.SetInput("x", {input->Name()});
conv2d_xpu_op_desc.SetInput("filter", {filter_int16->Name()});
conv2d_xpu_op_desc.SetInput("filter_max", {filter_max->Name()});
conv2d_xpu_op_desc.SetOutput("out", {conv2d_xpu_out_name});
conv2d_xpu_op_desc.SetOutput("out_max", {conv_out_max_name});
// set fusion_bias input node
if (has_bias) {
conv2d_xpu_op_desc.SetInput("bias", {fusion_bias_node->Name()});
}
conv2d_xpu_op_desc.SetAttr("has_bias", has_bias);
// set attrs of conv2d_xpu
if (!act_type.empty()) {
conv2d_xpu_op_desc.SetAttr("with_act", true);
} else {
conv2d_xpu_op_desc.SetAttr("with_act", false);
}
conv2d_xpu_op_desc.SetAttr("act_type", act_type);
conv2d_xpu_op_desc.SetAttr(
"padding_algorithm",
conv->Op()->GetAttrIfExists<std::string>("padding_algorithm"));
conv2d_xpu_op_desc.SetAttr(
"output_size",
conv->Op()->GetAttrIfExists<std::vector<int>>("output_size"));
conv2d_xpu_op_desc.SetAttr(
"output_padding",
conv->Op()->GetAttrIfExists<std::vector<int>>("output_padding"));
conv2d_xpu_op_desc.SetAttr(
"dilations",
PADDLE_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations")));
conv2d_xpu_op_desc.SetAttr(
"paddings",
PADDLE_GET_CONST(std::vector<int>, conv->Op()->GetAttr("paddings")));
conv2d_xpu_op_desc.SetAttr(
"groups", PADDLE_GET_CONST(int, conv->Op()->GetAttr("groups")));
conv2d_xpu_op_desc.SetAttr(
"strides",
PADDLE_GET_CONST(std::vector<int>, conv->Op()->GetAttr("strides")));
conv2d_xpu_op_desc.SetAttr(
"data_format", conv->Op()->GetAttrIfExists<std::string>("data_format"));
auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc);
IR_NODE_LINK_TO(input, conv2d_xpu);
IR_NODE_LINK_TO(filter_int16, conv2d_xpu);
IR_NODE_LINK_TO(filter_max, conv2d_xpu);
if (has_bias) {
SAFE_IR_NODE_LINK_TO(fusion_bias_node, conv2d_xpu);
}
if (act_out) {
IR_NODE_LINK_TO(conv2d_xpu, act_out);
} else if (bn_out) {
IR_NODE_LINK_TO(conv2d_xpu, bn_out);
} else if (ew_bias_add_out) {
IR_NODE_LINK_TO(conv2d_xpu, ew_bias_add_out);
} else {
IR_NODE_LINK_TO(conv2d_xpu, conv_out);
}
IR_NODE_LINK_TO(conv2d_xpu, conv2d_xpu_out_max);
// delete useless node
std::unordered_set<const Node*> delete_nodes = {conv};
if (act != nullptr) {
delete_nodes.insert(act);
}
if (bn != nullptr) {
delete_nodes.insert(bn);
delete_nodes.insert(bn_bias);
delete_nodes.insert(bn_var);
delete_nodes.insert(bn_mean);
delete_nodes.insert(bn_scale);
delete_nodes.insert(bn_var_out);
delete_nodes.insert(bn_mean_out);
delete_nodes.insert(bn_saved_var);
delete_nodes.insert(bn_saved_mean);
}
if (ew_bias_add) {
delete_nodes.insert(ew_bias_add);
delete_nodes.insert(ew_bias_add_y);
}
GraphSafeRemoveNodes(graph, delete_nodes);
found_subgraph_count++;
};
gpd(graph, handler);
return found_subgraph_count;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv2d_transpose_xpu_fuse_pass,
paddle::framework::ir::Conv2dTransposeXPUFusePass);
REGISTER_PASS_CAPABILITY(conv2d_transpose_xpu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"conv2d_transpose_xpu", 0));
...@@ -540,6 +540,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { ...@@ -540,6 +540,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
"redundant_squeeze_unsqueeze_elimination_pass", "redundant_squeeze_unsqueeze_elimination_pass",
"fc_xpu_fuse_pass", "fc_xpu_fuse_pass",
"conv2d_xpu_fuse_pass", "conv2d_xpu_fuse_pass",
"conv2d_transpose_xpu_fuse_pass",
"add_activation_xpu_fuse_pass", "add_activation_xpu_fuse_pass",
"yolo_box_xpu_fuse_pass", "yolo_box_xpu_fuse_pass",
"link_xpu_op_max_pass", "link_xpu_op_max_pass",
......
...@@ -14,6 +14,16 @@ ...@@ -14,6 +14,16 @@
data_type : x data_type : x
optional : x_max, y_max optional : x_max, y_max
- op : conv2d_transpose_xpu
args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format, bool has_bias, bool with_act, str act_type)
output : Tensor(out), Tensor(out_max)
infer_meta :
func : Conv2dTransposeXPUInferMeta
kernel :
func : conv2d_transpose_xpu
data_type : x
optional : bias, x_max
- op : conv2d_xpu - op : conv2d_xpu
args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, Tensor branch_max, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, bool has_bias, bool has_branch, int act_type, float act_param) args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, Tensor branch_max, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, bool has_bias, bool has_branch, int act_type, float act_param)
output : Tensor(out), Tensor(out_max) output : Tensor(out), Tensor(out_max)
......
...@@ -171,6 +171,8 @@ XPUOpMap& get_kl2_ops() { ...@@ -171,6 +171,8 @@ XPUOpMap& get_kl2_ops() {
{"conv2d_transpose_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"conv2d_transpose_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"conv2d_transpose", {"conv2d_transpose",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"conv2d_transpose_xpu",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"cumsum", {"cumsum",
XPUKernelSet({phi::DataType::FLOAT32, XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16, phi::DataType::FLOAT16,
......
...@@ -518,4 +518,172 @@ void YoloBoxXPUInferMeta(const MetaTensor& x, ...@@ -518,4 +518,172 @@ void YoloBoxXPUInferMeta(const MetaTensor& x,
out_max->set_layout(x.layout()); out_max->set_layout(x.layout());
} }
void ConvTransposeXPUInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const std::vector<int>& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
MetaTensor* out,
MetaTensor* out_max) {
auto x_dims = x.dims();
auto filter_dims = filter.dims();
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ = dilations;
PADDLE_ENFORCE_EQ(
x_dims.size() == 4,
true,
errors::InvalidArgument("Input of Op(conv_transpose) should be 4-D "
"Tensor. But received: %u-D Tensor, "
"the shape of input is [%s]",
x_dims.size(),
x_dims));
PADDLE_ENFORCE_EQ(
x_dims.size(),
filter_dims.size(),
errors::InvalidArgument(
"The input's dimension size and filter's dimension size of "
"Op (conv_transpose) should be equal. But received: the shape of "
"input is [%s], the dimension size of input is [%d], the shape "
"of filter is [%s], the dimension size of filter is [%d]. ",
x_dims,
x_dims.size(),
filter_dims,
filter_dims.size()));
int stride_size = strides.size();
for (int i = 0; i < stride_size; ++i) {
PADDLE_ENFORCE_GT(
strides[i],
0,
errors::InvalidArgument(
"The stride of Op(Conv) should be larget than 0, but received "
"stride is %d.",
strides[i]));
}
int in_sub_stride_size = x_dims.size() - stride_size;
PADDLE_ENFORCE_EQ(
x_dims.size() - strides.size(),
2U,
errors::InvalidArgument(
"The input's dimension size minus Attr(stride)'s size must "
"be euqal to 2 for Op(conv_transpose). But received: [%d], the "
"input's dimension size is [%d], the shape of input "
"is [%s], the Attr(stride)'s size is [%d].",
in_sub_stride_size,
x_dims.size(),
x_dims,
strides.size()));
if (output_size.size())
PADDLE_ENFORCE_EQ(
output_size.size(),
strides.size(),
errors::InvalidArgument(
"The Attr(output_size) and Attr(stride) of Op(conv_transpose) "
"should be the same."));
if (output_padding.size())
PADDLE_ENFORCE_EQ(
output_padding.size(),
strides.size(),
errors::InvalidArgument(
"The Attr(output_padding) and Attr(stride) of Op(conv_transpose) "
"should be the same."));
const int64_t C =
(data_format != "NHWC" ? x_dims[1] : x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(
C,
filter_dims[0],
errors::InvalidArgument(
"The number of input channels should be equal to filter channels "
"for Op(conv_transpose). But received: the input's channels is "
"[%d], the shape of input is [%s], the filter's channels is [%d], "
"the shape of filter is [%s]. The data_format is %s."
"The error may come from wrong data_format setting.",
C,
x_dims,
filter_dims[0],
filter_dims,
data_format));
DDim x_data_dims;
if (data_format != "NHWC") {
x_data_dims = slice_ddim(x_dims, 2, x_dims.size());
} else {
x_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize);
std::vector<int64_t> output_shape({x_dims[0]});
if (data_format != "NHWC") {
output_shape.push_back(filter_dims[1] * groups);
}
const int offset = (data_format != "NHWC" ? 2 : 1);
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations_[i] * (filter_dims[i + 2] - 1) + 1;
auto infer_shape = (x_dims[i + offset] > 0)
? (x_dims[i + offset] - 1) * strides[i] -
paddings_[2 * i] - paddings_[2 * i + 1] +
filter_extent
: -1;
if (output_size.size()) {
output_shape.push_back(output_size[i]);
} else if (output_padding.size()) {
output_shape.push_back((infer_shape + output_padding[i]));
} else {
output_shape.push_back(infer_shape);
}
}
if (data_format == "NHWC") {
output_shape.push_back(filter_dims[1] * groups);
}
out->set_dims(make_ddim(output_shape));
out->set_dtype(x.dtype());
out_max->set_dims(phi::make_ddim({6}));
}
void Conv2dTransposeXPUInferMeta(const MetaTensor& x,
const MetaTensor& x_max,
const MetaTensor& filter,
const MetaTensor& filter_max,
const MetaTensor& bias,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const IntArray& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool has_bias,
bool with_act,
const std::string& act_type,
MetaTensor* out,
MetaTensor* out_max) {
std::vector<int32_t> vec_output_size(output_size.GetData().begin(),
output_size.GetData().end());
ConvTransposeXPUInferMeta(x,
filter,
strides,
paddings,
output_padding,
vec_output_size,
padding_algorithm,
groups,
dilations,
data_format,
out,
out_max);
}
} // namespace phi } // namespace phi
...@@ -145,4 +145,22 @@ void YoloBoxXPUInferMeta(const MetaTensor& x, ...@@ -145,4 +145,22 @@ void YoloBoxXPUInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaTensor* out_max); MetaTensor* out_max);
void Conv2dTransposeXPUInferMeta(const MetaTensor& x,
const MetaTensor& x_max,
const MetaTensor& filter,
const MetaTensor& filter_max,
const MetaTensor& bias,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const IntArray& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool has_bias,
bool with_act,
const std::string& act_type,
MetaTensor* out,
MetaTensor* out_max);
} // namespace phi } // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h"
namespace phi {
namespace fusion {
template <typename T, typename Context>
void Conv2dTransposeXPUKernel(const Context& ctx,
const DenseTensor& x,
const paddle::optional<DenseTensor>& x_max,
const DenseTensor& filter,
const DenseTensor& filter_max,
const paddle::optional<DenseTensor>& bias,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const IntArray& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool has_bias,
bool with_act,
const std::string& act_type,
DenseTensor* out,
DenseTensor* out_max) {
using XPUT = typename XPUTypeTrait<T>::Type;
// The filter will be reshaped in the calculations,
// so here use an assignment operation,
// that avoids modifying the variable in the Scope.
DenseTensor filter_ = filter;
ctx.template Alloc<T>(out);
ctx.template Alloc<float>(out_max);
bool is_nchw;
is_nchw = (data_format == "NHWC") ? false : true;
DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size()); // hw
DDim filter_data_dims = slice_ddim(filter_.dims(), 2, filter_.dims().size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ = dilations;
UpdatePaddingAndDilation(
&paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(x.dims()[0]);
const int img_yc = static_cast<int>(x.dims()[1]);
const int img_xc = static_cast<int>(out->dims()[1]);
const int img_xh = static_cast<int>(out->dims()[2]);
const int img_xw = static_cast<int>(out->dims()[3]);
auto act = xpu::Activation_t::LINEAR;
if (with_act) {
if (act_type == "relu") {
act = xpu::Activation_t::RELU;
}
}
auto bias_data =
bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data<float>();
auto x_max_data =
x_max.get_ptr() == nullptr ? nullptr : x_max.get_ptr()->data<float>();
auto filter_max_data = filter_max.data<float>();
int r = xpu::conv2d_transpose_fusion_v2<XPUT, int16_t, XPUT, int16_t>(
ctx.x_context(),
reinterpret_cast<const XPUT*>(x.data<T>()),
filter_.data<int16_t>(),
reinterpret_cast<XPUT*>(out->data<T>()),
batch_size,
img_yc,
img_xh,
img_xw,
img_xc,
ksize,
strides,
paddings_,
dilations_,
groups,
x_max_data,
filter_max_data,
out_max->data<float>(),
bias_data,
act,
is_nchw);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_fusion_v2");
}
} // namespace fusion
} // namespace phi
PD_REGISTER_KERNEL(conv2d_transpose_xpu,
XPU,
ALL_LAYOUT,
phi::fusion::Conv2dTransposeXPUKernel,
float,
phi::dtype::float16) {}
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from functools import partial
import hypothesis.strategies as st
import numpy as np
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestConvTransposeXPUFusePass(PassAutoScanTest):
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_xpu=True)
yield config, ["conv2d_transpose_xpu"], (3e-3, 3e-3)
def sample_program_config(self, draw):
x_shape = draw(
st.lists(
st.integers(min_value=4, max_value=16), min_size=4, max_size=4
)
)
oc = draw(st.integers(min_value=2, max_value=16))
weight_shape = [x_shape[1], oc, 4, 4]
y_shape = [oc]
has_bn = draw(st.booleans())
has_add = draw(st.booleans())
has_relu = draw(st.booleans())
def generate_data(shape):
return 0.1 * np.random.random(shape).astype(np.float32)
deconv_op = OpConfig(
"conv2d_transpose",
inputs={"Input": ["input_x"], "Filter": ["weight_x"]},
outputs={"Output": ["output_x"]},
data_format="NCHW",
dilations=[1, 1],
groups=1,
paddings=[0, 0],
padding_algorithm="EXPLICIT",
strides=[4, 4],
fuse_relu=False,
)
input_name_op = "output_x"
ops = [deconv_op]
if has_add:
add_op = OpConfig(
"elementwise_add",
inputs={"X": [input_name_op], "Y": ["bias"]},
outputs={"Out": ["add_out"]},
axis=1,
)
input_name_op = "add_out"
ops.append(add_op)
if has_bn:
bn_op = OpConfig(
"batch_norm",
inputs={
"X": [input_name_op],
"Bias": ["bn_bias"],
"Mean": ["bn_mean"],
"Scale": ["bn_scale"],
"Variance": ["bn_var"],
},
outputs={
"Y": ["bn_y"],
"MeanOut": ["bn_mean"],
"SavedMean": ["bn_mean_save"],
"SavedVariance": ["bn_save_var"],
"VarianceOut": ["bn_var"],
},
data_layout="NCHW",
epsilon=0.000009999999747378752,
momentum=0.89999,
is_test=True,
use_global_stats=True,
)
input_name_op = "bn_y"
ops.append(bn_op)
if has_relu:
relu_op = OpConfig(
"relu",
inputs={"X": [input_name_op]},
outputs={"Out": ["relu_out"]},
)
input_name_op = "relu_out"
ops.append(relu_op)
program_config = ProgramConfig(
ops=ops,
weights={
"weight_x": TensorConfig(
data_gen=partial(generate_data, weight_shape)
),
"bias": TensorConfig(data_gen=partial(generate_data, y_shape)),
"bn_bias": TensorConfig(
data_gen=partial(generate_data, y_shape)
),
"bn_mean": TensorConfig(
data_gen=partial(generate_data, y_shape)
),
"bn_scale": TensorConfig(
data_gen=partial(generate_data, y_shape)
),
"bn_var": TensorConfig(
data_gen=partial(generate_data, y_shape)
),
},
inputs={
"input_x": TensorConfig(
data_gen=partial(generate_data, x_shape)
),
},
outputs=[input_name_op],
)
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=100,
passes=["conv2d_transpose_xpu_fuse_pass"],
)
if __name__ == "__main__":
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册