Unverified commit 0a2dfa38, authored by zyfncg, committed by GitHub

Clear extra input (Bias, ResidualData) in OpMaker of conv2d (#47579)

* delete Bias and ResidualData in OpMaker of conv2d

* delete extra input of conv3d

* refactor pass of conv_bias_fusion

* fix mkldnn dependency

* fix mkldnn compile

* fix test_conv_bias_mkldnn_fuse_pass

* polish some code

* remove useless log

* fix analyzer_vit_ocr_tester

* fix conv_activation_mkldnn_fuse_pass

* fix test_analyzer_ocr

* add fused_conv_sig

* fix performance regression

* fix performance regression
Parent 888631b6
@@ -713,7 +713,7 @@ static void GetGraphOpDesc(const std::vector<Node *> &nodes,
UpdateControlOpSkipEagerDeletionVars(*n, graph, graph_idx, n->Name());
}
ops->emplace_back(*n->Op());
- VLOG(4) << n->ToString();
+ VLOG(5) << n->ToString();
}
// delete no OpDesc op
}
......
@@ -2068,8 +2068,9 @@ PDNode *patterns::Flatten2Matmul::operator()() {
return matmul_out;
}
- PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
- auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+ PDNode *patterns::ConvResidual::operator()(const std::string &conv_type,
+                                            bool with_residual_data) {
+ auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op(conv_type);
if (!with_residual_data) {
conv_op->assert_more([&](Node *x) {
@@ -2082,22 +2083,22 @@ PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
auto input_var = pattern->NewNode(conv_input_repr())
->AsInput()
- ->assert_is_op_input("conv2d", "Input");
+ ->assert_is_op_input(conv_type, "Input");
auto filter_var = pattern->NewNode(conv_filter_repr())
->AsInput()
- ->assert_is_op_input("conv2d", "Filter");
+ ->assert_is_op_input(conv_type, "Filter");
auto output_var = pattern->NewNode(conv_output_repr())
->AsOutput()
- ->assert_is_op_output("conv2d", "Output");
+ ->assert_is_op_output(conv_type, "Output");
std::vector<PDNode *> links_from{input_var, filter_var};
if (with_residual_data) {
auto res_conn_var = pattern->NewNode(conv_residual_data_repr())
->AsInput()
- ->assert_is_op_input("conv2d", "ResidualData");
+ ->assert_is_op_input(conv_type, "ResidualData");
links_from.push_back(res_conn_var);
}
......
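For reference, the new conv_type argument lets the same pattern match either the plain or the fused conv op; the call sites updated later in this diff (cpu_quantize_pass.cc and params_quantization_mkldnn_pass.cc) now take the shape of this short excerpt:

patterns::ConvResidual conv_pattern{pattern, name_scope_};
conv_pattern(conv_type, with_residual_data);  // conv_type is "conv2d" or "fused_conv2d" at these call sites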
@@ -1057,7 +1057,7 @@ struct ConvResidual : public PatternBase {
ConvResidual(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_residual") {}
- PDNode* operator()(bool with_residual_data);
+ PDNode* operator()(const std::string& conv_type, bool with_residual_data);
PATTERN_DECL_NODE(conv_op);
PATTERN_DECL_NODE(conv_input);
......
@@ -319,7 +319,7 @@ void ComputePropagateScalesMkldnnPass::ComputeWeightScales(
ir::Graph* graph, Scope* scope, StringPairMap* var_quant_scales) const {
ComputeVarScales(graph,
scope,
- {"conv2d", "depthwise_conv2d"},
+ {"conv2d", "depthwise_conv2d", "fused_conv2d"},
"Filter",
1,
var_quant_scales);
@@ -446,7 +446,7 @@ void ComputePropagateScalesMkldnnPass::UpdateReluOutputScales(
if (op->Type() == "relu") {
is_unsigned = true;
} else {
- if (op->Type() == "conv2d") {
+ if (op->Type() == "conv2d" || op->Type() == "fused_conv2d") {
act_name = "fuse_activation";
output_name = "Output";
} else if (op->Type() == "fc") {
......
@@ -26,7 +26,7 @@ using string::PrettyLogDetail;
void ConvActivationMkldnnFusePass::ApplyImpl(Graph* graph) const {
auto act_types = phi::funcs::GetSupportedActivations();
- std::vector<std::string> conv_types = {"conv2d"};
+ std::vector<std::string> conv_types = {"conv2d", "fused_conv2d"};
for (auto& act_type : act_types) {
FuseConvConcatAct(graph, act_type);
@@ -218,6 +218,45 @@ ConvActivationMkldnnFusePass::ConvActivationMkldnnFusePass() {
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();
AddOpCompat(OpCompat("fused_conv2d"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("Filter")
.IsTensor()
.End()
.AddInput("Bias")
.IsOptional()
.IsTensor()
.End()
.AddInput("ResidualData")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Output")
.IsTensor()
.End()
.AddAttr("strides")
.IsType<std::vector<int>>()
.End()
.AddAttr("paddings")
.IsType<std::vector<int>>()
.End()
.AddAttr("padding_algorithm")
.IsOptional()
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("groups")
.IsNumGE(1)
.End()
.AddAttr("dilations")
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsOptional()
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();
AddOpCompat(OpCompat("concat")) AddOpCompat(OpCompat("concat"))
.AddInput("X") .AddInput("X")
.End() .End()
......
@@ -61,6 +61,40 @@ ConvBiasFusePass::ConvBiasFusePass() {
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();
AddOpCompat(OpCompat("fused_conv2d"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("Filter")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Output")
.IsTensor()
.End()
.AddAttr("strides")
.IsType<std::vector<int>>()
.End()
.AddAttr("paddings")
.IsType<std::vector<int>>()
.End()
.AddAttr("padding_algorithm")
.IsOptional()
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("groups")
.IsNumGE(1)
.End()
.AddAttr("dilations")
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();
AddOpCompat(OpCompat("elementwise_add")) AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X") .AddInput("X")
.IsTensor() .IsTensor()
...@@ -165,6 +199,40 @@ Conv3DBiasFusePass::Conv3DBiasFusePass() { ...@@ -165,6 +199,40 @@ Conv3DBiasFusePass::Conv3DBiasFusePass() {
.IsStringIn({"NDHWC", "NCDHW"}) .IsStringIn({"NDHWC", "NCDHW"})
.End(); .End();
AddOpCompat(OpCompat("fused_conv3d"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("Filter")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.IsOptional()
.End()
.AddOutput("Output")
.IsTensor()
.End()
.AddAttr("strides")
.IsType<std::vector<int>>()
.End()
.AddAttr("paddings")
.IsType<std::vector<int>>()
.End()
.AddAttr("padding_algorithm")
.IsOptional()
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("groups")
.IsNumGE(1)
.End()
.AddAttr("dilations")
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();
AddOpCompat(OpCompat("elementwise_add")) AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X") .AddInput("X")
.IsTensor() .IsTensor()
...@@ -203,6 +271,16 @@ phi::DenseTensor tensor_apply_eltwise(const phi::DenseTensor& vec_a, ...@@ -203,6 +271,16 @@ phi::DenseTensor tensor_apply_eltwise(const phi::DenseTensor& vec_a,
} }
void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
FuseConvBias(graph, type(), fused_type());
if (type() != fused_type()) {
// Is the second pass useful?
FuseConvBias(graph, fused_type(), fused_type());
}
}
void ConvBiasFusePass::FuseConvBias(ir::Graph* graph,
const std::string& conv_type,
const std::string& fused_conv) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init(name_scope_, graph);
@@ -216,9 +294,9 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
gpd.mutable_pattern()
->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
->AsInput()
- ->assert_is_op_input(type(), "Input");
+ ->assert_is_op_input(conv_type, "Input");
patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
- conv_bias_pattern(conv_input, type());
+ conv_bias_pattern(conv_input, conv_type);
int found_conv_bias_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
@@ -249,7 +327,7 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
// check if fuse can be done and if MKL-DNN should be used
FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
- VLOG(3) << "do not perform " + type() + "+bias fuse";
+ VLOG(3) << "do not perform " + conv_type + "+bias fuse";
return;
}
@@ -294,7 +372,7 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
- desc.SetType(type());
+ desc.SetType(fused_conv);
for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second);
@@ -323,6 +401,7 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
type());
}
}
}  // namespace ir
}  // namespace framework
}  // namespace paddle
......
@@ -32,11 +32,17 @@ class ConvBiasFusePass : public FusePassBase {
ConvBiasFusePass();
virtual ~ConvBiasFusePass() {}
virtual std::string type() const { return "conv2d"; }
virtual std::string fused_type() const { return "fused_conv2d"; }
protected:
void ApplyImpl(ir::Graph* graph) const override;
void FuseConvBias(ir::Graph* graph,
const std::string& conv_type,
const std::string& fused_conv) const;
const std::string name_scope_{"conv_bias_mkldnn_fuse"};
};
/*
* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp.
*/
@@ -44,12 +50,14 @@ class Conv2DTransposeBiasFusePass : public ConvBiasFusePass {
public:
Conv2DTransposeBiasFusePass();
std::string type() const override { return "conv2d_transpose"; }
std::string fused_type() const override { return "conv2d_transpose"; }
};
class Conv3DBiasFusePass : public ConvBiasFusePass {
public:
Conv3DBiasFusePass();
std::string type() const override { return "conv3d"; }
std::string fused_type() const override { return "fused_conv3d"; }
};
}  // namespace ir
}  // namespace framework
......
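Taken together, the .cc and .h changes above mean ConvBiasFusePass::ApplyImpl now runs the fusion twice: once rewriting type() + elementwise_add into fused_type(), and, when the two names differ, once more over ops that already carry fused_type(), presumably so a further elementwise_add behind an already-fused op can still be folded in (the in-code comment itself asks whether the second pass is needed, and Conv2DTransposeBiasFusePass keeps type() == fused_type(), so it skips it). A minimal, self-contained sketch of that dispatch, using illustrative class names that are not part of Paddle:

#include <iostream>
#include <string>

class BiasFusePassSketch {
 public:
  virtual ~BiasFusePassSketch() = default;
  virtual std::string type() const { return "conv2d"; }
  virtual std::string fused_type() const { return "fused_conv2d"; }

  void Apply() const {
    // First pass: rewrite type() + elementwise_add into fused_type().
    FuseConvBias(type(), fused_type());
    if (type() != fused_type()) {
      // Second pass: fold a further elementwise_add into ops that already
      // carry the fused type (e.g. those produced by the first pass).
      FuseConvBias(fused_type(), fused_type());
    }
  }

 private:
  void FuseConvBias(const std::string& conv_type,
                    const std::string& fused_conv) const {
    std::cout << "fuse " << conv_type << " + elementwise_add -> " << fused_conv
              << "\n";
  }
};

class Conv3DBiasFusePassSketch : public BiasFusePassSketch {
 public:
  std::string type() const override { return "conv3d"; }
  std::string fused_type() const override { return "fused_conv3d"; }
};

int main() {
  BiasFusePassSketch{}.Apply();        // conv2d pass, then fused_conv2d again
  Conv3DBiasFusePassSketch{}.Apply();  // conv3d pass, then fused_conv3d again
  return 0;
}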
@@ -139,7 +139,8 @@ void MainTest(bool convWithExistingBias) {
int conv_bias_count = 0;
for (auto* node : graph->Nodes()) {
- if (node->IsOp() && node->Op()->Type() == "conv2d") {
+ if (node->IsOp() && (node->Op()->Type() == "conv2d" ||
+                      node->Op()->Type() == "fused_conv2d")) {
auto* op = node->Op();
ASSERT_TRUE(op->HasAttr("use_mkldnn"));
EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn")));
......
@@ -388,11 +388,12 @@ void CPUQuantizePass::GetQuantInfo(Graph* graph) const {
}
void CPUQuantizePass::QuantizeConv(Graph* graph,
const std::string& conv_type,
bool with_residual_data) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::ConvResidual conv_pattern{pattern, name_scope_};
- conv_pattern(with_residual_data);
+ conv_pattern(conv_type, with_residual_data);
int quantize_conv_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -510,7 +511,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
AddStatis(quantize_conv_count);
LogQuantizedOpsCounter(
- "conv2d",
+ conv_type,
quantize_conv_count,
((with_residual_data) ? "with residual connection" : ""));
}
@@ -1247,8 +1248,10 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
platform::errors::InvalidArgument("Scope cannot be nullptr."));
GetQuantInfo(graph);
- QuantizeConv(graph, false /* with_residual_data */);
- QuantizeConv(graph, true /* with_residual_data */);
+ QuantizeConv(graph, "conv2d", false /* with_residual_data */);
+ QuantizeConv(graph, "conv2d", true /* with_residual_data */);
+ QuantizeConv(graph, "fused_conv2d", false /* with_residual_data */);
+ QuantizeConv(graph, "fused_conv2d", true /* with_residual_data */);
QuantizePool(graph);
QuantizeConcat(graph);
QuantizePriorBox(graph);
......
@@ -49,7 +49,9 @@ class CPUQuantizePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
- void QuantizeConv(Graph* graph, bool with_residual_data) const;
+ void QuantizeConv(Graph* graph,
+                   const std::string& conv_type,
+                   bool with_residual_data) const;
void QuantizeFc(Graph* graph, bool with_residual_data) const;
void QuantizePool(Graph* graph) const;
void QuantizeConcat(Graph* graph) const;
......
@@ -25,25 +25,14 @@ class Graph;
void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
VLOG(3) << "Marks operators which are to be quantized.";
std::unordered_set<std::string> supported_op_types =
- std::unordered_set<std::string>({"concat",
-                                  "conv2d",
-                                  "depthwise_conv2d",
-                                  "elementwise_add",
-                                  "elementwise_mul",
-                                  "elementwise_sub",
-                                  "fc",
-                                  "matmul",
-                                  "nearest_interp",
-                                  "nearest_interp_v2",
-                                  "pool2d",
-                                  "prior_box",
-                                  "reshape2",
-                                  "transpose2",
-                                  "fusion_gru",
-                                  "fusion_lstm",
-                                  "multi_gru",
-                                  "slice",
-                                  "split"});
+ std::unordered_set<std::string>(
+     {"concat", "conv2d", "depthwise_conv2d",
+      "fused_conv2d", "fused_conv3d", "elementwise_add",
+      "elementwise_mul", "elementwise_sub", "fc",
+      "matmul", "nearest_interp", "nearest_interp_v2",
+      "pool2d", "prior_box", "reshape2",
+      "transpose2", "fusion_gru", "fusion_lstm",
+      "multi_gru", "slice", "split"});
const auto& excluded_ids_list =
Get<std::unordered_set<int>>("quantize_excluded_op_ids");
const auto& op_types_list =
@@ -71,7 +60,6 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(op, op, quantize_placement_pattern);
if (std::find(excluded_ids_list.begin(),
excluded_ids_list.end(),
op->id()) != excluded_ids_list.end()) {
@@ -81,7 +69,6 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const {
if (op->Op()->GetAttrIfExists<int>("skip_quant") == 1) {
return;
}
op->Op()->SetAttr("mkldnn_data_type", std::string("int8"));
};
gpd(graph, handler);
......
@@ -120,7 +120,7 @@ void ParamsQuantizationMkldnnPass::QuantizeConv(ir::Graph* graph,
bool with_residual_data) const {
GraphPatternDetector gpd;
patterns::ConvResidual conv_pattern(gpd.mutable_pattern(), name_scope_);
- conv_pattern(with_residual_data);
+ conv_pattern("conv2d", with_residual_data);
int params_to_int8_conv_found = 0;
......
@@ -146,7 +146,7 @@ class Pass {
}
attrs_[attr_name] = attr;
attr_dels_[attr_name] = [attr, attr_name]() {
- VLOG(3) << "deleting " << attr_name;
+ VLOG(8) << "deleting " << attr_name;
delete attr;
};
}
......
@@ -979,7 +979,7 @@ struct SetAttrDescVisitor {
};
void OpDesc::Flush() {
- VLOG(4) << "Flush "
+ VLOG(8) << "Flush "
<< " " << Type() << " " << need_update_;
if (need_update_) {
this->desc_.mutable_inputs()->Clear();
......
@@ -26,6 +26,12 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL;
rules_["conv2d"]["Output"] = ScaleAlgo::KL;
rules_["fused_conv2d"]["Input"] = ScaleAlgo::KL;
rules_["fused_conv2d"]["Filter"] = ScaleAlgo::MAX_CH;
rules_["fused_conv2d"]["Bias"] = ScaleAlgo::NONE;  // do not compute scale
rules_["fused_conv2d"]["ResidualData"] = ScaleAlgo::KL;
rules_["fused_conv2d"]["Output"] = ScaleAlgo::KL;
rules_["pool2d"]["X"] = ScaleAlgo::KL;
rules_["pool2d"]["Out"] = ScaleAlgo::KL;
......
@@ -1172,6 +1172,7 @@ struct PD_INFER_DECL AnalysisConfig {
"concat",
"conv2d",
"depthwise_conv2d",
"fused_conv2d",
"elementwise_add",
"elementwise_mul",
"fc",
......
@@ -103,7 +103,7 @@ TEST(Analyzer_vit_ocr, fuse_status) {
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
CHECK_EQ(fuse_statis.at("fc_mkldnn_pass"), 33);
- CHECK_EQ(fuse_statis.at("conv2d_gelu_mkldnn_fuse_pass"), 2);
+ CHECK_EQ(fuse_statis.at("fused_conv2d_gelu_mkldnn_fuse_pass"), 2);
CHECK_EQ(fuse_statis.at("fc_elementwise_add_mkldnn_fuse"), 16);
}
#endif
......
type: "fused_conv2d"
def {
inputs {
name: "Input"
}
inputs {
name: "Filter"
}
inputs {
name: "Bias"
}
inputs {
name: "ResidualData"
}
outputs {
name: "Output"
}
attrs {
name: "strides"
type: INTS
}
attrs {
name: "paddings"
type: INTS
}
attrs {
name: "padding_algorithm"
type: STRING
}
attrs {
name: "groups"
type: INT
}
attrs {
name: "dilations"
type: INTS
}
attrs {
name: "data_format"
type: STRING
}
attrs {
name: "fuse_activation"
type: STRING
}
attrs {
name: "fuse_residual_connection"
type: BOOLEAN
}
attrs {
name: "force_fp32_output"
type: BOOLEAN
}
}
type: "fused_conv3d"
def {
inputs {
name: "Input"
}
inputs {
name: "Filter"
}
inputs {
name: "Bias"
}
inputs {
name: "ResidualData"
}
outputs {
name: "Output"
}
attrs {
name: "strides"
type: INTS
}
attrs {
name: "paddings"
type: INTS
}
attrs {
name: "padding_algorithm"
type: STRING
}
attrs {
name: "groups"
type: INT
}
attrs {
name: "dilations"
type: INTS
}
attrs {
name: "data_format"
type: STRING
}
attrs {
name: "fuse_activation"
type: STRING
}
attrs {
name: "fuse_residual_connection"
type: BOOLEAN
}
attrs {
name: "force_fp32_output"
type: BOOLEAN
}
}
@@ -364,12 +364,6 @@ void Conv3DOpMaker::Make() {
"is the width of the filter."
"If the groups attribute is greater than 1, C equals the number of "
"input image channels divided by the groups.");
AddInput("ResidualData",
"(Tensor) Tensor with residual data "
"to which convolution output will be added."
"Used with fuse_residual_connection fusion.")
.AsDispensable()
.AsExtra();
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution operator." "(Tensor) The output tensor of convolution operator."
"It has same data fromat and data type as the Input."); "It has same data fromat and data type as the Input.");
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/operators/conv_op.h"
namespace paddle {
namespace operators {
class FusedConvOpMaker : public Conv2DOpMaker {
protected:
void Apply() override {
AddAttr<std::string>(
"mkldnn_data_type",
"(string, default \"float32\"). Data type of mkldnn kernel")
.SetDefault("float32")
.InEnum({"float32", "int8", "bfloat16"});
AddAttr<std::string>("fuse_activation",
"(string, default \"\") Only used in mkldnn kernel")
.SetDefault("");
AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual "
"connection.")
.SetDefault(false);
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddAttr<bool>("use_mkldnn", "(bool, default false) Used in mkldnn kernel")
.SetDefault(true);
AddComment(R"DOC(
Convolution Operator.
The convolution operation calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
Input(Input) and Output(Output) are in NCHW or NHWC format. Where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature.
Filters(Input) is MCHW format format. Where M is the number of output image channels, C is
the number of input image channels, H is the height of the filter, and W
is the width of the filter.
Parameters(strides, paddings, dilations) are two elements. These two elements represent
height and width, respectively.
The input(X) size and output(Out) size may be different.
Example:
Input:
Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output:
Output shape: $(N, C_{out}, H_{out}, W_{out})$
Where
$$
H_{out}= \frac{(H_{in} + pad_height_top + pad_height_bottom - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
W_{out}= \frac{(W_{in} + pad_width_left + pad_width_right - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// fused_conv2d is only used for onednn inference.
REGISTER_OPERATOR(
fused_conv2d,
ops::ConvOp,
ops::FusedConvOpMaker,
ops::ConvOpInferVarType,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
// fused_conv3d is only used for onednn inference.
REGISTER_OPERATOR(
fused_conv3d,
ops::ConvOp,
ops::FusedConvOpMaker,
ops::ConvOpInferVarType,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
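The AddComment block above documents the output-shape formula inherited from the Conv2D maker; a small worked example of that arithmetic follows (sample values chosen here for illustration, not taken from any test in this PR):

#include <iostream>

int main() {
  // Sample 224x224 input, 3x3 filter, padding 1 on each side, stride 2,
  // dilation 1, plugged into the H_out/W_out formula from the DOC string above.
  int H_in = 224, W_in = 224;
  int H_f = 3, W_f = 3;
  int pad_top = 1, pad_bottom = 1, pad_left = 1, pad_right = 1;
  int dilation = 1;
  int stride = 2;

  int H_out = (H_in + pad_top + pad_bottom - (dilation * (H_f - 1) + 1)) / stride + 1;
  int W_out = (W_in + pad_left + pad_right - (dilation * (W_f - 1) + 1)) / stride + 1;

  // (224 + 2 - 3) / 2 + 1 = 112 in both dimensions.
  std::cout << "H_out = " << H_out << ", W_out = " << W_out << std::endl;
  return 0;
}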
@@ -1197,7 +1197,6 @@ All parameter, weight, gradient are variables in Paddle.
-> const paddle::framework::AttributeMap & {
return operators::ExtraInfoUtils::Instance().GetExtraAttrsMap(op_type);
});
m.def(
"get_attrtibute_type",
[](const std::string &op_type,
......
@@ -118,6 +118,7 @@ if(WITH_MKLDNN)
"strings/cpu/*.cc"
"onednn/*.cc"
"fusion/*.cc"
"fusion/onednn/*.cc"
"fusion/cpu/*.cc")
else()
file(
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/onednn/conv_function.h"
namespace phi {
template <typename T, typename Context>
void FusedConv2DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const paddle::optional<DenseTensor>& bias,
const paddle::optional<DenseTensor>& residual_param,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
const std::string& mkldnn_data_type,
const std::string& fuse_activation,
bool fuse_residual_conn,
bool force_fp32_output,
DenseTensor* out) {
bool is_BFLOAT16 = mkldnn_data_type == "bfloat16";
ConvOnednn<T>(dev_ctx,
&input,
&filter,
bias.get_ptr(),
residual_param.get_ptr(),
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
true,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
out);
}
template <typename T, typename Context>
void FusedDepthwiseConv2DKernel(
const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const paddle::optional<DenseTensor>& bias,
const paddle::optional<DenseTensor>& residual_param,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
const std::string& mkldnn_data_type,
const std::string& fuse_activation,
bool fuse_residual_conn,
bool force_fp32_output,
DenseTensor* out) {
bool is_BFLOAT16 = mkldnn_data_type == "bfloat16";
ConvOnednn<T>(dev_ctx,
&input,
&filter,
bias.get_ptr(),
residual_param.get_ptr(),
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
true,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
out);
}
template <typename T, typename Context>
void FusedConv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const paddle::optional<DenseTensor>& bias,
const paddle::optional<DenseTensor>& residual_param,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
const std::string& mkldnn_data_type,
const std::string& fuse_activation,
bool fuse_residual_conn,
bool force_fp32_output,
DenseTensor* out) {
bool is_BFLOAT16 = mkldnn_data_type == "bfloat16";
ConvOnednn<T>(dev_ctx,
&input,
&filter,
bias.get_ptr(),
residual_param.get_ptr(),
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
true,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
out);
}
} // namespace phi
PD_REGISTER_KERNEL(fused_conv2d,
OneDNN,
ONEDNN,
phi::FusedConv2DKernel,
float,
phi::dtype::bfloat16,
uint8_t,
int8_t) {}
PD_REGISTER_KERNEL(
fused_conv3d, OneDNN, ONEDNN, phi::FusedConv3DKernel, float) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/data_layout_transform.h"
#include "paddle/phi/kernels/onednn/conv_handler.h"
namespace phi {
static dnnl::memory::data_type GetDstType(
bool is_int8,
bool is_bfloat16,
bool force_fp32_output,
std::string fuse_activation,
bool fuse_residual_conn,
const phi::DenseTensor* residual_param) {
auto dst_dt = dnnl::memory::data_type::f32;
if (is_int8) {
dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6")
? dnnl::memory::data_type::u8
: dnnl::memory::data_type::s8;
if (force_fp32_output) {
dst_dt = dnnl::memory::data_type::f32;
}
if (fuse_residual_conn && residual_param) {
auto residual_dt = funcs::ToOneDNNDataType(residual_param->dtype());
if (dst_dt != residual_dt) dst_dt = residual_dt;
}
} else {
if (!force_fp32_output && is_bfloat16) {
dst_dt = dnnl::memory::data_type::bf16;
if (fuse_residual_conn && residual_param) {
dst_dt = funcs::ToOneDNNDataType(residual_param->dtype());
}
}
}
return dst_dt;
}
#define PD_VISIT_FLOAT_AND_INT8_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
PD_PRIVATE_CASE_TYPE( \
NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \
default: \
PD_THROW("function " #NAME " is not implemented for data type `", \
__dtype__, \
"`"); \
} \
}()
template <typename T, typename T_out>
void ComputeFP32(const OneDNNContext& dev_ctx,
const DenseTensor* input,
const DenseTensor* filter,
const DenseTensor* bias,
const DenseTensor* residual_param,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
bool is_test,
bool is_BFLOAT16,
const std::string& fuse_activation,
bool fuse_residual_conn,
bool force_fp32_output,
DenseTensor* output) {
const auto& onednn_engine = dev_ctx.GetEngine();
const bool is_conv3d = strides.size() == 3U;
const std::string& unique_name =
dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0];
PD_VISIT_FLOAT_AND_INT8_TYPES(
filter->dtype(), "ConvOneDNNHandlerT", ([&] {
onednn::ConvOneDNNHandlerT<T, data_t, T_out> handler(dev_ctx,
onednn_engine,
dev_ctx.GetPlace(),
input,
filter,
bias,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
output,
unique_name);
auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
filter, groups, is_conv3d, is_test);
std::shared_ptr<dnnl::memory> dst_memory_p;
if (fuse_residual_conn) {
dst_memory_p =
handler.AcquireDstMemoryWithResidual(output, residual_param);
} else {
dst_memory_p = handler.template AcquireDstMemory<T_out>(output);
}
auto conv_p = handler.AcquireForwardPrimitive();
std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC, *src_memory_p},
{DNNL_ARG_WEIGHTS, *weights_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
if (bias) {
auto bias_memory_p =
handler.AcquireBiasMemoryWithReorder(bias, is_test);
args.insert({DNNL_ARG_BIAS, *bias_memory_p});
}
auto& astream = OneDNNContext::tls().get_stream();
conv_p->execute(astream, args);
astream.wait();
output->set_mem_desc(dst_memory_p->get_desc());
}));
}
template <typename T, typename T_out>
void ComputeINT8(const OneDNNContext& dev_ctx,
const DenseTensor* input,
const DenseTensor* filter,
const DenseTensor* bias,
const DenseTensor* residual_param,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
bool is_test,
bool is_BFLOAT16,
const std::string& fuse_activation,
bool fuse_residual_conn,
bool force_fp32_output,
DenseTensor* output) {
const auto& onednn_engine = dev_ctx.GetEngine();
const bool is_conv3d = strides.size() == 3U;
bool unsigned_output =
(fuse_activation == "relu" || fuse_activation == "relu6");
bool need_s8_to_u8 = false;
PADDLE_ENFORCE_NE(
is_conv3d,
true,
phi::errors::Unimplemented(
"OneDNN int8 convolution does not support 3D inputs currently"));
PADDLE_ENFORCE_EQ(
fuse_residual_conn && force_fp32_output,
false,
phi::errors::Unimplemented(
"residual fusion does not support force output with fp32"));
const std::string& unique_name =
dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0];
PD_VISIT_FLOAT_AND_INT8_TYPES(
filter->dtype(), "ConvMKLDNNHandlerT", ([&] {
onednn::ConvOneDNNHandlerT<T, data_t, T_out> handler(dev_ctx,
onednn_engine,
dev_ctx.GetPlace(),
input,
filter,
bias,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
output,
unique_name);
auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
const auto& scale_weights_data =
dev_ctx.HasDnnAttr("Scale_weights")
? PADDLE_GET_CONST(std::vector<float>,
dev_ctx.GetDnnAttr("Scale_weights"))
: std::vector<float>{1.0f};
const bool is_multi_channel = scale_weights_data.size() > 1;
int mask_reorder = is_multi_channel
? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0)
: 0;
auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
filter, groups, false, true, scale_weights_data, mask_reorder);
std::shared_ptr<dnnl::memory> dst_memory_p;
if (fuse_residual_conn) {
PADDLE_ENFORCE_EQ(
output->dims(),
residual_param->dims(),
phi::errors::InvalidArgument(
"Output and elementwise parameter need to have the "
"same dimension sizes, but got output's dimension = %d"
" and residual param's dimension =%d .",
output->dims().size(),
residual_param->dims().size()));
dst_memory_p =
handler.AcquireDstMemoryWithResidual(output, residual_param);
need_s8_to_u8 = (funcs::OneDNNGetDataType<T_out>() ==
dnnl::memory::data_type::s8) &&
unsigned_output;
} else {
dst_memory_p = handler.template AcquireDstMemory<T_out>(output);
}
auto conv_p = handler.AcquireForwardPrimitive();
std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC, *src_memory_p},
{DNNL_ARG_WEIGHTS, *weights_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
if (bias) {
std::vector<float> bias_scales;
auto p_scales_tuple =
std::make_shared<std::tuple<float, std::vector<float>>>(
std::make_tuple(static_cast<float>(mask_reorder),
bias_scales));
if (dev_ctx.HasDnnAttr("Bias_scales")) {
bias_scales = PADDLE_GET_CONST(std::vector<float>,
dev_ctx.GetDnnAttr("Bias_scales"));
p_scales_tuple =
std::make_shared<std::tuple<float, std::vector<float>>>(
std::make_tuple(static_cast<float>(mask_reorder),
bias_scales));
} else {
p_scales_tuple = handler.get_int8_bias_scales(
filter, groups, scale_weights_data);
}
auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(
bias,
true,
std::get<1>(*p_scales_tuple),
std::get<0>(*p_scales_tuple));
args.insert({DNNL_ARG_BIAS, *bias_memory_p});
}
auto& astream = OneDNNContext::tls().get_stream();
conv_p->execute(astream, args);
astream.wait();
if (need_s8_to_u8) {
dev_ctx.Alloc<uint8_t>(output);
}
output->set_mem_desc(dst_memory_p->get_desc());
}));
}
template <typename T, typename Context>
void ConvOnednn(const Context& dev_ctx,
const DenseTensor* input,
const DenseTensor* filter,
const DenseTensor* bias,
const DenseTensor* residual_param,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
bool is_test,
bool is_bfloat16,
const std::string& fuse_activation,
bool fuse_residual_connection,
bool force_fp32_output,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(
dev_ctx.GetPlace().GetType(),
AllocationType::CPU,
phi::errors::PreconditionNotMet("Operator DNNL Conv must use CPUPlace"));
bool is_INT8 = funcs::is_int8<T>();
auto dst_dt = GetDstType(is_INT8,
is_bfloat16,
force_fp32_output,
fuse_activation,
fuse_residual_connection,
residual_param);
if (!is_INT8) {
if (dst_dt == dnnl::memory::data_type::f32) {
ComputeFP32<T, float>(dev_ctx,
input,
filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_bfloat16,
fuse_activation,
fuse_residual_connection,
force_fp32_output,
out);
} else if (dst_dt == dnnl::memory::data_type::bf16) {
ComputeFP32<T, dtype::bfloat16>(dev_ctx,
input,
filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_bfloat16,
fuse_activation,
fuse_residual_connection,
force_fp32_output,
out);
}
} else {
if (dst_dt == dnnl::memory::data_type::f32) {
ComputeINT8<T, float>(dev_ctx,
input,
filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_bfloat16,
fuse_activation,
fuse_residual_connection,
force_fp32_output,
out);
} else if (dst_dt == dnnl::memory::data_type::u8) {
ComputeINT8<T, uint8_t>(dev_ctx,
input,
filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_bfloat16,
fuse_activation,
fuse_residual_connection,
force_fp32_output,
out);
} else if (dst_dt == dnnl::memory::data_type::s8) {
ComputeINT8<T, int8_t>(dev_ctx,
input,
filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_bfloat16,
fuse_activation,
fuse_residual_connection,
force_fp32_output,
out);
}
}
}
} // namespace phi
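GetDstType above decides which oneDNN destination data type the convolution writes. A standalone restatement of that branching (plain enum instead of dnnl::memory::data_type, behaviorally equivalent) makes the precedence easier to follow: in the int8 branch the residual tensor's dtype has the last word, after force_fp32_output, while in the bfloat16 branch force_fp32_output keeps the result at f32.

#include <cassert>

enum class DstType { f32, bf16, s8, u8 };

// Mirrors the branching of GetDstType() in the header above.
DstType PickDstType(bool is_int8, bool is_bfloat16, bool force_fp32_output,
                    bool relu_like_activation, bool has_residual,
                    DstType residual_dtype) {
  DstType dst = DstType::f32;
  if (is_int8) {
    dst = relu_like_activation ? DstType::u8 : DstType::s8;
    if (force_fp32_output) dst = DstType::f32;
    if (has_residual) dst = residual_dtype;
  } else if (!force_fp32_output && is_bfloat16) {
    dst = DstType::bf16;
    if (has_residual) dst = residual_dtype;
  }
  return dst;
}

int main() {
  // int8 conv with relu and no residual writes u8.
  assert(PickDstType(true, false, false, true, false, DstType::f32) == DstType::u8);
  // int8 conv fused with an s8 residual follows the residual dtype.
  assert(PickDstType(true, false, false, true, true, DstType::s8) == DstType::s8);
  // bf16 conv with force_fp32_output stays at f32.
  assert(PickDstType(false, true, true, false, false, DstType::f32) == DstType::f32);
  return 0;
}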
@@ -17,265 +17,10 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/data_layout_transform.h"
- #include "paddle/phi/kernels/onednn/conv_handler.h"
+ #include "paddle/phi/kernels/onednn/conv_function.h"
namespace phi {
static dnnl::memory::data_type GetDstType(
bool is_int8,
bool is_bfloat16,
bool force_fp32_output,
std::string fuse_activation,
bool fuse_residual_conn,
const phi::DenseTensor* residual_param) {
auto dst_dt = dnnl::memory::data_type::f32;
if (is_int8) {
dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6")
? dnnl::memory::data_type::u8
: dnnl::memory::data_type::s8;
if (force_fp32_output) {
dst_dt = dnnl::memory::data_type::f32;
}
if (fuse_residual_conn && residual_param) {
auto residual_dt = funcs::ToOneDNNDataType(residual_param->dtype());
if (dst_dt != residual_dt) dst_dt = residual_dt;
}
} else {
if (!force_fp32_output && is_bfloat16) {
dst_dt = dnnl::memory::data_type::bf16;
if (fuse_residual_conn && residual_param) {
dst_dt = funcs::ToOneDNNDataType(residual_param->dtype());
}
}
}
return dst_dt;
}
#define PD_VISIT_FLOAT_AND_INT8_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
switch (__dtype__) { \
PD_PRIVATE_CASE_TYPE( \
NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \
default: \
PD_THROW("function " #NAME " is not implemented for data type `", \
__dtype__, \
"`"); \
} \
}()
template <typename T, typename T_out>
void ComputeFP32(const OneDNNContext& dev_ctx,
const DenseTensor* input,
const DenseTensor* filter,
const DenseTensor* bias,
const DenseTensor* residual_param,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
bool is_test,
bool is_BFLOAT16,
const std::string& fuse_activation,
bool fuse_residual_conn,
bool force_fp32_output,
DenseTensor* output) {
const auto& onednn_engine = dev_ctx.GetEngine();
const bool is_conv3d = strides.size() == 3U;
const std::string& unique_name =
dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0];
PD_VISIT_FLOAT_AND_INT8_TYPES(
filter->dtype(), "ConvOneDNNHandlerT", ([&] {
onednn::ConvOneDNNHandlerT<T, data_t, T_out> handler(dev_ctx,
onednn_engine,
dev_ctx.GetPlace(),
input,
filter,
bias,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
output,
unique_name);
auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
filter, groups, is_conv3d, is_test);
std::shared_ptr<dnnl::memory> dst_memory_p;
if (fuse_residual_conn) {
dst_memory_p =
handler.AcquireDstMemoryWithResidual(output, residual_param);
} else {
dst_memory_p = handler.template AcquireDstMemory<T_out>(output);
}
auto conv_p = handler.AcquireForwardPrimitive();
std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC, *src_memory_p},
{DNNL_ARG_WEIGHTS, *weights_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
if (bias) {
auto bias_memory_p =
handler.AcquireBiasMemoryWithReorder(bias, is_test);
args.insert({DNNL_ARG_BIAS, *bias_memory_p});
}
auto& astream = OneDNNContext::tls().get_stream();
conv_p->execute(astream, args);
astream.wait();
output->set_mem_desc(dst_memory_p->get_desc());
}));
}
template <typename T, typename T_out>
void ComputeINT8(const OneDNNContext& dev_ctx,
const DenseTensor* input,
const DenseTensor* filter,
const DenseTensor* bias,
const DenseTensor* residual_param,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
bool is_test,
bool is_BFLOAT16,
const std::string& fuse_activation,
bool fuse_residual_conn,
bool force_fp32_output,
DenseTensor* output) {
const auto& onednn_engine = dev_ctx.GetEngine();
const bool is_conv3d = strides.size() == 3U;
bool unsigned_output =
(fuse_activation == "relu" || fuse_activation == "relu6");
bool need_s8_to_u8 = false;
PADDLE_ENFORCE_NE(
is_conv3d,
true,
phi::errors::Unimplemented(
"OneDNN int8 convolution does not support 3D inputs currently"));
PADDLE_ENFORCE_EQ(
fuse_residual_conn && force_fp32_output,
false,
phi::errors::Unimplemented(
"residual fusion does not support force output with fp32"));
const std::string& unique_name =
dev_ctx.GetInputsName("Input")[0] + dev_ctx.GetInputsName("Filter")[0];
PD_VISIT_FLOAT_AND_INT8_TYPES(
filter->dtype(), "ConvMKLDNNHandlerT", ([&] {
onednn::ConvOneDNNHandlerT<T, data_t, T_out> handler(dev_ctx,
onednn_engine,
dev_ctx.GetPlace(),
input,
filter,
bias,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
output,
unique_name);
auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
const auto& scale_weights_data =
dev_ctx.HasDnnAttr("Scale_weights")
? PADDLE_GET_CONST(std::vector<float>,
dev_ctx.GetDnnAttr("Scale_weights"))
: std::vector<float>{1.0f};
const bool is_multi_channel = scale_weights_data.size() > 1;
int mask_reorder = is_multi_channel
? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0)
: 0;
auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
filter, groups, false, true, scale_weights_data, mask_reorder);
std::shared_ptr<dnnl::memory> dst_memory_p;
if (fuse_residual_conn) {
PADDLE_ENFORCE_EQ(
output->dims(),
residual_param->dims(),
phi::errors::InvalidArgument(
"Output and elementwise parameter need to have the "
"same dimension sizes, but got output's dimension = %d"
" and residual param's dimension =%d .",
output->dims().size(),
residual_param->dims().size()));
dst_memory_p =
handler.AcquireDstMemoryWithResidual(output, residual_param);
need_s8_to_u8 = (funcs::OneDNNGetDataType<T_out>() ==
dnnl::memory::data_type::s8) &&
unsigned_output;
} else {
dst_memory_p = handler.template AcquireDstMemory<T_out>(output);
}
auto conv_p = handler.AcquireForwardPrimitive();
std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC, *src_memory_p},
{DNNL_ARG_WEIGHTS, *weights_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
if (bias) {
std::vector<float> bias_scales;
auto p_scales_tuple =
std::make_shared<std::tuple<float, std::vector<float>>>(
std::make_tuple(static_cast<float>(mask_reorder),
bias_scales));
if (dev_ctx.HasDnnAttr("Bias_scales")) {
bias_scales = PADDLE_GET_CONST(std::vector<float>,
dev_ctx.GetDnnAttr("Bias_scales"));
p_scales_tuple =
std::make_shared<std::tuple<float, std::vector<float>>>(
std::make_tuple(static_cast<float>(mask_reorder),
bias_scales));
} else {
p_scales_tuple = handler.get_int8_bias_scales(
filter, groups, scale_weights_data);
}
auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(
bias,
true,
std::get<1>(*p_scales_tuple),
std::get<0>(*p_scales_tuple));
args.insert({DNNL_ARG_BIAS, *bias_memory_p});
}
auto& astream = OneDNNContext::tls().get_stream();
conv_p->execute(astream, args);
astream.wait();
if (need_s8_to_u8) {
dev_ctx.Alloc<uint8_t>(output);
}
output->set_mem_desc(dst_memory_p->get_desc());
}));
}
template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
const DenseTensor& input,
@@ -287,12 +32,6 @@ void ConvKernel(const Context& dev_ctx,
int groups,
const std::string& data_format,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(
dev_ctx.GetPlace().GetType(),
AllocationType::CPU,
phi::errors::PreconditionNotMet("Operator DNNL Conv must use CPUPlace"));
bool is_INT8 = funcs::is_int8<T>();
bool is_test = dev_ctx.HasDnnAttr("is_test")
? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test"))
: false;
@@ -320,107 +59,23 @@ void ConvKernel(const Context& dev_ctx,
dev_ctx.HasDnnAttr("force_fp32_output")
? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output"))
: false;
auto dst_dt = GetDstType(is_INT8,
is_BFLOAT16,
force_fp32_output,
fuse_activation,
fuse_residual_conn,
residual_param);
if (!is_INT8) {
if (dst_dt == dnnl::memory::data_type::f32) {
ComputeFP32<T, float>(dev_ctx,
&input,
&filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
out);
} else if (dst_dt == dnnl::memory::data_type::bf16) {
ComputeFP32<T, dtype::bfloat16>(dev_ctx,
&input,
&filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
out);
}
} else {
if (dst_dt == dnnl::memory::data_type::f32) {
ComputeINT8<T, float>(dev_ctx,
&input,
&filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
out);
} else if (dst_dt == dnnl::memory::data_type::u8) {
ComputeINT8<T, uint8_t>(dev_ctx,
&input,
&filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
out);
} else if (dst_dt == dnnl::memory::data_type::s8) {
ComputeINT8<T, int8_t>(dev_ctx,
&input,
&filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
out);
}
}
ConvOnednn<T>(dev_ctx,
&input,
&filter,
bias,
residual_param,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
is_test,
is_BFLOAT16,
fuse_activation,
fuse_residual_conn,
force_fp32_output,
out);
}
template <typename T, typename Context>
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature FusedConv2dOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fused_conv2d",
{"Input", "Filter", "Bias", "ResidualData"},
{"strides",
"paddings",
"padding_algorithm",
"dilations",
"groups",
"data_format",
"mkldnn_data_type",
"fuse_activation",
"fuse_residual_connection",
"force_fp32_output"},
{"Output"});
}
KernelSignature FusedConv3dOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("fused_conv3d",
{"Input", "Filter", "Bias", "ResidualData"},
{"strides",
"paddings",
"padding_algorithm",
"dilations",
"groups",
"data_format",
"mkldnn_data_type",
"fuse_activation",
"fuse_residual_connection",
"force_fp32_output"},
{"Output"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(fused_conv2d, phi::FusedConv2dOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(fused_conv3d, phi::FusedConv3dOpArgumentMapping);
@@ -36,7 +36,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest):
# MKLDNN
config = self.create_inference_config(use_gpu=False)
config.enable_mkldnn()
- yield config, ["conv2d"], (1e-4, 1e-5)
+ yield config, ["fused_conv2d"], (1e-4, 1e-5)
def is_program_valid(self, prog_config):
paddings = prog_config.ops[0].attrs["paddings"]
@@ -156,8 +156,10 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest):
inputs = dict()
weights = dict()
use_mkldnn = None
conv_type = "conv2d"
if draw(st.booleans()):
conv_bias_shape = [f_shape[0]]
conv_type = "fused_conv2d"
inputs = {
"Input": ["input_x"],
"Filter": ["filter"],
@@ -181,7 +183,7 @@ class TestConvBiasMkldnnFusePass(PassAutoScanTest):
use_mkldnn = False
conv2d_op = OpConfig(
- "conv2d",
+ conv_type,
inputs=inputs,
outputs={"Output": ["conv2d_out"]},
strides=strides,
......