diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 2a04db1519431cb2608c8f39997581dc3bc63973..cea2a45c5db15891a4de679265a9c2cd2779d0fb 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -67,3 +67,7 @@ USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); USE_MIR_PASS(__xpu__fc_fuse_pass); USE_MIR_PASS(__xpu__mmdnn_fuse_pass); +USE_MIR_PASS(__xpu__conv2d_fuse_pass); +USE_MIR_PASS(__xpu__conv2d_link_previous_out_max_pass); +USE_MIR_PASS(__xpu__sfa_head_meanstd_fuse_pass); +USE_MIR_PASS(__xpu__sfa_head_moment_fuse_pass); diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index dbf9b69d42e5b6abf0640a113d80a74dbb71dff6..0fe572e1f91919d739199163b7ff5c989e6cd519 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -30,6 +30,10 @@ lite_cc_library(mir_passes fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc fusion/__xpu__fc_fuse_pass.cc fusion/__xpu__mmdnn_fuse_pass.cc + fusion/__xpu__conv2d_fuse_pass.cc + fusion/__xpu__conv2d_link_previous_out_max_pass.cc + fusion/__xpu__sfa_head_meanstd_fuse_pass.cc + fusion/__xpu__sfa_head_moment_fuse_pass.cc fusion/match_matrix_activation_fuse_pass.cc fusion/scales_fuse_pass.cc fusion/sequence_reverse_embedding_fuse_pass.cc diff --git a/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc b/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..d8e9d9db4664cd717dbc949134e5ef52f52c9b61 --- /dev/null +++ b/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc @@ -0,0 +1,475 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include "lite/backends/xpu/math.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+/* fuse conv2d block in resnet50-like model to xpu_conv2d op */
+/* For example: */
+/* graph[1]: sub block */
+/*                     in_Input */
+/*                        | */
+/*                        | */
+/*                     conv2d----in_Filter */
+/*                        | */
+/*                        | */
+/*                   batch_norm ------in_Bias */
+/*                        | */
+/*                        | */
+/*                      relu */
+/*                        | */
+/*                        | */
+/*                     out_Out */
+/* */
+/* After the pass is applied: */
+/*                     in_Input */
+/*        in_Filter       |      in_FilterMax */
+/*                  \     |     / */
+/*                   \    |    / */
+/*    in_Bias ------- __xpu__conv2d */
+/*                        |    \ */
+/*                        |     \ */
+/*                        |      out_OutputMax */
+/*                   out_Output */
+/* */
+/* ------------------------------------------------------ */
+/* graph[2]: sub block */
+/*                     in_Input */
+/*                        | */
+/*                        | */
+/*                     conv2d----in_Filter */
+/*                        | */
+/*                        | */
+/*                   batch_norm ------in_Bias */
+/*                        | */
+/*                        | */
+/*                     out_Out */
+/* */
+/* After the pass is applied: */
+/*                     in_Input */
+/*        in_Filter       |      in_FilterMax */
+/*                  \     |     / */
+/*                   \    |    / */
+/*    in_Bias ------- __xpu__conv2d */
+/*                        |    \ */
+/*                        |     \ */
+/*                        |      out_OutputMax */
+/*                   out_Output */
+/* */
+/* ------------------------------------------------------ */
+/* graph[3]: sub block */
+/*                     in_Input */
+/*                        | */
+/*                        | */
+/*                     conv2d----in_Filter */
+/*                        | */
+/*                        | */
+/*        in_X       batch_norm ------in_Bias */
+/*             \          | */
+/*              \         | */
+/*               elementwise_add */
+/*                        | */
+/*                        | */
+/*                      relu */
+/*                        | */
+/*                        | */
+/*                     out_Out */
+/* */
+/* After the pass is applied: */
+/*                     in_Input */
+/*        in_Filter       |      in_FilterMax */
+/*                  \     |     / */
+/*                   \    |    / */
+/*  in_Branch ------- __xpu__conv2d ------ in_Bias */
+/*                        |    \ */
+/*                        |     \ */
+/*                        |      out_OutputMax */
+/*                   out_Output */
+
+// Fuses conv2d -> batch_norm [-> relu] (graph[1] / graph[2] above) into one
+// __xpu__conv2d op. The batch_norm scale/mean/variance are folded into the
+// conv filter and into the batch_norm bias tensor in place, and the folded
+// filter is then converted to int16 inside the same tensor buffer.
+class XPUConv2dBlock0Fuser : public FuseBase {
+ public:
+  // with_relu: when true the pattern additionally requires a relu consuming
+  // the batch_norm output and the relu output becomes the fused output;
+  // when false the batch_norm output itself is the fused output.
+  explicit XPUConv2dBlock0Fuser(bool with_relu) : _with_relu(with_relu) {}
+
+  // Declares the conv2d -> batch_norm [-> relu] pattern. Only the conv input,
+  // conv filter, batch_norm bias and the final output survive the fusion;
+  // every other matched node is marked intermediate.
+  void BuildPattern() override {
+    auto* input =
+        VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
+
+    auto* conv_filter = VarNode("conv_filter")
+                            ->assert_is_op_input("conv2d", "Filter")
+                            ->AsInput();
+    auto* conv = OpNode("conv", "conv2d")->AsIntermediate();
+    auto* conv_out = VarNode("conv_out")
+                         ->assert_is_op_output("conv2d", "Output")
+                         ->assert_is_op_input("batch_norm", "X")
+                         ->AsIntermediate();
+    auto* bn_bias =
+        VarNode("bn_bias")->assert_is_op_input("batch_norm", "Bias")->AsInput();
+    auto* bn_mean = VarNode("bn_mean")
+                        ->assert_is_op_input("batch_norm", "Mean")
+                        ->AsIntermediate();
+    auto* bn_scale = VarNode("bn_scale")
+                         ->assert_is_op_input("batch_norm", "Scale")
+                         ->AsIntermediate();
+    auto* bn_var = VarNode("bn_variance")
+                       ->assert_is_op_input("batch_norm", "Variance")
+                       ->AsIntermediate();
+    auto* bn = OpNode("bn", "batch_norm")->AsIntermediate();
+    auto* bn_out = VarNode("bn_out")->assert_is_op_output("batch_norm", "Y");
+    auto* bn_mean_out = VarNode("bn_mean_out")
+                            ->assert_is_op_output("batch_norm", "MeanOut")
+                            ->AsIntermediate();
+    auto* bn_saved_mean = VarNode("bn_saved_mean")
+                              ->assert_is_op_output("batch_norm", "SavedMean")
+                              ->AsIntermediate();
+    auto* bn_var_out = VarNode("bn_var_out")
+                           ->assert_is_op_output("batch_norm", "VarianceOut")
+                           ->AsIntermediate();
+    auto* bn_saved_var =
+        VarNode("bn_saved_var")
+            ->assert_is_op_output("batch_norm", "SavedVariance")
+            ->AsIntermediate();
+
+    *input >> *conv >> *conv_out >> *bn >> *bn_out;
+
+    *conv_filter >> *conv;
+    *bn_bias >> *bn;
+    *bn_mean >> *bn;
+    *bn_scale >> *bn;
+    *bn_var >> *bn;
+    *bn >> *bn_mean_out;
+    *bn >> *bn_saved_mean;
+    *bn >> *bn_saved_var;
+    *bn >> *bn_var_out;
+
+    if (_with_relu) {
+      bn_out->assert_is_op_input("relu", "X")->AsIntermediate();
+      auto* relu = OpNode("relu", "relu")->AsIntermediate();
+      auto* relu_out =
+          VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
+
+      *bn_out >> *relu >> *relu_out;
+    } else {
+      bn_out->AsOutput();
+    }
+  }
+
+  // Builds the __xpu__conv2d op from the matched conv2d's op info, rewrites
+  // the weights on the host and links the new op node into the graph.
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto op_desc = *matched.at("conv")->stmt()->op_info();
+    auto conv_old = matched.at("conv")->stmt()->op();
+    auto* scope = conv_old->scope();
+    op_desc.mutable_inputs()->clear();
+    op_desc.mutable_outputs()->clear();
+    op_desc.SetType("__xpu__conv2d");
+    std::string input_name = matched.at("input")->arg()->name;
+    op_desc.SetInput("Input", {input_name});
+
+    auto filter_name = matched.at("conv_filter")->arg()->name;
+    auto scale_name = matched.at("bn_scale")->arg()->name;
+    auto bias_name = matched.at("bn_bias")->arg()->name;
+    auto mean_name = matched.at("bn_mean")->arg()->name;
+    auto var_name = matched.at("bn_variance")->arg()->name;
+
+    auto* filter_t = scope->FindMutableTensor(filter_name);
+    auto* scale_t = scope->FindMutableTensor(scale_name);
+    auto* bias_t = scope->FindMutableTensor(bias_name);
+    auto* mean_t = scope->FindMutableTensor(mean_name);
+    auto* var_t = scope->FindMutableTensor(var_name);
+
+    int mean_len = mean_t->numel();
+    int filter_len = filter_t->numel();
+    // Number of filter elements scaled by one batch_norm channel.
+    int filter_stride = filter_len / mean_len;
+
+    float* filter_on_host = filter_t->mutable_data<float>();
+    float* scale_on_host = scale_t->mutable_data<float>();
+    float* bias_on_host = bias_t->mutable_data<float>();
+    float* mean_on_host = mean_t->mutable_data<float>();
+    float* var_on_host = var_t->mutable_data<float>();
+
+    // Use the epsilon recorded on the matched batch_norm op; fall back to
+    // the previously hard-coded value when the attribute is absent.
+    float epsilon = 0.00001f;
+    const auto* bn_info = matched.at("bn")->stmt()->op_info();
+    if (bn_info->HasAttr("epsilon")) {
+      epsilon = bn_info->GetAttr<float>("epsilon");
+    }
+
+    // Perform preprocess
+    // scale <- scale / sqrt(variance + epsilon)
+    for (int i = 0; i < mean_len; ++i) {
+      scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + epsilon);
+    }
+    // filter[i, :] <- filter[i, :] * scale[i]
+    for (int i = 0; i < mean_len; ++i) {
+      for (int j = 0; j < filter_stride; ++j) {
+        filter_on_host[i * filter_stride + j] *= scale_on_host[i];
+      }
+    }
+    // bias <- bias - mean * scale
+    for (int i = 0; i < mean_len; ++i) {
+      bias_on_host[i] += -mean_on_host[i] * scale_on_host[i];
+    }
+
+    // Convert the folded filter to int16 and copy the result back over the
+    // start of the fp32 buffer of the same tensor.
+    float max_f =
+        paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
+    std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
+    paddle::lite::xpu::math::ConvertFP32ToInt16(
+        filter_on_host, filter_int16.get(), max_f, filter_len);
+    memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
+
+    // create new arg in graph and scope
+    std::string max_filter_name = filter_name + "_max";
+    auto* max_filter_node = graph->NewArgumentNode(max_filter_name);
+    max_filter_node->arg()->is_weight = true;
+    max_filter_node->arg()->type = LiteType::GetTensorTy(
+        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+
+    // The filter max is stored as four identical floats.
+    auto* max_filter_t = scope->NewTensor(max_filter_name);
+    max_filter_t->Resize({4});
+    float* max_ptr = max_filter_t->mutable_data<float>();
+    max_ptr[0] = max_f;
+    max_ptr[1] = max_f;
+    max_ptr[2] = max_f;
+    max_ptr[3] = max_f;
+
+    op_desc.SetInput("Filter", {filter_name});
+    op_desc.SetInput("Bias", {bias_name});
+    op_desc.SetInput("FilterMax", {max_filter_name});
+
+    std::string output_name = "";
+    if (_with_relu) {
+      output_name = matched.at("relu_out")->arg()->name;
+    } else {
+      output_name = matched.at("bn_out")->arg()->name;
+    }
+    op_desc.SetOutput("Output", {output_name});
+
+    // add new arg output_max
+    std::string max_output_name = output_name + "_max";
+    auto* max_output_node = graph->NewArgumentNode(max_output_name);
+    max_output_node->arg()->type = LiteType::GetTensorTy(
+        TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
+    scope->NewTensor(max_output_name);
+    op_desc.SetOutput("OutputMax", {max_output_name});
+
+    auto conv_op = LiteOpRegistry::Global().Create("__xpu__conv2d");
+    auto& valid_places = conv_old->valid_places();
+    conv_op->Attach(op_desc, scope);
+    auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places);
+    DirectedLink(matched.at("input"), new_op_node);
+    DirectedLink(matched.at("conv_filter"), new_op_node);
+    DirectedLink(matched.at("bn_bias"), new_op_node);
+    DirectedLink(max_filter_node, new_op_node);
+    DirectedLink(new_op_node, max_output_node);
+    if (_with_relu) {
+      DirectedLink(new_op_node, matched.at("relu_out"));
+    } else {
+      DirectedLink(new_op_node, matched.at("bn_out"));
+    }
+  }
+
+ private:
+  bool _with_relu;
+};
+
+// block with branch
+// Fuses conv2d -> batch_norm -> elementwise_add -> relu (graph[3] above)
+// into one __xpu__conv2d op; the other elementwise_add operand ("ew_x")
+// becomes the "Branch" input of the fused op.
+class XPUConv2dBlock1Fuser : public FuseBase {
+ public:
+  XPUConv2dBlock1Fuser() {}
+
+  // Declares the conv2d -> batch_norm -> elementwise_add -> relu pattern.
+  // batch_norm's Y must be the "Y" operand of elementwise_add.
+  void BuildPattern() override {
+    auto* input =
+        VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
+
+    auto* conv_filter = VarNode("conv_filter")
+                            ->assert_is_op_input("conv2d", "Filter")
+                            ->AsInput();
+    auto* conv = OpNode("conv", "conv2d")->AsIntermediate();
+    auto* conv_out = VarNode("conv_out")
+                         ->assert_is_op_output("conv2d", "Output")
+                         ->assert_is_op_input("batch_norm", "X")
+                         ->AsIntermediate();
+    auto* bn_bias =
+        VarNode("bn_bias")->assert_is_op_input("batch_norm", "Bias")->AsInput();
+    auto* bn_mean = VarNode("bn_mean")
+                        ->assert_is_op_input("batch_norm", "Mean")
+                        ->AsIntermediate();
+    auto* bn_scale = VarNode("bn_scale")
+                         ->assert_is_op_input("batch_norm", "Scale")
+                         ->AsIntermediate();
+    auto* bn_var = VarNode("bn_variance")
+                       ->assert_is_op_input("batch_norm", "Variance")
+                       ->AsIntermediate();
+    auto* bn = OpNode("bn", "batch_norm")->AsIntermediate();
+    auto* bn_out = VarNode("bn_out")
+                       ->assert_is_op_output("batch_norm", "Y")
+                       ->assert_is_op_input("elementwise_add", "Y")
+                       ->AsIntermediate();
+    auto* bn_mean_out = VarNode("bn_mean_out")
+                            ->assert_is_op_output("batch_norm", "MeanOut")
+                            ->AsIntermediate();
+    auto* bn_saved_mean = VarNode("bn_saved_mean")
+                              ->assert_is_op_output("batch_norm", "SavedMean")
+                              ->AsIntermediate();
+    auto* bn_var_out = VarNode("bn_var_out")
+                           ->assert_is_op_output("batch_norm", "VarianceOut")
+                           ->AsIntermediate();
+    auto* bn_saved_var =
+        VarNode("bn_saved_var")
+            ->assert_is_op_output("batch_norm", "SavedVariance")
+            ->AsIntermediate();
+    auto* ew_x =
+        VarNode("ew_x")->assert_is_op_input("elementwise_add", "X")->AsInput();
+    auto* ew_add = OpNode("ew_add", "elementwise_add")->AsIntermediate();
+    auto* ew_out = VarNode("ew_out")
+                       ->assert_is_op_output("elementwise_add", "Out")
+                       ->assert_is_op_input("relu", "X")
+                       ->AsIntermediate();
+    auto* relu = OpNode("relu", "relu")->AsIntermediate();
+    auto* relu_out =
+        VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
+
+    *input >> *conv >> *conv_out >> *bn >> *bn_out >> *ew_add >> *ew_out >>
+        *relu >> *relu_out;
+
+    *conv_filter >> *conv;
+    *bn_bias >> *bn;
+    *bn_mean >> *bn;
+    *bn_scale >> *bn;
+    *bn_var >> *bn;
+    *bn >> *bn_mean_out;
+    *bn >> *bn_saved_mean;
+    *bn >> *bn_saved_var;
+    *bn >> *bn_var_out;
+
+    *ew_x >> *ew_add;
+  }
+
+  // Same weight rewrite as XPUConv2dBlock0Fuser::InsertNewNode, plus the
+  // "Branch" input taken from the matched elementwise_add X operand.
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto op_desc = *matched.at("conv")->stmt()->op_info();
+    auto conv_old = matched.at("conv")->stmt()->op();
+    auto* scope = conv_old->scope();
+    op_desc.mutable_inputs()->clear();
+    op_desc.mutable_outputs()->clear();
+    op_desc.SetType("__xpu__conv2d");
+    std::string input_name = matched.at("input")->arg()->name;
+    op_desc.SetInput("Input", {input_name});
+
+    auto filter_name = matched.at("conv_filter")->arg()->name;
+    auto scale_name = matched.at("bn_scale")->arg()->name;
+    auto bias_name = matched.at("bn_bias")->arg()->name;
+    auto mean_name = matched.at("bn_mean")->arg()->name;
+    auto var_name = matched.at("bn_variance")->arg()->name;
+
+    auto* filter_t = scope->FindMutableTensor(filter_name);
+    auto* scale_t = scope->FindMutableTensor(scale_name);
+    auto* bias_t = scope->FindMutableTensor(bias_name);
+    auto* mean_t = scope->FindMutableTensor(mean_name);
+    auto* var_t = scope->FindMutableTensor(var_name);
+
+    int mean_len = mean_t->numel();
+    int filter_len = filter_t->numel();
+    // Number of filter elements scaled by one batch_norm channel.
+    int filter_stride = filter_len / mean_len;
+
+    float* filter_on_host = filter_t->mutable_data<float>();
+    float* scale_on_host = scale_t->mutable_data<float>();
+    float* bias_on_host = bias_t->mutable_data<float>();
+    float* mean_on_host = mean_t->mutable_data<float>();
+    float* var_on_host = var_t->mutable_data<float>();
+
+    // Use the epsilon recorded on the matched batch_norm op; fall back to
+    // the previously hard-coded value when the attribute is absent.
+    float epsilon = 0.00001f;
+    const auto* bn_info = matched.at("bn")->stmt()->op_info();
+    if (bn_info->HasAttr("epsilon")) {
+      epsilon = bn_info->GetAttr<float>("epsilon");
+    }
+
+    // Perform preprocess
+    // scale <- scale / sqrt(variance + epsilon)
+    for (int i = 0; i < mean_len; ++i) {
+      scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + epsilon);
+    }
+    // filter[i, :] <- filter[i, :] * scale[i]
+    for (int i = 0; i < mean_len; ++i) {
+      for (int j = 0; j < filter_stride; ++j) {
+        filter_on_host[i * filter_stride + j] *= scale_on_host[i];
+      }
+    }
+    // bias <- bias - mean * scale
+    for (int i = 0; i < mean_len; ++i) {
+      bias_on_host[i] += -mean_on_host[i] * scale_on_host[i];
+    }
+
+    // Convert the folded filter to int16 and copy the result back over the
+    // start of the fp32 buffer of the same tensor.
+    float max_f =
+        paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
+    std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
+    paddle::lite::xpu::math::ConvertFP32ToInt16(
+        filter_on_host, filter_int16.get(), max_f, filter_len);
+    memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
+
+    // create new arg in graph and scope
+    std::string max_filter_name = filter_name + "_max";
+    auto* max_filter_node = graph->NewArgumentNode(max_filter_name);
+    max_filter_node->arg()->is_weight = true;
+    max_filter_node->arg()->type = LiteType::GetTensorTy(
+        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+
+    // The filter max is stored as four identical floats.
+    auto* max_filter_t = scope->NewTensor(max_filter_name);
+    max_filter_t->Resize({4});
+    float* max_ptr = max_filter_t->mutable_data<float>();
+    max_ptr[0] = max_f;
+    max_ptr[1] = max_f;
+    max_ptr[2] = max_f;
+    max_ptr[3] = max_f;
+
+    op_desc.SetInput("Filter", {filter_name});
+    op_desc.SetInput("Bias", {bias_name});
+    op_desc.SetInput("FilterMax", {max_filter_name});
+    op_desc.SetInput("Branch", {matched.at("ew_x")->arg()->name});
+
+    std::string output_name = matched.at("relu_out")->arg()->name;
+    op_desc.SetOutput("Output", {output_name});
+
+    // add new arg output_max
+    std::string max_output_name = output_name + "_max";
+    auto* max_output_node = graph->NewArgumentNode(max_output_name);
+    max_output_node->arg()->type = LiteType::GetTensorTy(
+        TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
+    scope->NewTensor(max_output_name);
+    op_desc.SetOutput("OutputMax", {max_output_name});
+
+    auto conv_op = LiteOpRegistry::Global().Create("__xpu__conv2d");
+    auto& valid_places = conv_old->valid_places();
+    conv_op->Attach(op_desc, scope);
+    auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places);
+    DirectedLink(matched.at("input"), new_op_node);
+    DirectedLink(matched.at("conv_filter"), new_op_node);
+    DirectedLink(matched.at("bn_bias"), new_op_node);
+    DirectedLink(matched.at("ew_x"), new_op_node);
+    DirectedLink(max_filter_node, new_op_node);
+    DirectedLink(new_op_node, matched.at("relu_out"));
+    DirectedLink(new_op_node, max_output_node);
+  }
+};
+
+}  // namespace fusion
+
+// Program pass that runs the three conv2d fusers in a fixed order: the
+// branch variant first, then conv+bn+relu, then plain conv+bn. Does nothing
+// when the XPU_ENABLE_XTCL environment flag is set.
+class XPUConv2dFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
+
+    fusion::XPUConv2dBlock1Fuser fuser; /* branch fuse */
+    fuser(graph.get());
+
+    fusion::XPUConv2dBlock0Fuser fuser1(true /* with_relu */);
+    fuser1(graph.get());
+
+    fusion::XPUConv2dBlock0Fuser fuser2(false /* with_relu */);
+    fuser2(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__conv2d_fuse_pass, paddle::lite::mir::XPUConv2dFusePass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("conv2d");
diff --git a/lite/core/mir/fusion/__xpu__conv2d_link_previous_out_max_pass.cc b/lite/core/mir/fusion/__xpu__conv2d_link_previous_out_max_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d294f1f2f7cb440bad79035353989711a59f89d2
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__conv2d_link_previous_out_max_pass.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include "lite/backends/xpu/math.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+/* link the previous __xpu__conv2d's OutputMax to */
+/* next __xpu__conv2d as InputMax */
+/* For example: */
+/* graph[1]: sub block */
+/*                     in_Input */
+/*        in_Filter       |      in_FilterMax */
+/*                  \     |     / */
+/*                   \    |    / */
+/*    in_Bias ------- __xpu__conv2d */
+/*                        |    \ */
+/*                        |     \ */
+/*                 out_Output   out_OutputMax */
+/*                        | */
+/*                        | */
+/*                  __xpu__conv2d */
+/*                        | */
+/*                        | */
+/*                   out_Output */
+/* */
+/* After the pass is applied: */
+/*                     in_Input */
+/*        in_Filter       |      in_FilterMax */
+/*                  \     |     / */
+/*                   \    |    / */
+/*    in_Bias ------- __xpu__conv2d */
+/*                        |    \ */
+/*                        |     \ */
+/*                 out_Output   out_OutputMax */
+/*                        |    / */
+/*                        |   / */
+/*                  __xpu__conv2d */
+/*                        | */
+/*                        | */
+/*                   out_Output */
+
+// Matches a single __xpu__conv2d op and, when the graph already holds an
+// argument named "<Input name>_max", wires that argument in as the op's
+// "InputMax" input.
+class XPUConv2dLinkFuser : public FuseBase {
+ public:
+  // with_branch: when true the pattern also requires a "Branch" input on
+  // the matched __xpu__conv2d.
+  explicit XPUConv2dLinkFuser(bool with_branch) : _with_branch(with_branch) {}
+
+  // Declares the __xpu__conv2d op with its Input/Filter/FilterMax/Bias
+  // inputs and Output/OutputMax outputs. No node is marked intermediate, so
+  // the match itself removes nothing from the graph.
+  void BuildPattern() override {
+    auto* input = VarNode("input")
+                      ->assert_is_op_input("__xpu__conv2d", "Input")
+                      ->AsInput();
+    auto* filter = VarNode("filter")
+                       ->assert_is_op_input("__xpu__conv2d", "Filter")
+                       ->AsInput();
+    auto* filter_max = VarNode("filter_max")
+                           ->assert_is_op_input("__xpu__conv2d", "FilterMax")
+                           ->AsInput();
+    auto* bias =
+        VarNode("bias")->assert_is_op_input("__xpu__conv2d", "Bias")->AsInput();
+    auto* xpu_conv = OpNode("xpu_conv", "__xpu__conv2d");
+    auto* xpu_conv_out = VarNode("xpu_conv_out")
+                             ->assert_is_op_output("__xpu__conv2d", "Output")
+                             ->AsOutput();
+    auto* xpu_conv_out_max =
+        VarNode("xpu_conv_out_max")
+            ->assert_is_op_output("__xpu__conv2d", "OutputMax")
+            ->AsOutput();
+
+    *input >> *xpu_conv >> *xpu_conv_out;
+    *filter >> *xpu_conv;
+    *filter_max >> *xpu_conv;
+    *bias >> *xpu_conv;
+    *xpu_conv >> *xpu_conv_out_max;
+
+    if (_with_branch) {
+      auto* branch = VarNode("branch")
+                         ->assert_is_op_input("__xpu__conv2d", "Branch")
+                         ->AsInput();
+      *branch >> *xpu_conv;
+    }
+  }
+
+  // Updates the matched op in place (via ResetOp) rather than creating a new
+  // node. The "has_input_max" attribute guards against linking the same op
+  // twice when the fuser is run more than once over the graph.
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto conv_instruct = matched.at("xpu_conv")->stmt();
+    auto op_desc = *conv_instruct->mutable_op_info();
+    auto conv_old = conv_instruct->op();
+
+    // try to find input_max
+    std::string max_input_name = matched.at("input")->arg()->name + "_max";
+    auto* max_input_node = graph->RetrieveArgument(max_input_name);
+    if (max_input_node != nullptr &&
+        (!op_desc.HasAttr("has_input_max") ||
+         !op_desc.GetAttr<bool>("has_input_max"))) {
+      op_desc.SetInput("InputMax", {max_input_name});
+      op_desc.SetAttr("has_input_max", true);
+      conv_instruct->ResetOp(op_desc, conv_old->valid_places());
+      DirectedLink(max_input_node, matched.at("xpu_conv"));
+    }
+  }
+
+ private:
+  bool _with_branch;
+};
+
+}  // namespace fusion
+
+// Program pass that runs the link fuser for the with-branch pattern first
+// and then for the no-branch pattern. Does nothing when the XPU_ENABLE_XTCL
+// environment flag is set.
+class XPUConv2dLinkPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
+
+    fusion::XPUConv2dLinkFuser fuser1(true);
+    fuser1(graph.get());
+
+    // TODO(sunsetlh): need fix bug in no branch case
+    fusion::XPUConv2dLinkFuser fuser2(false);
+    fuser2(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__conv2d_link_previous_out_max_pass,
+                  paddle::lite::mir::XPUConv2dLinkPass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("__xpu__conv2d");
diff --git a/lite/core/mir/fusion/__xpu__sfa_head_meanstd_fuse_pass.cc
b/lite/core/mir/fusion/__xpu__sfa_head_meanstd_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0af60e7fec36e8e21c0a59e30f562821c04f8978
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__sfa_head_meanstd_fuse_pass.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include "lite/backends/xpu/math.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+// Special fuse pass for the subgraph block in vis clarity model
+// block desc:
+// [["reduce_mean",
+//     ["concat"],
+//     ["elementwise_sub",
+//         ["square", ["reduce_sum", ["scale", ["sqrt"]]]]]]]
+//
+// NOTE(review): the description above lists "scale", but the pattern built
+// below matches fill_constant + elementwise_div between reduce_sum and sqrt.
+
+// Replaces the matched mean/std subgraph (reshape2 output -> ... -> concat)
+// with a single __xpu__sfa_head op whose "op_type" attribute is "meanstd".
+class XPUSfaHeadMeanstdFuser : public FuseBase {
+ public:
+  // Declares the subgraph: reduce_mean feeds concat slot 0; the
+  // elementwise_sub -> square -> reduce_sum -> elementwise_div -> sqrt chain
+  // feeds concat slot 1. Everything except the reshape2 output and the
+  // concat output is intermediate.
+  void BuildPattern() override {
+    auto* reduce_mean_input = VarNode("reduce_mean_input")
+                                  ->assert_is_op_output("reshape2", "Out")
+                                  ->assert_is_op_input("reduce_mean", "X")
+                                  ->AsInput();
+    auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate();
+    auto* reduce_mean_out = VarNode("reduce_mean_out")
+                                ->assert_is_op_output("reduce_mean", "Out")
+                                ->assert_is_op_nth_input("concat", "X", 0)
+                                ->assert_is_op_input("elementwise_sub", "Y")
+                                ->AsIntermediate();
+    auto* elementwise_sub =
+        OpNode("elementwise_sub", "elementwise_sub")->AsIntermediate();
+    auto* elementwise_sub_out =
+        VarNode("elementwise_sub_out")
+            ->assert_is_op_output("elementwise_sub", "Out")
+            ->assert_is_op_input("square", "X")
+            ->AsIntermediate();
+    auto* square = OpNode("square", "square")->AsIntermediate();
+    auto* square_out = VarNode("square_out")
+                           ->assert_is_op_output("square", "Out")
+                           ->assert_is_op_input("reduce_sum", "X")
+                           ->AsIntermediate();
+    auto* reduce_sum = OpNode("reduce_sum", "reduce_sum")->AsIntermediate();
+    auto* reduce_sum_out = VarNode("reduce_sum_out")
+                               ->assert_is_op_output("reduce_sum", "Out")
+                               ->assert_is_op_input("elementwise_div", "X")
+                               ->AsIntermediate();
+    auto* fill_constant =
+        OpNode("fill_constant", "fill_constant")->AsIntermediate();
+    auto* fill_constant_out = VarNode("fill_constant_out")
+                                  ->assert_is_op_output("fill_constant", "Out")
+                                  ->AsIntermediate();
+    auto* elementwise_div =
+        OpNode("elementwise_div", "elementwise_div")->AsIntermediate();
+    auto* elementwise_div_out =
+        VarNode("elementwise_div_out")
+            ->assert_is_op_output("elementwise_div", "Out")
+            ->assert_is_op_input("sqrt", "X")
+            ->AsIntermediate();
+    auto* sqrt = OpNode("sqrt", "sqrt")->AsIntermediate();
+    auto* sqrt_out = VarNode("sqrt_out")
+                         ->assert_is_op_output("sqrt", "Out")
+                         ->assert_is_op_nth_input("concat", "X", 1)
+                         ->AsIntermediate();
+    auto* concat = OpNode("concat", "concat")->AsIntermediate();
+    auto* out =
+        VarNode("out")->assert_is_op_output("concat", "Out")->AsOutput();
+
+    std::vector<PMNode*> elementwise_sub_inputs{reduce_mean_out,
+                                                reduce_mean_input};
+    std::vector<PMNode*> elementwise_div_inputs{reduce_sum_out,
+                                                fill_constant_out};
+    std::vector<PMNode*> concat_inputs{reduce_mean_out, sqrt_out};
+    *reduce_mean_input >> *reduce_mean >> *reduce_mean_out;
+    elementwise_sub_inputs >> *elementwise_sub >> *elementwise_sub_out;
+    *elementwise_sub_out >> *square >> *square_out;
+    *square_out >> *reduce_sum >> *reduce_sum_out;
+    *fill_constant >> *fill_constant_out;
+    elementwise_div_inputs >> *elementwise_div >> *elementwise_div_out;
+    *elementwise_div_out >> *sqrt >> *sqrt_out;
+    concat_inputs >> *concat >> *out;
+  }
+
+  // Creates the __xpu__sfa_head op using the scope and valid places of the
+  // matched reduce_mean op, then links it between the subgraph input and the
+  // concat output.
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto reduce_mean = matched.at("reduce_mean")->stmt()->op();
+    auto* scope = reduce_mean->scope();
+    auto op_desc = GenOpDesc(matched);
+    auto vis_op = LiteOpRegistry::Global().Create("__xpu__sfa_head");
+    auto& valid_places = reduce_mean->valid_places();
+    vis_op->Attach(op_desc, scope);
+    auto* new_op_node = graph->GraphCreateInstructNode(vis_op, valid_places);
+
+    IR_NODE_LINK_TO(matched.at("reduce_mean_input"), new_op_node);
+    IR_NODE_LINK_TO(new_op_node, matched.at("out"));
+  }
+
+ private:
+  // Starts from a copy of the reduce_mean op description (so its attributes
+  // carry over), clears its inputs/outputs and fills in the fused op's
+  // type, Input, Output and op_type="meanstd".
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
+    cpp::OpDesc op_desc = *matched.at("reduce_mean")->stmt()->op_info();
+    op_desc.mutable_inputs()->clear();
+    op_desc.mutable_outputs()->clear();
+    op_desc.SetType("__xpu__sfa_head");
+    op_desc.SetInput("Input", {matched.at("reduce_mean_input")->arg()->name});
+    op_desc.SetOutput("Output", {matched.at("out")->arg()->name});
+    op_desc.SetAttr("op_type", std::string("meanstd"));
+    return op_desc;
+  }
+};
+
+}  // namespace fusion
+
+// Program pass wrapper around XPUSfaHeadMeanstdFuser. Does nothing when the
+// XPU_ENABLE_XTCL environment flag is set.
+class XPUSfaHeadMeanstdFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) {
+      return;
+    }
+
+    fusion::XPUSfaHeadMeanstdFuser fuser;
+    fuser(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__sfa_head_meanstd_fuse_pass,
+                  paddle::lite::mir::XPUSfaHeadMeanstdFusePass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("reduce_mean");
diff --git a/lite/core/mir/fusion/__xpu__sfa_head_moment_fuse_pass.cc b/lite/core/mir/fusion/__xpu__sfa_head_moment_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6daf660500e36ef81640a77776573a3fb93ab5c9
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__sfa_head_moment_fuse_pass.cc
@@ -0,0 +1,253 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include "lite/backends/xpu/math.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+// Special fuse pass for the subgraph block in vis clarity model
+// block desc:
+// [["reduce_mean",
+//     ["concat"],
+//     ["elementwise_sub",
+//         ["square", ["reduce_mean", ["sqrt"]]],
+//         ["abs", ["pow", ["elementwise_mul", ["reduce_mean", ["abs",
+//         ["pow"]]]]]],
+//         ["sign"],
+//         ["abs", ["pow", ["reduce_mean", ["abs", ["pow"]]]]]]]]
+
+// Replaces the matched four-branch subgraph (reshape2 output -> ... ->
+// concat) with a single __xpu__sfa_head op whose "op_type" attribute is
+// "moment".
+class XPUSfaHeadMomentFuser : public FuseBase {
+ public:
+  // Declares the subgraph. The four concat inputs are, in slot order:
+  //   0: reduce_mean of the input
+  //   1: sqrt(reduce_mean(square(input - mean)))
+  //   2: the elementwise_mul ("bottom") of a pow branch and a sign branch
+  //   3: the abs -> pow -> reduce_mean -> abs -> pow chain
+  // Everything except the reshape2 output and the concat output is
+  // intermediate.
+  void BuildPattern() override {
+    auto* reduce_mean_input = VarNode("reduce_mean_input")
+                                  ->assert_is_op_output("reshape2", "Out")
+                                  ->assert_is_op_input("reduce_mean", "X")
+                                  ->assert_is_op_input("elementwise_sub", "X")
+                                  ->AsInput();
+    auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate();
+
+    auto* reduce_mean_out = VarNode("reduce_mean_out")
+                                ->assert_is_op_output("reduce_mean", "Out")
+                                ->assert_is_op_nth_input("concat", "X", 0)
+                                ->assert_is_op_input("elementwise_sub", "Y")
+                                ->AsIntermediate();
+
+    auto* elementwise_sub =
+        OpNode("elementwise_sub", "elementwise_sub")->AsIntermediate();
+    auto* elementwise_sub_out =
+        VarNode("elementwise_sub_out")
+            ->assert_is_op_output("elementwise_sub", "Out")
+            ->assert_is_op_input("square", "X")
+            ->assert_is_op_input("abs", "X")
+            ->assert_is_op_input("sign", "X")
+            ->AsIntermediate();
+
+    auto* square = OpNode("square", "square")->AsIntermediate();
+
+    auto* square_out = VarNode("square_out")
+                           ->assert_is_op_output("square", "Out")
+                           ->assert_is_op_input("reduce_mean", "X")
+                           ->AsIntermediate();
+    auto* reduce_mean_es =
+        OpNode("es_reduce_mean", "reduce_mean")->AsIntermediate();
+    auto* reduce_mean_out_es = VarNode("reduce_mean_out_es")
+                                   ->assert_is_op_output("reduce_mean", "Out")
+                                   ->assert_is_op_input("sqrt", "X")
+                                   ->AsIntermediate();
+    auto* sqrt = OpNode("sqrt", "sqrt")->AsIntermediate();
+    auto* sqrt_out = VarNode("sqrt_out")
+                         ->assert_is_op_output("sqrt", "Out")
+                         ->assert_is_op_nth_input("concat", "X", 1)
+                         ->AsIntermediate();
+    auto* concat = OpNode("concat", "concat")->AsIntermediate();
+    auto* out =
+        VarNode("out")->assert_is_op_output("concat", "Out")->AsOutput();
+
+    auto* abs_e2 = OpNode("e2_abs", "abs")->AsIntermediate();
+    auto* abs_e2_out = VarNode("abs_e2_out")
+                           ->assert_is_op_input("pow", "X")
+                           ->assert_is_op_output("abs", "Out")
+                           ->AsIntermediate();
+
+    auto* pow_e2 = OpNode("e2_pow", "pow")->AsIntermediate();
+    auto* pow_e2_out = VarNode("pow_e2_out")
+                           ->assert_is_op_input("elementwise_mul", "X")
+                           ->assert_is_op_output("pow", "Out")
+                           ->AsIntermediate();
+
+    auto* sign_e3 = OpNode("e3_sign", "sign")->AsIntermediate();
+    auto* sign_e3_out = VarNode("sign_e3_out")
+                            ->assert_is_op_input("elementwise_mul", "Y")
+                            ->assert_is_op_output("sign", "Out")
+                            ->AsIntermediate();
+
+    auto* elementwise_mul_top =
+        OpNode("elementwise_mul_top", "elementwise_mul")->AsIntermediate();
+    auto* elementwise_mul_top_out =
+        VarNode("elementwise_mul_top_out")
+            ->assert_is_op_input("reduce_mean", "X")
+            ->assert_is_op_output("elementwise_mul", "Out")
+            ->AsIntermediate();
+    auto* reduce_mean_e2 =
+        OpNode("reduce_mean_e2", "reduce_mean")->AsIntermediate();
+    auto* reduce_mean_e2_out = VarNode("reduce_mean_e2_out")
+                                   ->assert_is_op_input("abs", "X")
+                                   ->assert_is_op_input("sign", "X")
+                                   ->assert_is_op_output("reduce_mean", "Out")
+                                   ->AsIntermediate();
+    auto* abs_e2_2 = OpNode("abs_e2_2", "abs")->AsIntermediate();
+    auto* abs_e2_2_out = VarNode("abs_e2_2_out")
+                             ->assert_is_op_input("pow", "X")
+                             ->assert_is_op_output("abs", "Out")
+                             ->AsIntermediate();
+    auto* pow_e2_2 = OpNode("pow_e2_2", "pow")->AsIntermediate();
+    auto* pow_e2_2_out =
+        VarNode("pow_e2_2_out")
+            ->assert_is_op_nth_input("elementwise_mul", "X", 0)
+            ->assert_is_op_output("pow", "Out")
+            ->AsIntermediate();
+    auto* sign_e3_2 = OpNode("sign_e3_2", "sign")->AsIntermediate();
+    auto* sign_e3_2_out = VarNode("sign_e3_2_out")
+                              ->assert_is_op_input("elementwise_mul", "Y")
+                              ->assert_is_op_output("sign", "Out")
+                              ->AsIntermediate();
+    auto* elementwise_mul_bottom =
+        OpNode("elementwise_mul_bottom", "elementwise_mul")->AsIntermediate();
+    auto* elementwise_mul_bottom_out =
+        VarNode("elementwise_mul_bottom_out")
+            ->assert_is_op_output("elementwise_mul", "Out")
+            ->assert_is_op_nth_input("concat", "X", 2)
+            ->AsIntermediate();
+
+    // e4
+    auto* abs_e_4 = OpNode("abs_e_4", "abs")->AsIntermediate();
+    auto* abs_e_4_out = VarNode("abs_e_4_out")
+                            ->assert_is_op_output("abs", "Out")
+                            ->assert_is_op_input("pow", "X")
+                            ->AsIntermediate();
+    auto* pow_e_4 = OpNode("pow_e_4", "pow")->AsIntermediate();
+    auto* pow_e_4_out = VarNode("pow_e_4_out")
+                            ->assert_is_op_output("pow", "Out")
+                            ->assert_is_op_input("reduce_mean", "X")
+                            ->AsIntermediate();
+    // Constrain the op type explicitly, like every other op node in this
+    // pattern, instead of relying on the neighbouring var assertions.
+    auto* reduce_mean_4 =
+        OpNode("reduce_mean_4", "reduce_mean")->AsIntermediate();
+    auto* reduce_mean_4_out = VarNode("reduce_mean_4_out")
+                                  ->assert_is_op_output("reduce_mean", "Out")
+                                  ->assert_is_op_input("abs", "X")
+                                  ->AsIntermediate();
+
+    auto* abs_e_4_2 = OpNode("abs_e_4_2", "abs")->AsIntermediate();
+    auto* abs_e_4_2_out = VarNode("abs_e_4_2_out")
+                              ->assert_is_op_output("abs", "Out")
+                              ->assert_is_op_input("pow", "X")
+                              ->AsIntermediate();
+
+    auto* pow_e_4_2 = OpNode("pow_e_4_2", "pow")->AsIntermediate();
+    auto* pow_e_4_2_out = VarNode("pow_e_4_2_out")
+                              ->assert_is_op_output("pow", "Out")
+                              ->assert_is_op_nth_input("concat", "X", 3)
+                              ->AsIntermediate();
+
+    std::vector<PMNode*> elementwise_sub_inputs{reduce_mean_input,
+                                                reduce_mean_out};
+
+    *reduce_mean_input >> *reduce_mean >> *reduce_mean_out;
+    elementwise_sub_inputs >> *elementwise_sub >> *elementwise_sub_out;
+    *elementwise_sub_out >> *square >> *square_out;
+    *square_out >> *reduce_mean_es >> *reduce_mean_out_es;
+    *reduce_mean_out_es >> *sqrt >> *sqrt_out;
+
+    *elementwise_sub_out >> *sign_e3 >> *sign_e3_out;
+
+    std::vector<PMNode*> elementwise_mul_top_inputs{pow_e2_out, sign_e3_out};
+    *elementwise_sub_out >> *abs_e2 >> *abs_e2_out;
+    *abs_e2_out >> *pow_e2 >> *pow_e2_out;
+    elementwise_mul_top_inputs >> *elementwise_mul_top >>
+        *elementwise_mul_top_out;
+
+    *elementwise_mul_top_out >> *reduce_mean_e2 >> *reduce_mean_e2_out;
+    *reduce_mean_e2_out >> *abs_e2_2 >> *abs_e2_2_out;
+    *abs_e2_2_out >> *pow_e2_2 >> *pow_e2_2_out;
+
+    *reduce_mean_e2_out >> *sign_e3_2 >> *sign_e3_2_out;
+
+    std::vector<PMNode*> elementwise_mul_bottom_inputs{pow_e2_2_out,
+                                                       sign_e3_2_out};
+    elementwise_mul_bottom_inputs >> *elementwise_mul_bottom >>
+        *elementwise_mul_bottom_out;
+
+    *elementwise_sub_out >> *abs_e_4 >> *abs_e_4_out;
+    *abs_e_4_out >> *pow_e_4 >> *pow_e_4_out;
+    *pow_e_4_out >> *reduce_mean_4 >> *reduce_mean_4_out;
+    *reduce_mean_4_out >> *abs_e_4_2 >> *abs_e_4_2_out;
+    *abs_e_4_2_out >> *pow_e_4_2 >> *pow_e_4_2_out;
+
+    std::vector<PMNode*> concat_inputs{
+        reduce_mean_out, sqrt_out, elementwise_mul_bottom_out, pow_e_4_2_out};
+    concat_inputs >> *concat >> *out;
+  }
+
+  // Creates the __xpu__sfa_head op using the scope and valid places of the
+  // matched reduce_mean op, then links it between the subgraph input and the
+  // concat output.
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto reduce_mean = matched.at("reduce_mean")->stmt()->op();
+    auto* scope = reduce_mean->scope();
+    auto op_desc = GenOpDesc(matched);
+    auto vis_op = LiteOpRegistry::Global().Create("__xpu__sfa_head");
+    auto& valid_places = reduce_mean->valid_places();
+    vis_op->Attach(op_desc, scope);
+    auto* new_op_node = graph->GraphCreateInstructNode(vis_op, valid_places);
+
+    IR_NODE_LINK_TO(matched.at("reduce_mean_input"), new_op_node);
+    IR_NODE_LINK_TO(new_op_node, matched.at("out"));
+  }
+
+ private:
+  // Starts from a copy of the reduce_mean op description (so its attributes
+  // carry over), clears its inputs/outputs and fills in the fused op's
+  // type, Input, Output and op_type="moment".
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
+    cpp::OpDesc op_desc = *matched.at("reduce_mean")->stmt()->op_info();
+    op_desc.mutable_inputs()->clear();
+    op_desc.mutable_outputs()->clear();
+    op_desc.SetType("__xpu__sfa_head");
+    op_desc.SetInput("Input", {matched.at("reduce_mean_input")->arg()->name});
+    op_desc.SetOutput("Output", {matched.at("out")->arg()->name});
+    op_desc.SetAttr("op_type", std::string("moment"));
+    return op_desc;
+  }
+};
+
+}  // namespace fusion
+
+// Program pass wrapper around XPUSfaHeadMomentFuser. Does nothing when the
+// XPU_ENABLE_XTCL environment flag is set.
+class XPUSfaHeadMomentFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) {
+      return;
+    }
+
+    fusion::XPUSfaHeadMomentFuser fuser;
+    fuser(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__sfa_head_moment_fuse_pass,
+                  paddle::lite::mir::XPUSfaHeadMomentFusePass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("reduce_mean");
diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 98b1597b49b9a7e151c86d11843e45163890191a..c68bd1161457eca2e7f280e895b8e5aee2498fc8 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -122,7 +122,15 @@ std::string Visualize(mir::SSAGraph* graph) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); } - dot.AddEdge(var_name, op_name, {}); + std::vector attrs; + std::string arg_name; + if (op_info->GetInputArgname(var_name, &arg_name)) { + attrs.emplace_back("label", arg_name); + } else { + VLOG(5) << "Can not find the input argument for var " << var_name + << " in " << op_type; + } + dot.AddEdge(var_name, op_name, attrs); } for (auto& x : node->outlinks) { std::string var_name; @@ -136,7 +144,15 @@ std::string
Visualize(mir::SSAGraph* graph) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); } - dot.AddEdge(op_name, var_name, {}); + std::vector attrs; + std::string arg_name; + if (op_info->GetOutputArgname(var_name, &arg_name)) { + attrs.emplace_back("label", arg_name); + } else { + VLOG(5) << "Can not find the output argument for var " << var_name + << " in " << op_type; + } + dot.AddEdge(op_name, var_name, attrs); } // Output its all of attributes(name and values) os << "* " << op_name << "\n"; diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 6b18e929c077699a723b9dd9db313370d061cbb8..2dfc444a26ffe013ad05c81a003dd073cc133177 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -109,6 +109,10 @@ class Optimizer { "identity_dropout_eliminate_pass", "__xpu__resnet_fuse_pass", "__xpu__resnet_cbam_fuse_pass", + "__xpu__conv2d_fuse_pass", + "__xpu__conv2d_link_previous_out_max_pass", + "__xpu__sfa_head_meanstd_fuse_pass", + "__xpu__sfa_head_moment_fuse_pass", "__xpu__mmdnn_fuse_pass", "__xpu__multi_encoder_fuse_pass", "__xpu__embedding_with_eltwise_add_fuse_pass", diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index c3190716d1d7937933b83516b784d7128084227e..864f2938af6aefd57185a61831e067d56908a892 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -90,8 +90,6 @@ add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute. 
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm) diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index 99e4739ca0350e725458cd77721f9312974584b0..0cc7b5b302907abbe2e8d2ebadbb3a358cc998d9 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -8,6 +8,8 @@ add_kernel(unsqueeze_compute_host Host basic SRCS unsqueeze_compute.cc DEPS ${li add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) add_kernel(expand_compute_host Host basic SRCS expand_compute.cc DEPS ${lite_kernel_deps}) add_kernel(expand_as_compute_host Host basic SRCS expand_as_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(fill_constant_compute_host Host basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(fill_constant_batch_size_like_compute_host Host basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps}) add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps}) add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps}) add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps}) diff --git 
a/lite/kernels/arm/fill_constant_batch_size_like_compute.cc b/lite/kernels/host/fill_constant_batch_size_like_compute.cc similarity index 84% rename from lite/kernels/arm/fill_constant_batch_size_like_compute.cc rename to lite/kernels/host/fill_constant_batch_size_like_compute.cc index 3a8a09020f11e9cc84dc4891512b6581372e7085..13725eb707778cd04fc386a2c92f6199cee3860a 100644 --- a/lite/kernels/arm/fill_constant_batch_size_like_compute.cc +++ b/lite/kernels/host/fill_constant_batch_size_like_compute.cc @@ -12,16 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/fill_constant_batch_size_like_compute.h" +#include "lite/kernels/host/fill_constant_batch_size_like_compute.h" namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { void FillConstantBatchSizeLikeCompute::Run() { auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); if (param.dtype == static_cast(lite::core::FluidType::FP32)) { auto data = param.out->template mutable_data(); @@ -50,18 +49,18 @@ void FillConstantBatchSizeLikeCompute::Run() { } } -} // namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle REGISTER_LITE_KERNEL( fill_constant_batch_size_like, - kARM, + kHost, kAny, kNCHW, - paddle::lite::kernels::arm::FillConstantBatchSizeLikeCompute, + paddle::lite::kernels::host::FillConstantBatchSizeLikeCompute, def) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/fill_constant_batch_size_like_compute.h b/lite/kernels/host/fill_constant_batch_size_like_compute.h similarity index 91% rename from 
lite/kernels/arm/fill_constant_batch_size_like_compute.h rename to lite/kernels/host/fill_constant_batch_size_like_compute.h index 23aa64bb6417ae1ed0b551520096cf6401ec702c..b6f63fc2d6401a7705c04725ed12ad622ed9a728 100644 --- a/lite/kernels/arm/fill_constant_batch_size_like_compute.h +++ b/lite/kernels/host/fill_constant_batch_size_like_compute.h @@ -19,10 +19,10 @@ namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { class FillConstantBatchSizeLikeCompute - : public KernelLite { + : public KernelLite { public: using param_t = operators::FillConstantBatchSizeLikeParam; @@ -31,7 +31,7 @@ class FillConstantBatchSizeLikeCompute ~FillConstantBatchSizeLikeCompute() {} }; -} // namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/kernels/arm/fill_constant_compute.cc b/lite/kernels/host/fill_constant_compute.cc similarity index 81% rename from lite/kernels/arm/fill_constant_compute.cc rename to lite/kernels/host/fill_constant_compute.cc index 3d8fb9aee83dcaaa39bc94e98e8487c1bf0bf15c..61ef26a5a9c405904b873ccbb72eb01be27a4f3a 100644 --- a/lite/kernels/arm/fill_constant_compute.cc +++ b/lite/kernels/host/fill_constant_compute.cc @@ -12,16 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/fill_constant_compute.h" +#include "lite/kernels/host/fill_constant_compute.h" namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { void FillConstantCompute::Run() { auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); if (param.dtype == static_cast(lite::core::FluidType::FP32)) { auto data = param.out->template mutable_data(); @@ -50,21 +49,21 @@ void FillConstantCompute::Run() { } } -} // namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle // float REGISTER_LITE_KERNEL(fill_constant, - kARM, + kHost, kAny, kNCHW, - paddle::lite::kernels::arm::FillConstantCompute, + paddle::lite::kernels::host::FillConstantCompute, def) .BindInput("ShapeTensor", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) .BindInput("ShapeTensorList", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/fill_constant_compute.h b/lite/kernels/host/fill_constant_compute.h similarity index 88% rename from lite/kernels/arm/fill_constant_compute.h rename to lite/kernels/host/fill_constant_compute.h index 7717c4c2628cff5358cc2011c01cb4b02ee125dc..7a2450d41751c7ccd9a865ed07f27f72bb60a1de 100644 --- a/lite/kernels/arm/fill_constant_compute.h +++ b/lite/kernels/host/fill_constant_compute.h @@ -19,9 +19,9 @@ namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { -class FillConstantCompute : public KernelLite { +class FillConstantCompute : public KernelLite { public: using param_t = operators::FillConstantParam; @@ -30,7 +30,7 @@ class FillConstantCompute : public KernelLite { ~FillConstantCompute() {} }; -} // 
namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index fdb485df02f366f7f4868965b1f20c6861b03d43..29f14c8f3ea10a26f737211e4702103239272853 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -24,6 +24,9 @@ else() add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps}) add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc DEPS ${lite_kernel_deps}) add_kernel(search_fc_compute_xpu XPU basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(reshape_compute_xpu XPU basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(reduce_mean_compute_xpu XPU basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(reduce_sum_compute_xpu XPU basic SRCS reduce_sum_compute.cc DEPS ${lite_kernel_deps}) # extra add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) @@ -44,4 +47,6 @@ else() add_kernel(__xpu__fc_compute_xpu XPU extra SRCS __xpu__fc_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__search_attention_compute_xpu XPU extra SRCS __xpu__search_attention_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__mmdnn_compute_xpu XPU extra SRCS __xpu__mmdnn_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__conv2d_compute_xpu XPU extra SRCS __xpu__conv2d_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__sfa_head_compute_xpu XPU extra SRCS __xpu__sfa_head_compute.cc DEPS ${lite_kernel_deps}) endif() diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.cc b/lite/kernels/xpu/__xpu__conv2d_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d73832937cf0e5f83d9e82ca769ddcd86e06cad --- /dev/null +++ b/lite/kernels/xpu/__xpu__conv2d_compute.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/__xpu__conv2d_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUConv2dCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& input_dims = param.Input->dims(); + auto& filter_dims = param.Filter->dims(); + int batch = static_cast(input_dims[0]); + int img_c = static_cast(input_dims[1]); + int img_h = static_cast(input_dims[2]); + int img_w = static_cast(input_dims[3]); + int filter_num = static_cast(filter_dims[0]); + int win_h = static_cast(filter_dims[2]); + int win_w = static_cast(filter_dims[3]); + + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int stride_h = param.strides[0]; + int stride_w = param.strides[1]; + int paddings_h = paddings[0]; + int paddings_w = paddings[1]; + int dilations_h = dilations[0]; + int dilations_w = dilations[1]; + + std::string filter_type = param.filter_type; + int groups = param.groups; + + int act_type = (param.act_type == -1) ? xdnn::Activation_t::RELU + : param.act_type; // -1 means not init + const auto* bias = param.Bias ? param.Bias->data() : nullptr; + const auto* branch = param.Branch ? param.Branch->data() : nullptr; + const float* input_max = + param.InputMax ? 
param.InputMax->data() : nullptr; + float* output_max = param.OutputMax + ? param.OutputMax->mutable_data(TARGET(kXPU)) + : nullptr; + float* output = param.Output->mutable_data(TARGET(kXPU)); + + // TODO(luohang): now support for resnet50 first + CHECK_EQ(act_type, xdnn::Activation_t::RELU); + CHECK_EQ(groups, 1); + CHECK_EQ(filter_type, "int16"); + + xdnn::Activation_t act((xdnn::Activation_t::act_enum)act_type); + int r = xdnn::conv2d_forward_int16( + ctx.GetRawContext(), /* context */ + batch, /* batch */ + img_c, /* input_c */ + img_h, /* input_h */ + img_w, /* input_w */ + filter_num, /* num_filter */ + win_h, /* kernel_h */ + win_w, /* kernel_w */ + stride_h, /* stride_h */ + stride_w, /* stride_w */ + paddings_h, /* pad_h */ + paddings_w, /* pad_w */ + dilations_h, /* dilation_h */ + dilations_w, /* dilation_w */ + groups, /* group */ + param.Input->data(), /* input bottom */ + param.Filter->data(), /* filter weight */ + output, /* output top */ + bias, /* bias */ + branch, /* branch */ + act, /* act type */ + input_max, /* max_image_ptr */ + param.FilterMax->data(), /* max_filter_ptr */ + output_max /* max_result_ptr */); + + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUConv2dCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("FilterMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Branch", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.h b/lite/kernels/xpu/__xpu__conv2d_compute.h 
new file mode 100644 index 0000000000000000000000000000000000000000..0d3b3217c589948d0515eb410e7ee70e1f2b028c --- /dev/null +++ b/lite/kernels/xpu/__xpu__conv2d_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUConv2dCompute : public KernelLite { + public: + using param_t = operators::XPUConv2dParam; + + virtual void Run(); + + virtual ~XPUConv2dCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/__xpu__sfa_head_compute.cc b/lite/kernels/xpu/__xpu__sfa_head_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..9721e4e13377eab236a775b0301b7dfac1e15752 --- /dev/null +++ b/lite/kernels/xpu/__xpu__sfa_head_compute.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/__xpu__sfa_head_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUSfaHeadCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + std::string vis_type = param.op_type; + auto input = param.input; + + const int batch = static_cast(input->dims()[0]); + const int m = static_cast(input->dims()[1]); + const int n = static_cast(input->dims()[2]); + if (vis_type == "meanstd") { + int r = xdnn::vis_meanstd(ctx.GetRawContext(), + param.input->data(), + param.output->mutable_data(TARGET(kXPU)), + batch, + m, + n); + CHECK_EQ(r, 0) << "XPU kernel error"; + (void)param.output->mutable_data(); + } else if (vis_type == "moment") { + int r = xdnn::vis_moment(ctx.GetRawContext(), + param.input->data(), + param.output->mutable_data(TARGET(kXPU)), + batch, + m, + n); + CHECK_EQ(r, 0) << "XPU kernel error"; + } else { + LOG(FATAL) << "vis xpu op not supported type " << vis_type.c_str(); + } +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__sfa_head, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUSfaHeadCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__sfa_head_compute.h b/lite/kernels/xpu/__xpu__sfa_head_compute.h new file mode 100644 index 
0000000000000000000000000000000000000000..d869ae2ebf3f50bcb70294986d423dade0e78458 --- /dev/null +++ b/lite/kernels/xpu/__xpu__sfa_head_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUSfaHeadCompute : public KernelLite { + public: + using param_t = operators::XPUSfaHeadParam; + + virtual void Run(); + + virtual ~XPUSfaHeadCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/activation_compute.cc b/lite/kernels/xpu/activation_compute.cc index a46b33252e40a56299ebc7d0f133520a04b7cb20..fa20cbd60b37a0ebcc1c708daefcfff316465227 100644 --- a/lite/kernels/xpu/activation_compute.cc +++ b/lite/kernels/xpu/activation_compute.cc @@ -60,6 +60,71 @@ void SigmoidCompute::Run() { CHECK_EQ(r, 0); } +void AbsCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int r = xdnn::activation_forward( + ctx.GetRawContext(), /* context */ + xdnn::Activation_t::ABS, /* type */ + param.X->numel(), /* len */ + param.X->data(), /* x */ + param.Out->mutable_data(TARGET(kXPU)) /* y */); + CHECK_EQ(r, 0); +} + +void SquareCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int r = xdnn::activation_forward( + 
ctx.GetRawContext(), /* context */ + xdnn::Activation_t::SQUARE, /* type */ + param.X->numel(), /* len */ + param.X->data(), /* x */ + param.Out->mutable_data(TARGET(kXPU)) /* y */); + CHECK_EQ(r, 0); +} + +void SqrtCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int r = xdnn::activation_forward( + ctx.GetRawContext(), /* context */ + xdnn::Activation_t::SQRT, /* type */ + param.X->numel(), /* len */ + param.X->data(), /* x */ + param.Out->mutable_data(TARGET(kXPU)) /* y */); + CHECK_EQ(r, 0); +} + +void PowCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int r = xdnn::activation_forward( + ctx.GetRawContext(), /* context */ + xdnn::Activation_t::ACT_POW, /* type */ + param.X->numel(), /* len */ + param.X->data(), /* x */ + param.Out->mutable_data(TARGET(kXPU)) /* y */); + CHECK_EQ(r, 0); +} + +void SignCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int r = xdnn::activation_forward( + ctx.GetRawContext(), /* context */ + xdnn::Activation_t::SIGN, /* type */ + param.X->numel(), /* len */ + param.X->data(), /* x */ + param.Out->mutable_data(TARGET(kXPU)) /* y */); + CHECK_EQ(r, 0); +} + } // namespace xpu } // namespace kernels } // namespace lite @@ -86,3 +151,33 @@ REGISTER_LITE_KERNEL(sigmoid, .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL( + abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + square, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SquareCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + sqrt, kXPU, kFloat, kNCHW, 
paddle::lite::kernels::xpu::SqrtCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + pow, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::PowCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + sign, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SignCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h index f2ad667886ac33191687b70aa7548050461545e7..df4a5d3f8d9cbebdc3ac63a91602b370b48ee629 100644 --- a/lite/kernels/xpu/activation_compute.h +++ b/lite/kernels/xpu/activation_compute.h @@ -48,6 +48,51 @@ class SigmoidCompute : public KernelLite { virtual ~SigmoidCompute() = default; }; +class AbsCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + virtual void Run(); + + virtual ~AbsCompute() = default; +}; + +class SquareCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + virtual void Run(); + + virtual ~SquareCompute() = default; +}; + +class SqrtCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + virtual void Run(); + + virtual ~SqrtCompute() = default; +}; + +class PowCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + virtual void Run(); + + virtual ~PowCompute() = default; +}; + +class SignCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + virtual void Run(); + + virtual ~SignCompute() = default; +}; + } // namespace xpu } // namespace kernels } // namespace lite diff --git a/lite/kernels/xpu/elementwise_compute.cc b/lite/kernels/xpu/elementwise_compute.cc 
index e37337948bf639832ea936de2b5b929d26f534cc..b7d3588a3ed18589c6ec7601992b7ba468842429 100644 --- a/lite/kernels/xpu/elementwise_compute.cc +++ b/lite/kernels/xpu/elementwise_compute.cc @@ -76,6 +76,59 @@ void ElementwiseSubCompute::Run() { } } +void ElementwiseDivCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& x_dims = param.X->dims().data(); + auto& y_dims = param.Y->dims(); + int axis = param.axis; + if (param.axis == -1) { + axis = x_dims.size() - y_dims.size(); + } + int iter = std::accumulate( + x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies()); + int stride = param.Y->numel(); + + for (int i = 0; i < iter; ++i) { + const float* x_ptr = param.X->data() + i * stride; + const float* y_ptr = param.Y->data(); + float* o_ptr = param.Out->mutable_data(TARGET(kXPU)) + i * stride; + int r = xdnn::elementwise_div(ctx.GetRawContext(), /* context */ + x_ptr, /* x */ + y_ptr, /* y */ + o_ptr, /* z */ + stride /* len */); + CHECK_EQ(r, 0); + } +} + +void ElementwiseMulCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& x_dims = param.X->dims().data(); + auto& y_dims = param.Y->dims(); + int axis = param.axis; + if (param.axis == -1) { + axis = x_dims.size() - y_dims.size(); + } + int iter = std::accumulate( + x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies()); + int stride = param.Y->numel(); + + for (int i = 0; i < iter; ++i) { + const float* x_ptr = param.X->data() + i * stride; + const float* y_ptr = param.Y->data(); + float* o_ptr = param.Out->mutable_data(TARGET(kXPU)) + i * stride; + int r = xdnn::elementwise_mul(ctx.GetRawContext(), /* context */ + x_ptr, /* x */ + y_ptr, /* y */ + o_ptr, /* z */ + stride /* len */); + CHECK_EQ(r, 0); + } +} } // namespace xpu } // namespace kernels } // namespace lite @@ -102,3 +155,25 @@ REGISTER_LITE_KERNEL(elementwise_sub, .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_div, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ElementwiseDivCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ElementwiseMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/elementwise_compute.h b/lite/kernels/xpu/elementwise_compute.h index d910b9293e74428c426d9505245bc5958fc9df3a..6cf75486d791bfc69fe9ba6d4b54e89cbbb56ff5 100644 --- a/lite/kernels/xpu/elementwise_compute.h +++ b/lite/kernels/xpu/elementwise_compute.h @@ -41,6 +41,26 @@ class ElementwiseSubCompute virtual ~ElementwiseSubCompute() = default; }; +class ElementwiseDivCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + virtual void Run(); + + virtual ~ElementwiseDivCompute() = default; +}; + +class ElementwiseMulCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + virtual void Run(); + + virtual ~ElementwiseMulCompute() = default; +}; + } // namespace xpu } // namespace kernels } // namespace lite diff --git a/lite/kernels/xpu/reduce_mean_compute.cc b/lite/kernels/xpu/reduce_mean_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e2127c8a928f4587c093877477e9c155399284f --- /dev/null +++ b/lite/kernels/xpu/reduce_mean_compute.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/reduce_mean_compute.h"
+#include
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void ReduceMeanCompute::Run() {
+  auto& param = Param();
+  auto& ctx = this->ctx_->As();
+  const float* input = param.X->data();
+  auto x_dims = param.X->dims();
+  int x_rank = x_dims.size();
+  float* output = param.Out->mutable_data(TARGET(kXPU));
+  auto reduce_dim = param.dim;  // axes to reduce, forwarded as-is to xdnn
+
+  std::vector idims;  // X's dims copied into a flat array for xdnn::reduce
+  for (int i = 0; i < x_rank; i++) {
+    idims.push_back(x_dims[i]);
+  }
+
+  auto type = xdnn::ReduceOp::REDUCE_MEAN;
+  int r = xdnn::reduce(ctx.GetRawContext(),
+                       input,
+                       output,
+                       idims.data(),
+                       x_rank,
+                       reduce_dim.data(),
+                       reduce_dim.size(),
+                       type);
+  CHECK_EQ(r, 0);  // non-zero xdnn status aborts
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(reduce_mean,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::ReduceMeanCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/reduce_mean_compute.h b/lite/kernels/xpu/reduce_mean_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bd1033122c7f7da325cc938a08cb550bb71eeb3
--- /dev/null
+++ b/lite/kernels/xpu/reduce_mean_compute.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class ReduceMeanCompute : public KernelLite {
+ public:
+  void Run() override;  // implemented in reduce_mean_compute.cc
+
+  virtual ~ReduceMeanCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/reduce_sum_compute.cc b/lite/kernels/xpu/reduce_sum_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..27dc18b3fa553fcf4edef697083b02726c9d2e58
--- /dev/null
+++ b/lite/kernels/xpu/reduce_sum_compute.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/reduce_sum_compute.h"
+#include
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void ReduceSumCompute::Run() {
+  auto& param = Param();
+  auto& ctx = this->ctx_->As();
+  const float* input = param.x->data();
+  float* output = param.output->mutable_data(TARGET(kXPU));
+  bool reduce_all = param.reduce_all;
+
+  if (reduce_all) {  // whole tensor: one xdnn::sum over numel() elements
+    int input_len = param.x->numel();
+    int r = xdnn::sum(ctx.GetRawContext(), input, output, input_len);
+    CHECK_EQ(r, 0);
+  } else {  // only the axes listed in param.dim are reduced
+    auto x_dims = param.x->dims();
+    int x_rank = x_dims.size();
+    auto reduce_dim = param.dim;
+    auto rdim = reduce_dim.size();
+
+    std::vector idims;  // x's dims copied into a flat array for xdnn::reduce
+    for (int i = 0; i < x_rank; i++) {
+      idims.push_back(x_dims[i]);
+    }
+
+    auto type = xdnn::ReduceOp::REDUCE_SUM;
+    int r = xdnn::reduce(ctx.GetRawContext(),
+                         input,
+                         output,
+                         idims.data(),
+                         x_rank,
+                         reduce_dim.data(),
+                         rdim,
+                         type);
+    CHECK_EQ(r, 0);  // non-zero xdnn status aborts
+  }
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(reduce_sum,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::ReduceSumCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/reduce_sum_compute.h b/lite/kernels/xpu/reduce_sum_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..28d9f8eb7b438e390928b82f83e623e9d19d8f47
--- /dev/null
+++ b/lite/kernels/xpu/reduce_sum_compute.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class ReduceSumCompute : public KernelLite {
+ public:
+  void Run() override;  // implemented in reduce_sum_compute.cc
+
+  virtual ~ReduceSumCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/reshape_compute.cc b/lite/kernels/xpu/reshape_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c428d5d1dfd408dc184bee32966c293da4f4e99b
--- /dev/null
+++ b/lite/kernels/xpu/reshape_compute.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/reshape_compute.h"
+#include "lite/core/op_registry.h"
+
+REGISTER_LITE_KERNEL(reshape2,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::Reshape2Compute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();  // the kernel body itself lives in reshape_compute.h
diff --git a/lite/kernels/xpu/reshape_compute.h b/lite/kernels/xpu/reshape_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..289fbf4120eceea012723995accf3eea3ab21268
--- /dev/null
+++ b/lite/kernels/xpu/reshape_compute.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+template
+class Reshape2Compute : public KernelLite {
+ public:
+  using param_t = operators::ReshapeParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable();
+    auto x = param.x;
+    auto output = param.output;
+    auto xshape = param.xshape;
+    auto x_dims = x->dims();
+    auto x_dims_data = x_dims.Vectorize();
+    auto out_dims = output->dims();  // saved: re-applied after ShareDataWith
+    output->ShareDataWith(*x);  // presumably aliases x's buffer - TODO confirm
+    output->Resize(out_dims);
+    auto* xshape_data = xshape->mutable_data(TARGET(kXPU));
+    TargetWrapperXPU::MemcpySync(xshape_data,  // copy x's dims host->device
+                                 x_dims_data.data(),
+                                 x_dims.size() * sizeof(int64_t),
+                                 IoDirection::HtoD);
+  }
+
+  virtual ~Reshape2Compute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt
index 2099958960050769c0d9c5c6df2f074919d3d701..02377aad498a47cff50c3a595f6fb1634a56b5ff 100644
--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -53,6 +53,8 @@ add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS})
 add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS})
 add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS})
 add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS})
+add_operator(pow_op extra SRCS pow_op.cc DEPS ${op_DEPS})
+add_operator(sign_op extra SRCS sign_op.cc DEPS ${op_DEPS})
 
 # 2.basic ops not used in basic models
 add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS})
@@ -177,6 +179,9 @@ add_operator(__xpu__fc_op extra SRCS __xpu__fc_op.cc DEPS ${op_DEPS})
 add_operator(__xpu__resnet_cbam_op extra SRCS __xpu__resnet_cbam_op.cc DEPS ${op_DEPS})
 add_operator(__xpu__search_attention_op extra SRCS __xpu__search_attention_op.cc DEPS ${op_DEPS})
 add_operator(__xpu__mmdnn_op extra SRCS __xpu__mmdnn_op.cc DEPS ${op_DEPS})
+add_operator(__xpu__conv2d_op extra SRCS __xpu__conv2d_op.cc DEPS ${op_DEPS})
+add_operator(__xpu__sfa_head_op extra SRCS __xpu__sfa_head_op.cc DEPS ${op_DEPS})
+
 if (NOT LITE_WITH_X86)
     lite_cc_test(test_one_hot_op SRCS one_hot_op_test.cc DEPS one_hot_op memory scope ${op_deps} one_hot_compute_host)
     lite_cc_test(test_fc_op SRCS fc_op_test.cc
diff --git a/lite/operators/__xpu__conv2d_op.cc b/lite/operators/__xpu__conv2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dff4d5e6dadf9bce15e76f5b353611f402eee19a
--- /dev/null
+++ b/lite/operators/__xpu__conv2d_op.cc
@@ -0,0 +1,182 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/__xpu__conv2d_op.h"
+#include
+#include
+#include "lite/core/op_registry.h"
+#include "lite/operators/conv_op.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+std::string padding_algorithm_ = "";  // NOLINT NOTE(review): file-scope, shared by every XPUConv2dOp; should be per-op state
+
+bool XPUConv2dOp::CheckShape() const {
+  CHECK(param_.Input) << "Input(Input) of ConvXPUOp should not be null.";
+  CHECK(param_.Filter) << "Input(Filter) of ConvXPUOp should not be null.";
+  CHECK(param_.Output) << "Output(Output) of ConvXPUOp should not be null.";
+  // bias is optional.
+
+  const auto in_dims = param_.Input->dims();
+  const auto filter_dims = param_.Filter->dims();
+  int groups = param_.groups;
+
+  CHECK_EQ(in_dims.size(), 4UL) << "Conv input should be 4-D tensor.";
+  CHECK_EQ(in_dims.size(), filter_dims.size())
+      << "Conv input dimension and filter dimension should be the same.";
+  CHECK_EQ(in_dims.size() - param_.strides.size(), 2U)
+      << "Conv input dimension and strides dimension should be consistent.";
+  CHECK_EQ(filter_dims.size(), 4UL) << "Conv filter should be 4-D tensor.";
+  CHECK_EQ(in_dims[1], filter_dims[1] * groups)
+      << "The number of input channels should be equal to filter channels * "
+         "groups.";
+  CHECK_EQ(filter_dims[0] % groups, 0)
+      << "The number of output channels should be divided by groups.";
+
+  return true;
+}
+
+// copy from conv_op.cc
+inline int ConvOutputSize(int input_size,
+                          int filter_size,
+                          int dilation,
+                          int pad_left,
+                          int pad_right,
+                          int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;  // dilated extent
+  int output_size =
+      (input_size + (pad_left + pad_right) - dkernel) / stride + 1;
+
+  return output_size;
+}
+
+// copy from conv_op.cc
+bool XPUConv2dOp::InferShapeImpl() const {
+  const auto in_dims = param_.Input->dims();
+  const auto filter_dims = param_.Filter->dims();
+
+  operators::UpdatePaddingAndDilation(param_.paddings.get(),
+                                      param_.dilations.get(),
+                                      param_.strides,
+                                      padding_algorithm_,
+                                      in_dims,
+                                      filter_dims);
+  std::vector output_shape({in_dims[0], filter_dims[0]});
+  auto paddings = *param_.paddings;
+  auto dilations = *param_.dilations;
+  for (size_t i = 0; i < param_.strides.size(); ++i) {
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2],
+                                          filter_dims[i + 2],
+                                          dilations[i],
+                                          paddings[i * 2],
+                                          paddings[i * 2 + 1],
+                                          param_.strides[i]));
+  }
+
+  // Set output and output max dims
+  param_.Output->Resize(lite::DDim(output_shape));
+  param_.OutputMax->Resize({4});
+  // share LoD
+  param_.Output->set_lod(param_.Input->lod());
+
+  return true;
+}
+
+bool XPUConv2dOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
+  AttachParam(&param_);
+  CHECK(scope->FindVar(op_desc.Input("Input").front()));
+  CHECK(scope->FindVar(op_desc.Input("Filter").front()));
+  CHECK(scope->FindVar(op_desc.Input("FilterMax").front()));
+  CHECK(scope->FindVar(op_desc.Output("Output").front()));
+  CHECK(scope->FindVar(op_desc.Output("OutputMax").front()));
+
+  param_.Input =
+      scope->FindVar(op_desc.Input("Input").front())->GetMutable();
+  param_.Filter =
+      scope->FindVar(op_desc.Input("Filter").front())->GetMutable();
+  param_.FilterMax =
+      scope->FindVar(op_desc.Input("FilterMax").front())->GetMutable();
+  // "Bias" may carry no argument; never call front() on an empty list.
+  const auto bias_args = op_desc.Input("Bias");
+  auto* bias = bias_args.empty() ? nullptr : scope->FindVar(bias_args.front());
+  if (bias != nullptr) param_.Bias = bias->GetMutable();
+  // optional params
+  std::vector input_arg_names = op_desc.InputArgumentNames();
+  if (std::find(input_arg_names.begin(), input_arg_names.end(), "Branch") !=
+      input_arg_names.end()) {
+    auto arguments = op_desc.Input("Branch");
+    if (arguments.size() > 0) {
+      auto arg_var = scope->FindVar(arguments.front());
+      if (arg_var != nullptr) {
+        param_.Branch =
+            const_cast(&(arg_var->Get()));
+      }
+    }
+  }
+
+  param_.Output =
+      scope->FindVar(op_desc.Output("Output").front())->GetMutable();
+  param_.OutputMax =
+      scope->FindVar(op_desc.Output("OutputMax").front())->GetMutable();
+
+  param_.strides = op_desc.GetAttr>("strides");
+  auto paddings = op_desc.GetAttr>("paddings");
+  auto dilations = op_desc.GetAttr>("dilations");
+  param_.dilations = std::make_shared>(dilations);
+  param_.groups = op_desc.GetAttr("groups");
+  if (op_desc.HasAttr("act_type")) {
+    param_.act_type = op_desc.GetAttr("act_type");
+  }
+
+  if (op_desc.HasAttr("filter_type")) {
+    param_.filter_type = op_desc.GetAttr("filter_type");
+  } else {
+    param_.filter_type = "int16";
+  }
+
+  if (op_desc.HasAttr("has_input_max") &&
+      op_desc.GetAttr("has_input_max")) {
+    CHECK(scope->FindVar(op_desc.Input("InputMax").front()));
+    param_.InputMax =
+        scope->FindVar(op_desc.Input("InputMax").front())->GetMutable();
+  }
+
+  if (op_desc.HasAttr("padding_algorithm")) {
+    padding_algorithm_ = op_desc.GetAttr("padding_algorithm");
+  }
+
+  // 2-pad to 4-pad
+  if (paddings.size() == 2L) {
+    for (size_t i = 0; i < param_.strides.size(); ++i) {
+      int copy_pad = *(paddings.begin() + 2 * i);
+      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
+    }
+  } else {
+    if (paddings.size() != 4L) {
+      LOG(FATAL)
+          << "Paddings size should be the same or twice as the input size.";
+    }
+  }
+  param_.paddings = std::make_shared>(paddings);
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(__xpu__conv2d, paddle::lite::operators::XPUConv2dOp);
diff --git a/lite/operators/__xpu__conv2d_op.h b/lite/operators/__xpu__conv2d_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3141a594148767a9dbe4c01f496e78f9d3ca5d2
--- /dev/null
+++ b/lite/operators/__xpu__conv2d_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class XPUConv2dOp : public OpLite {
+ public:
+  XPUConv2dOp() {}
+
+  explicit XPUConv2dOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }  // passes param_ to the kernel
+
+  std::string DebugString() const override { return "XPUConv2d"; }
+
+ private:
+  mutable XPUConv2dParam param_;  // filled in by AttachImpl()
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/__xpu__sfa_head_op.cc b/lite/operators/__xpu__sfa_head_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f04cfa51392e6fd1099f1c9a57de10775e61507c
--- /dev/null
+++ b/lite/operators/__xpu__sfa_head_op.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/__xpu__sfa_head_op.h"
+#include
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool XPUSfaHeadOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.input);
+  CHECK_OR_FALSE(param_.output);
+  CHECK_OR_FALSE(param_.op_type != "");
+
+  const auto input_dims = param_.input->dims();
+  if (param_.op_type == "meanstd" || param_.op_type == "moment") {
+    CHECK_EQ_OR_FALSE(input_dims.size(), 3UL);  // both modes need a 3-D input
+  }
+
+  return true;
+}
+
+bool XPUSfaHeadOp::InferShapeImpl() const {
+  const auto& input_dims = param_.input->dims();
+  auto op_type = param_.op_type;
+
+  // Set output dims
+  std::vector output_dims(2);  // {input_dims[0], k * input_dims[1]}
+  output_dims[0] = input_dims[0];
+  if (op_type == "meanstd") {
+    output_dims[1] = 2 * input_dims[1];  // k = 2
+  } else if (op_type == "moment") {
+    output_dims[1] = 4 * input_dims[1];  // k = 4
+  } else {
+    LOG(FATAL) << "not supported vis op --> " << op_type;
+  }
+  param_.output->Resize(output_dims);
+
+  // share LoD
+  param_.output->set_lod(param_.input->lod());
+  return true;
+}
+
+bool XPUSfaHeadOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
+  auto input = op_desc.Input("Input").front();
+  auto output = op_desc.Output("Output").front();
+  CHECK(scope->FindVar(input));
+  CHECK(scope->FindVar(output));
+
+  param_.input = scope->FindVar(input)->GetMutable();
+  param_.output = scope->FindVar(output)->GetMutable();
+  param_.op_type = op_desc.GetAttr("op_type");  // read without a HasAttr guard
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(__xpu__sfa_head, paddle::lite::operators::XPUSfaHeadOp);
diff --git a/lite/operators/__xpu__sfa_head_op.h b/lite/operators/__xpu__sfa_head_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ecfaf3cbaa1fefd2c3f8f3060e9c945ad185692
--- /dev/null
+++ b/lite/operators/__xpu__sfa_head_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class XPUSfaHeadOp : public OpLite {
+ public:
+  XPUSfaHeadOp() {}
+
+  explicit XPUSfaHeadOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "XPUSfaHead"; }  // was "XPUFc" (copy-paste)
+
+ private:
+  mutable XPUSfaHeadParam param_;  // filled in by AttachImpl()
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
index 3e68bc1631bd41477fdbcbcfbbc6279287e21af1..586d3d1183b4049f8b49ef22f92e84412ed5522f 100644
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -232,6 +232,20 @@ struct PowerParam : ParamBase {
   float power{};
 };
 
+// For Pow Op
+struct PowParam : ParamBase {
+  const lite::Tensor* X{};
+  lite::Tensor* Out{};
+
+  float factor{1.};  // overwritten from the "factor" attribute in PowOp
+};
+
+// For Sign Op
+struct SignParam : ParamBase {
+  const lite::Tensor* X{};
+  lite::Tensor* Out{};
+};
+
 struct ShuffleChannelParam : ParamBase {
   const lite::Tensor* X{};
   lite::Tensor* Out{};
@@ -1810,6 +1824,31 @@ struct XPUMmdnnMergeAllParam : ParamBase {
   lite::Tensor* out{};
 };
 
+struct XPUConv2dParam : ParamBase {
+  lite::Tensor* Input{nullptr};
+  lite::Tensor* Filter{nullptr};
+  lite::Tensor* InputMax{nullptr};  // set only when has_input_max is true
+  lite::Tensor* FilterMax{nullptr};
+  lite::Tensor* Bias{nullptr};    // optional
+  lite::Tensor* Branch{nullptr};  // optional
+  lite::Tensor* Output{nullptr};
+  lite::Tensor* OutputMax{nullptr};
+
+  int groups{1};
+  int act_type{-1};  // stays -1 when the op has no "act_type" attribute
+  std::string filter_type{""};
+  std::vector strides;
+  std::shared_ptr> paddings;
+  std::shared_ptr> dilations;
+};
+
+struct XPUSfaHeadParam : ParamBase {
+  lite::Tensor* input{nullptr};
+  lite::Tensor* output{nullptr};
+
+  std::string op_type{""};  // "meanstd" or "moment"
+};
+
 // For DeformableConvolution op
 struct DeformableConvParam : ParamBase {
   lite::Tensor* x{};
diff --git a/lite/operators/pow_op.cc b/lite/operators/pow_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9c16bd67365760e312931b4c6371ead4c459f05
--- /dev/null
+++ b/lite/operators/pow_op.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/pow_op.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool PowOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
+  return true;
+}
+
+bool PowOp::InferShapeImpl() const {
+  param_.Out->Resize(param_.X->dims());  // Out takes X's dims
+  return true;
+}
+
+bool PowOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+  auto X = op_desc.Input("X").front();
+  auto Out = op_desc.Output("Out").front();
+  CHECK(scope->FindVar(X));
+  CHECK(scope->FindVar(Out));
+  param_.X = scope->FindVar(X)->GetMutable();
+  param_.Out = scope->FindVar(Out)->GetMutable();
+  param_.factor = op_desc.GetAttr("factor");  // read without a HasAttr guard
+  CHECK(param_.X);
+  CHECK(param_.Out);
+
+  return true;
+}
+
+} /* namespace operators */
+} /* namespace lite */
+} /* namespace paddle */
+
+REGISTER_LITE_OP(pow, paddle::lite::operators::PowOp);
diff --git a/lite/operators/pow_op.h b/lite/operators/pow_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e8f97fd2b3af99177ad6f2a49b7c382b16443bc
--- /dev/null
+++ b/lite/operators/pow_op.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class PowOp : public OpLite {
+ public:
+  PowOp() {}
+
+  explicit PowOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "pow"; }
+
+#ifdef LITE_WITH_PROFILE
+  void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
+    ch->input_shape = ch->DimToStr(param_.X->dims());
+    ch->output_shape = ch->DimToStr(param_.Out->dims());
+    ch->macs = param_.Out->numel();  // counted as one op per output element
+  }
+#endif
+
+ private:
+  mutable PowParam param_;  // filled in by AttachImpl()
+};
+
+} /* namespace operators */
+} /* namespace lite */
+} /* namespace paddle */
diff --git a/lite/operators/sign_op.cc b/lite/operators/sign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c31f58315b44f9e60e3649419e09f142d362c0e
--- /dev/null
+++ b/lite/operators/sign_op.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/sign_op.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool SignOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
+  return true;
+}
+
+bool SignOp::InferShapeImpl() const {
+  param_.Out->Resize(param_.X->dims());  // Out takes X's dims
+  return true;
+}
+
+bool SignOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+  auto X = op_desc.Input("X").front();
+  auto Out = op_desc.Output("Out").front();
+  CHECK(scope->FindVar(X));
+  CHECK(scope->FindVar(Out));
+  param_.X = scope->FindVar(X)->GetMutable();
+  param_.Out = scope->FindVar(Out)->GetMutable();
+  CHECK(param_.X);
+  CHECK(param_.Out);
+
+  return true;  // sign reads no attributes
+}
+
+} /* namespace operators */
+} /* namespace lite */
+} /* namespace paddle */
+
+REGISTER_LITE_OP(sign, paddle::lite::operators::SignOp);
diff --git a/lite/operators/sign_op.h b/lite/operators/sign_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f54038adc4792761a6edd090a5d9bc1506149be8
--- /dev/null
+++ b/lite/operators/sign_op.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class SignOp : public OpLite {
+ public:
+  SignOp() {}
+
+  explicit SignOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "sign"; }
+
+#ifdef LITE_WITH_PROFILE
+  void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
+    ch->input_shape = ch->DimToStr(param_.X->dims());
+    ch->output_shape = ch->DimToStr(param_.Out->dims());
+    ch->macs = param_.Out->numel();  // counted as one op per output element
+  }
+#endif
+
+ private:
+  mutable SignParam param_;  // filled in by AttachImpl()
+};
+
+} /* namespace operators */
+} /* namespace lite */
+} /* namespace paddle */
diff --git a/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc b/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc
index 2318d53a33866fd8ba61d14c4d6bc6aed283dbdc..e7196fc04c08108e060f03619f303f349886b001 100644
--- a/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc
+++ b/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc
@@ -135,8 +135,8 @@ TEST(fill_constant_batch_size_like, precision) {
 #if defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
   abs_error = 1e-2;  // use fp16 in npu
-#elif defined(LITE_WITH_ARM)
-  place = TARGET(kARM);
+#elif defined(LITE_WITH_ARM) || defined(LITE_WITH_X86)
+  place = TARGET(kHost);  // ARM and X86 builds both test the host kernel
 #else
   return;
 #endif
diff --git a/lite/tests/kernels/fill_constant_compute_test.cc b/lite/tests/kernels/fill_constant_compute_test.cc
index bc2cfce7842c935898bd9ecddc6c2d0ac4c39af5..59a4c301ce4d7ea215b73709aaf07908b91c297f 100644
--- a/lite/tests/kernels/fill_constant_compute_test.cc
+++ b/lite/tests/kernels/fill_constant_compute_test.cc
@@ -174,8 +174,8 @@ TEST(fill_constant, precision) {
 #if defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
   abs_error = 1e-2;  // use fp16 in npu
-#elif defined(LITE_WITH_ARM)
-  place = TARGET(kARM);
+#elif defined(LITE_WITH_ARM) || defined(LITE_WITH_X86)
+  place = TARGET(kHost);  // ARM and X86 builds both test the host kernel
 #else
   return;
 #endif