diff --git a/src/common/types.cpp b/src/common/types.cpp index e06e9965c4108988ed9e6675f7a012631e81049f..372331ad32244ca43ebad929b2918002f7fe42bd 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -23,6 +23,8 @@ const char *G_OP_TYPE_BOX_CODER = "box_coder"; const char *G_OP_TYPE_CONCAT = "concat"; const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; +const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu"; +const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu"; const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu"; const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu"; const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu"; @@ -83,6 +85,8 @@ std::unordered_map< {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}, {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_BN, {{"Input"}, {"Y"}}}, diff --git a/src/common/types.h b/src/common/types.h index bab169977135ce4f572bf4242837ed39588cc97b..dcbea1132866d7c0dadfc8a5c308bf837f3abbcf 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -85,6 +85,8 @@ extern const char *G_OP_TYPE_BOX_CODER; extern const char *G_OP_TYPE_CONCAT; extern const char *G_OP_TYPE_ELEMENTWISE_ADD; extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; extern const char *G_OP_TYPE_FC; extern const char *G_OP_TYPE_FUSION_CONV_ADD; extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; diff --git a/src/framework/operator.h b/src/framework/operator.h index 084ac3c81185fe489fe1ca67589c1e8edb1d4fdf..5252ee65a2a80910500f4085bb92b80829f9e45b 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -183,6 +183,8 @@ class FusionOpMatcher { std::string BeginType() { return node_.Type(); } + virtual std::vector> NeedCheck() { return {}; } + // virtual bool Fusion(); protected: Node node_; diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp index a4e1db506da362df4fb61b39827d5e77ebc425eb..c8b15e67663c2e74901b499c0b244a03113891ff 100644 --- a/src/framework/program/program-optimize/node.cpp +++ b/src/framework/program/program-optimize/node.cpp @@ -14,15 +14,33 @@ limitations under the License. */ #include "framework/program/program-optimize/node.h" #include +#include +#include #include "framework/operator.h" namespace paddle_mobile { namespace framework { +std::vector Node::operator[](int index) { + std::vector nodes; + GetNodesWithLocation(index, 0, &nodes); + return nodes; +} + +void Node::GetNodesWithLocation(int index, int now_index, + std::vector *nodes) { + if (index == now_index) { + nodes->push_back(this); + } + + for (int i = 0; i < this->outputs_.size(); ++i) { + this->outputs_[i]->GetNodesWithLocation(index, now_index + 1, nodes); + } +} + Node &Node::operator>(std::shared_ptr node) { outputs_.push_back(node); - std::shared_ptr this_node; node->inputs_.push_back(this); return *node; } @@ -31,19 +49,70 @@ bool Node::operator==(const Node &in) { if (in.type_ == this->type_) { if (this->outputs_.size() == in.outputs_.size()) { for (int i = 0; i < outputs_.size(); ++i) { - if (!(*outputs_[i] == *in.outputs_[i])) { + if (!(this->outputs_[i]->MedianEqual(*in.outputs_[i]))) { + return false; + } + } + } else { + return false; + } + } else { + return false; + } + return true; +} + +bool Node::MedianEqual(const Node &in) { + if (in.type_ == this->type_) { + if (this->outputs_.size() == in.outputs_.size()) { + // if (this->inputs_.size() != in.inputs_.size()) { + // DLOG << " == - this input size: " << this->inputs_.size(); + // DLOG << " == - ptr of this " << this; + // DLOG << " == - in input size: " << in.inputs_.size(); + // DLOG << " == - input size not equal "; + // return false; + // } else { + // for (int i = 0; i < this->inputs_.size(); ++i) { + // if (this->inputs_[i]->type_ != in.inputs_[i]->type_) { + // DLOG << " == - input type not equal "; + // return false; + // } + // } + // } + + for (int i = 0; i < outputs_.size(); ++i) { + if (!((*outputs_[i]).MedianEqual(*in.outputs_[i]))) { return false; } } } else { + // DLOG << " == - output size not equal "; return false; } } else { + // DLOG << " == - median type is not equal "; return false; } return true; } +std::map Node::Relationship() { + std::map map; + RelationshipPrivate(&map); + return map; +} + +void Node::RelationshipPrivate(std::map *map) { + for (auto output : op_desc_->outputs_) { + for (auto output_key : output.second) { + (*map)[output_key] = this; + } + } + for (auto output : this->outputs_) { + output->RelationshipPrivate(map); + } +} + std::shared_ptr Node::To(int size) { std::shared_ptr node = std::make_shared(); this->To(size - 1, node); @@ -51,7 +120,9 @@ std::shared_ptr Node::To(int size) { } void Node::To(int index, std::shared_ptr node) { + node->op_desc_ = this->op_desc_; node->type_ = this->type_; + node->inputs_ = this->inputs_; if (index != 0) { } else { return; @@ -100,8 +171,19 @@ void Node::Folder( if (change->find(this->type_) != change->end()) { auto change_pairs = (*change)[this->type_]; for (const auto &change_pair : change_pairs) { - op_desc->GetInputs()[change_pair.second] = - this->op_desc_->GetInputs()[change_pair.first]; + std::map f; + if (this->op_desc_->GetInputs().find(change_pair.first) != + this->op_desc_->GetInputs().end()) { + if (op_desc->GetInputs().find(change_pair.second) != + op_desc->GetInputs().end()) { + for (auto value : this->op_desc_->GetInputs()[change_pair.first]) { + op_desc->GetInputs()[change_pair.second].push_back(value); + } + } else { + op_desc->GetInputs()[change_pair.second] = + this->op_desc_->GetInputs()[change_pair.first]; + } + } } } @@ -110,7 +192,28 @@ void Node::Folder( } if (index > 0) { --index; + for (auto output : outputs_) { + if (change->find(this->type_) != change->end()) { + auto change_pairs = (*change)[this->type_]; + for (const auto &change_pair : change_pairs) { + std::map f; + if (this->op_desc_->GetOutputs().find(change_pair.first) != + this->op_desc_->GetOutputs().end()) { + if (op_desc->GetInputs().find(change_pair.second) != + op_desc->GetInputs().end()) { + for (auto value : + this->op_desc_->GetOutputs()[change_pair.first]) { + op_desc->GetInputs()[change_pair.second].push_back(value); + } + } else { + op_desc->GetInputs()[change_pair.second] = + this->op_desc_->GetOutputs()[change_pair.first]; + } + } + } + } + removed_nodes->push_back(output); output->Folder(op_desc, outputs, index, change, begin_node, removed_nodes); diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h index 7eb179c243c28fe2668c3cf2f8f28f81312c0988..b86fc96a67e290540c94487497fced55abf09041 100644 --- a/src/framework/program/program-optimize/node.h +++ b/src/framework/program/program-optimize/node.h @@ -35,6 +35,7 @@ class Node { : op_desc_(op_desc), type_(op_desc->Type()) {} Node &operator>(std::shared_ptr node); bool operator==(const Node &in); + bool MedianEqual(const Node &in); #ifdef PADDLE_MOBILE_DEBUG std::string ToString() const; @@ -50,7 +51,14 @@ class Node { std::shared_ptr OpDescOfNode() { return op_desc_; } std::string Type() { return type_; } + std::vector operator[](int index); + + std::map Relationship(); + private: + void RelationshipPrivate(std::map *map); + void GetNodesWithLocation(int index, int now_index, + std::vector *nodes); void To(int index, std::shared_ptr); void Folder( std::shared_ptr op_desc, diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index 82d33bc65d864e010fbe41b270b71ed98a21b33e..ed523a985138e5cb7cbd0b30fad77ba08c29223f 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ b/src/framework/program/program-optimize/program_optimize.cpp @@ -29,9 +29,15 @@ std::shared_ptr ProgramOptimize::FusionOptimize( for (int i = 0; i < optimize_program->Blocks().size(); ++i) { std::unordered_map> output_nodes; - std::unordered_map>> + std::unordered_map< + std::string, + std::vector< + std::pair, + std::unordered_map>>>> type_map; + std::unordered_map output_has; + std::vector> nodes; std::shared_ptr begin_node; @@ -50,7 +56,7 @@ std::shared_ptr ProgramOptimize::FusionOptimize( nodes.push_back(node); // - type_map[op->Type()].push_back(node); + type_map[op->Type()].push_back({node, output_nodes}); if (j == 0) { begin_node = node; @@ -69,6 +75,7 @@ std::shared_ptr ProgramOptimize::FusionOptimize( } auto output_keys = op_input_output_key.at(op_type).second; + for (auto output_key : output_keys) { auto op_outputs = op->Output(output_key); for (int k = 0; k < op_outputs.size(); ++k) { @@ -83,11 +90,42 @@ std::shared_ptr ProgramOptimize::FusionOptimize( auto match_vector = type_map[matcher->BeginType()]; - for (auto &match_node : match_vector) { + for (auto &match_node_pair : match_vector) { + auto match_node = match_node_pair.first; + + auto node_has = match_node_pair.second; + auto depth = matcher->BeginNode().Depth(); auto sub_node = match_node->To(depth); // DLOG << " sub node: " << *sub_node; if (*sub_node == matcher->BeginNode()) { + bool can_folder = true; + + auto relationship_map = sub_node->Relationship(); + + for (auto to_check : matcher->NeedCheck()) { + // if (node_has) + auto nodes = (*sub_node)[to_check.first]; + for (auto node : nodes) { + auto inputs_to_check = + node->OpDescOfNode()->Input(to_check.second); + + for (auto input_to_check : inputs_to_check) { + if (node_has.find(input_to_check) == node_has.end()) { + if (relationship_map.find(input_to_check) == + relationship_map.end()) { + can_folder = false; + } else { + } + } + } + } + } + + if (!can_folder) { + continue; + } + // DLOG << " match success " << " fusion node: \n" << // matcher->BeginNode() << "\nsub node: \n" << *sub_node; // DLOG << "match node\n"<< *match_node; @@ -95,11 +133,13 @@ std::shared_ptr ProgramOptimize::FusionOptimize( std::vector> removed_nodes; matcher->FolderNodes(match_node.get(), &removed_nodes); - for (int j = 0; j < removed_nodes.size(); ++j) { - auto removed_node = removed_nodes[j]; + for (int k = removed_nodes.size() - 1; k >= 0; --k) { + auto removed_node = removed_nodes[k]; auto removed_ite = std::find(nodes.begin(), nodes.end(), removed_node); - nodes.erase(removed_ite); + if (removed_ite != nodes.end()) { + nodes.erase(removed_ite); + } } } } diff --git a/src/operators/fusion_conv_add_add_prelu.cpp b/src/operators/fusion_conv_add_add_prelu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5104c989415eee46e66bdbf419fc6ecf7a2baa34 --- /dev/null +++ b/src/operators/fusion_conv_add_add_prelu.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDADDPRELU_OP + +#include "fusion_conv_add_add_prelu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionConvAddAddPReluOp::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_add_prelu_op.h b/src/operators/fusion_conv_add_add_prelu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d91b4d28d728efb4ecf817294f37e67ac19cfe72 --- /dev/null +++ b/src/operators/fusion_conv_add_add_prelu_op.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDADDPRELU_OP + +#pragma once + +#include +#include +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/conv_add_add_prelu_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher { + public: + FusionConvAddAddPReluOpMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_PRELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, + {{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}}, + {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}}, + + removed_nodes); + } + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; } + + std::vector> NeedCheck() { + DLOG << " conv add add prelu check add X "; + return {{2, "Y"}, {2, "X"}}; + } +}; + +template +class FusionConvAddAddPReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionConvAddAddPReluParam, + operators::ConvAddAddPReluKernel> { + public: + FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvAddAddPReluParam, + operators::ConvAddAddPReluKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvAddAddPReluParam, + operators::ConvAddAddPReluKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef CONV_ADD_ADD_PRELU_REGISTER +#define CONV_ADD_ADD_PRELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar( + new FusionConvAddAddPReluOpMatcher()); +#endif + +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA + +#ifndef CONV_ADD_ADD_PRELU_REGISTER +#define CONV_ADD_ADD_PRELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar( + new FusionConvAddAddPReluOpMatcher()); +#endif + +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_add_prelu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_add_prelu); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_prelu_op.cpp b/src/operators/fusion_conv_add_prelu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0cd30ae6888cd2372b0968717de14f9ca3c72e18 --- /dev/null +++ b/src/operators/fusion_conv_add_prelu_op.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDPRELU_OP + +#include "fusion_conv_add_prelu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionConvAddPReluOp::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_prelu_op.h b/src/operators/fusion_conv_add_prelu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4c968be68230fe6252e72655f47b2a347f720526 --- /dev/null +++ b/src/operators/fusion_conv_add_prelu_op.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDPRELU_OP + +#pragma once + +#include +#include +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/conv_add_prelu_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher { + public: + FusionConvAddPReluOpMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_PRELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, + {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}} + + }, + + removed_nodes); + } + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; } +}; + +template +class FusionConvAddPReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionConvAddPReluParam, + operators::ConvAddPReluKernel> { + public: + FusionConvAddPReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvAddPReluParam, + operators::ConvAddPReluKernel>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvAddPReluParam, + operators::ConvAddPReluKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef CONV_ADD_PRELU_REGISTER +#define CONV_ADD_PRELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar( + new FusionConvAddPReluOpMatcher()); +#endif + +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA + +#ifndef CONV_ADD_PRELU_REGISTER +#define CONV_ADD_PRELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar( + new FusionConvAddPReluOpMatcher()); +#endif + +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_prelu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_prelu); +#endif + +#endif diff --git a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..74b88f5d4f5e24b1401803c8c48d99319f412d1b --- /dev/null +++ b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDADDPRELU_OP + +#include "operators/kernel/conv_add_add_prelu_kernel.h" +#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddAddPReluKernel::Init( + FusionConvAddAddPReluParam *param) { + return true; +} + +template <> +void ConvAddAddPReluKernel::Compute( + const FusionConvAddAddPReluParam ¶m) const { + ConvAddAddPReluCompute(param); +} +template class ConvAddAddPReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5930cfdcfc0f983c9f07754113dc37d5122d19f0 --- /dev/null +++ b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDPRELU_OP + +#include "operators/kernel/conv_add_prelu_kernel.h" +#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddPReluKernel::Init(FusionConvAddPReluParam *param) { + return true; +} + +template <> +void ConvAddPReluKernel::Compute( + const FusionConvAddPReluParam ¶m) const { + ConvAddPReluCompute(param); +} +template class ConvAddPReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/prelu_kernel.cpp b/src/operators/kernel/arm/prelu_kernel.cpp index eda944a2ff44a0551ef0342c64e6e955c96b356d..e1ec927fb13d1f4a2e600d46f65f2806448059d9 100644 --- a/src/operators/kernel/arm/prelu_kernel.cpp +++ b/src/operators/kernel/arm/prelu_kernel.cpp @@ -16,6 +16,9 @@ limitations under the License. */ #include "operators/kernel/prelu_kernel.h" #include +#if __ARM_NEON +#include +#endif namespace paddle_mobile { namespace operators { @@ -37,32 +40,81 @@ void PReluKernel::Compute(const PReluParam ¶m) const { auto *alpha = param.InputAlpha(); auto *out = param.Out(); std::string mode = param.Mode(); - const auto *x_ptr = x->data(); + auto *x_ptr = x->data(); auto *o_ptr = out->mutable_data(); - const auto *alpha_ptr = alpha->data(); + auto *alpha_ptr = alpha->data(); int numel = x->numel(); auto dim = x->dims(); + int k = dim[0] * dim[1]; + int n = dim[2] * dim[3]; int index = 0; int i = 0; int temp = 0; +#if __ARM_NEON + #pragma omp parallel for + for (int i = 0; i < k; i++) { + float32x4_t zero = vdupq_n_f32(0.0); + float32x4_t cv; + float32x4_t cv1; + float32x4_t cv2; + float32x4_t pv; + for (int j = 0; (j + 3) < n; j += 4) { + const float *in = x_ptr + i * n + j; + float *out = o_ptr + i * n + j; + cv = vld1q_f32(in); + cv1 = vmaxq_f32(cv, zero); + cv2 = vminq_f32(cv, zero); + if (mode == "channel") { + cv2 = vmulq_n_f32(cv2, alpha_ptr[i]); + } else if (mode == "element") { + pv = vld1q_f32(alpha_ptr + i * n + j); + cv2 = vmulq_f32(cv2, pv); + } else { + cv2 = vmulq_n_f32(cv2, alpha_ptr[0]); + } + cv = vaddq_f32(cv1, cv2); + vst1q_f32(out, cv); + } + int j; + for (j = 0; (j + 3) < n; j += 4) { + } + for (int m = j; m < n; m++) { + if (mode == "channel") { + o_ptr[i * n + m] = x_ptr[i * n + m] > 0 + ? x_ptr[i * n + m] + : alpha_ptr[i] * x_ptr[i * n + m]; + } else if (mode == "element") { + o_ptr[i * n + m] = x_ptr[i * n + m] > 0 + ? x_ptr[i * n + m] + : alpha_ptr[i * n + m] * x_ptr[i * n + m]; + } else { + o_ptr[i * n + m] = x_ptr[i * n + m] > 0 + ? x_ptr[i * n + m] + : alpha_ptr[0] * x_ptr[i * n + m]; + } + } + } + +#else if (mode == "channel") { temp = numel / (dim[0] * dim[1]); - #pragma omp parallel for +#pragma omp parallel for for (i = 0; i < numel; i++) { index = (i / temp) % dim[1]; o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; } } else if (mode == "element") { - #pragma omp parallel for +#pragma omp parallel for for (i = 0; i < numel; i++) { o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[i] * x_ptr[i]; } } else { - #pragma omp parallel for +#pragma omp parallel for for (i = 0; i < numel; i++) { o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; } } +#endif } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..a19c67e68366fc57a305e0dbb955229a763737d9 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDADDPRELU_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + Tensor bias1 = *param.Bias1(); + int axis = param.Axis(); + Tensor *output = param.Output(); + + float *biase_data = bias.data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + Tensor aa = *param.InputAlpha(); + float *p = aa.data(); + + std::string mode = param.Mode(); + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor bias1_batch = bias1.Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step); + float *biase_data1 = bias1_slice.data(); + // int n = bias1_slice.dims()[0]; + // int m = bias1_slice.dims()[1]; + // for(int i=0;i(filter_slice, false, col_matrix, + // false, + // static_cast(1), + // &out_slice, + // static_cast(1), true, + // biase_data); + math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice, + p, mode, biase_data, biase_data1); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..df63379d967606e15106937534bb82496ee83b4e --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDPRELU_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvAddPReluCompute(const FusionConvAddPReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + // DLOG<<"yangfei"; + // DLOG<(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + Tensor aa = *param.InputAlpha(); + float *p = aa.data(); + std::string mode = param.Mode(); + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + // math::matmul(filter_slice, false, col_matrix, + // false, + // static_cast(1), + // &out_slice, + // static_cast(1), true, + // biase_data); + math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice, + p, mode, biase_data, nullptr); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_add_add_prelu_kernel.h b/src/operators/kernel/conv_add_add_prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5715cd46d5a6c7e80ab5ff77ba83c7973e1db811 --- /dev/null +++ b/src/operators/kernel/conv_add_add_prelu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef FUSION_CONVADDADDPRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvAddAddPReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionConvAddAddPReluParam ¶m) const; + bool Init(FusionConvAddAddPReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_add_prelu_kernel.h b/src/operators/kernel/conv_add_prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..a109f84cf09b4d0e2469a1885b902c0f70acc6c8 --- /dev/null +++ b/src/operators/kernel/conv_add_prelu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef FUSION_CONVADDPRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvAddPReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionConvAddPReluParam ¶m) const; + bool Init(FusionConvAddPReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index de0a4f6294420402199654b51e19fd1b7d0eadae..c9084a9d807ab43ad60b1661b5b1a7ecc67c8479 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "operators/math/gemm.h" +#include #include "common/log.h" #include "memory/t_malloc.h" #if __ARM_NEON @@ -707,6 +708,25 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, } } +void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, + float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { +#pragma omp parallel for + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif + } + } + WriteWithAddPRelu(mc, nc, c, C, ldc, p, mode, bias, bias1); +} + #if __ARM_NEON #if __aarch64__ @@ -988,6 +1008,82 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, } } +// C = A * B + C,prelu(C) +void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t biasv; + float32x4_t biasv1; + float32x4_t zero = vdupq_n_f32(0.0); + float32x4_t pv; + float *ptr = p; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + if (bias1 == nullptr) { + biasv1 = zero; + } else { + biasv1 = vld1q_dup_f32(bias1 + i); + } + + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vaddq_f32(cv, biasv1); + cv = vmaxq_f32(cv, zero); + cv1 = vminq_f32(cv, zero); + if (mode == "channel") { + cv1 = vmulq_n_f32(cv1, ptr[i]); + } else if (mode == "element") { + pv = vld1q_f32(ptr); + cv1 = vmulq_f32(cv1, pv); + ptr = ptr + 4; + } else { + cv1 = vmulq_n_f32(cv1, ptr[0]); + } + cv = vaddq_f32(cv, cv1); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vaddq_f32(cv, biasv1); + cv = vmaxq_f32(cv, zero); + cv1 = vminq_f32(cv, zero); + if (mode == "channel") { + cv1 = vmulq_n_f32(cv1, ptr[i]); + } else if (mode == "element") { + pv = vld1q_f32(ptr); + cv1 = vmulq_f32(cv1, pv); + ptr = ptr + 4; + } else { + cv1 = vmulq_n_f32(cv1, ptr[0]); + } + cv = vaddq_f32(cv, cv1); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + // C = A * B, batchnorm(C) void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, float *new_bias) { @@ -1971,6 +2067,145 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, } } +void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { + if (nc < 4) { + if (bias1 == nullptr) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + float r = *c + *bias; + if (r < 0) { + r = *p; + } + c++; + } + bias++; + p++; + } + } else { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + float r = *c + *bias; + r += *bias1; + if (r < 0) { + r *= *p; + } + c++; + bias1++; + } + bias++; + p++; + } + } + return; + } + + int nc1 = nc / 8; + int step = 4 * (ldc - nc); + int step1 = 4 * (NC - nc); + + if (bias1 == nullptr) { + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "vld1.32 {d0}, [%[bias]] \n\t" + "vld1.32 {d1}, [%[p]] \n\t" + "vdup.32 q1, d0[0] \n\t" + "vdup.32 q2, d1[0] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "pld [%[c], #32] \n\t" + "vld1.32 {q3, q4}, [%[c]]! \n\t" + "vadd.f32 q3, q3, q1 \n\t" + "vadd.f32 q4, q4, q1 \n\t" + "vmax.f32 q5, q3, q14 \n\t" + "vmin.f32 q7, q3, q14 \n\t" + "vmax.f32 q6, q4, q14 \n\t" + "vmin.f32 q8, q4, q14 \n\t" + "vmla.f32 q5, q7, q2 \n\t" + "vmla.f32 q6, q8, q2 \n\t" + "vst1.32 {q5, q6}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[p], %[p], #4 \n\t" + "add %[bias], %[bias], #4 \n\t" + "add %[c], %[c], %[step1] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1), [p] "r"(p), [bias] "r"(bias), + [bias1] "r"(bias1) + : "memory", "r5", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8"); + } else { + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "vld1.32 {d0}, [%[bias]] \n\t" + "vld1.32 {d1}, [%[p]] \n\t" + "vdup.32 q1, d0[0] \n\t" + "vdup.32 q2, d1[0] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "pld [%[c], #32] \n\t" + "pld [%[bias1], #32] \n\t" + "vld1.32 {q3, q4}, [%[c]]! \n\t" + "vld1.32 {q9, q10}, [%[bias1]]! \n\t" + "vadd.f32 q3, q3, q1 \n\t" + "vadd.f32 q4, q4, q1 \n\t" + "vadd.f32 q3, q3, q9 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vmax.f32 q5, q3, q14 \n\t" + "vmin.f32 q7, q3, q14 \n\t" + "vmax.f32 q6, q4, q14 \n\t" + "vmin.f32 q8, q4, q14 \n\t" + "vmla.f32 q5, q7, q2 \n\t" + "vmla.f32 q6, q8, q2 \n\t" + "vst1.32 {q5, q6}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[p], %[p], #4 \n\t" + "add %[bias], %[bias], #4 \n\t" + "add %[c], %[c], %[step1] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1), [p] "r"(p), [bias] "r"(bias), + [bias1] "r"(bias1) + : "memory", "r5", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10"); + } +} + // C = A * B, batchnorm(C) void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, float *bias) { @@ -2513,6 +2748,9 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {} void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {} +void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) {} + void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, float *new_bias) {} @@ -2642,6 +2880,74 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, paddle_mobile::memory::Free(zero); } +void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, + const float *B, int ldb, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 32 * 1024; + int L2 = 0.5 * 1024 * 1024; + + KC = k; + MC = L1 / (KC * sizeof(float)); + NC = L2 / (KC * sizeof(float)); + + // make sure MC is multiple of MR, and NC is multiple of NR + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; + + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + + for (int l = 0; l < KC; ++l) { + zero[l] = 0; + } + + int mc, nc; + for (int j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); +#if __aarch64__ + // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB); + PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB); +#else + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB); +#endif + for (int i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); +#if __aarch64__ + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); + // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA); +#else + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); +#endif + if (bias1 == nullptr) { + InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc, + p + i, mode, bias + i, nullptr); + } else { + InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc, + p + i, mode, bias + i, bias1 + i * ldc + j); + } + } + } + + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); +} + // 32位 float 矩阵乘法 void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, @@ -2857,6 +3163,123 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, paddle_mobile::memory::Free(zero); } +void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, + const float *B, int ldb, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { +#ifdef _OPENMP + int max_threads = omp_get_max_threads(); +#else + int max_threads = 1; +#endif + + int L1 = 32 * 1024; + KC = k; + if (m > n) { + // 对 A 分块 + MC = L1 / (KC * sizeof(float)); + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + // 补齐 B + NC = (n + NR - 1) / NR * NR; + +#if __aarch64__ + procPackA = PackMatrixA_6r; + procPackB = PackMatrixB_omp_16c; + procAddDot = AddDot6x16; +#else + procPackA = PackMatrixA_6r; + procPackB = PackMatrixB_omp_8c; + procAddDot = AddDot6x8; +#endif + + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + procPackB(KC, NC, NC % NR, B, ldb, packedB); + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); + } else { + // 对 B 分块 + NC = L1 / (KC * sizeof(float)); + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + // 补齐 A + MC = (m + MR - 1) / MR * MR; + +#if __aarch64__ + procPackA = PackMatrixA_omp_6r; + procPackB = PackMatrixB_16c; + procAddDot = AddDot6x16; +#else + procPackA = PackMatrixA_omp_6r; + procPackB = PackMatrixB_8c; + procAddDot = AddDot6x8; +#endif + + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + procPackA(MC, KC, MC % MR, A, lda, packedA); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); + } + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + memset(static_cast(zero), 0, sizeof(float) * KC); + packedC = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); + + if (m > n) { +#pragma omp parallel for + for (int i = 0; i < m; i += MC) { +#ifdef _OPENMP + int local_threads = omp_get_thread_num(); +#else + int local_threads = 0; +#endif + + int mc; + mc = s_min(m - i, MC); + float *local_A = packedA + MC * KC * local_threads; + float *local_C = packedC + MC * NC * local_threads; + procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); + if (bias1 == nullptr) { + InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc, + p + i, mode, bias + i, nullptr); + } else { + InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc, + p + i, mode, bias + i, bias1 + i * ldc); + } + } + } else { +#pragma omp parallel for + for (int j = 0; j < n; j += NC) { +#ifdef _OPENMP + int local_threads = omp_get_thread_num(); +#else + int local_threads = 0; +#endif + + int nc; + nc = s_min(n - j, NC); + float *local_B = packedB + KC * NC * local_threads; + float *local_C = packedC + MC * NC * local_threads; + procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); + if (bias1 == nullptr) { + InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p, + mode, bias, nullptr); + } else { + InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p, + mode, bias, bias1 + j); + } + } + } + + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); +} + void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { #if __ARM_NEON #if __aarch64__ diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 40199faa4c30ec965a3980f44f1dbb6ae7d6799b..6139f1b45f3c6e76d859625ca000ea6d46d3c328 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include "common/log.h" // 矩阵取值运算宏,假设矩阵按行存储 #define A(i, j) A[(i)*lda + (j)] @@ -79,6 +81,9 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C, int ldc, bool relu, float *new_scale, float *new_bias); +void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, + float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1); /* // 向量矩阵乘法 (M = 1) void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, @@ -108,6 +113,9 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias); // C = A * B + C, relu(C) void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc); +// C = A * B + C,prelu(C) +void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1); // C = A * B + bias ,relu(C) void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, float *bias); @@ -146,6 +154,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu, float *new_scale, float *new_bias); +void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, + const float *B, int ldb, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1); + // 32位 float 矩阵乘法(openmp 多线程版本) void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, @@ -156,6 +168,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu, float *new_scale, float *new_bias); +void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, + const float *B, int ldb, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1); + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index 381624250af87f4eeff7cf316a2f0f346c399137..c5192441b2e89f4a5346f5d580fe87890becc432 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -87,6 +87,37 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, new_bias->data() + group); #endif } +void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, + framework::Tensor *matrix_out, float *p, std::string mode, + float *bias, float *bias1) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + // PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && + // dim_out.size() == + // 2, + // "The input and output of matmul be matrix"); + // + // PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && + // platform::is_cpu_place(matrix_b.place()) + // && + // platform::is_cpu_place(matrix_out->place()), + // "Matrix must all be in CPUPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (!trans_a) ? dim_a[1] : dim_a[0]; + +#ifdef _OPENMP + SgemmWithPRelu_omp(M, N, K, matrix_a.data(), K, matrix_b.data(), + N, matrix_out->data(), N, p, mode, bias, bias1); +#else + SgemmWithPRelu(M, N, K, matrix_a.data(), K, matrix_b.data(), N, + matrix_out->data(), N, p, mode, bias, bias1); + +#endif +} } // namespace math } // namespace operators diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h index 74a3f5b8f58f5817c3de426d723a273a8a041614..26ec50872b1dccf1bc2f24cfea284de02e57fc9c 100644 --- a/src/operators/math/math_function.h +++ b/src/operators/math/math_function.h @@ -33,6 +33,11 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, framework::Tensor *matrix_out, T beta, bool relu, framework::Tensor *new_scale, framework::Tensor *new_bias, int group); + +void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, + framework::Tensor *matrix_out, float *p, std::string mode, + float *bias, float *bias1); } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 8acfac36183158e19d5060a07ac741d54ecd3066..8f289b24ffa16e4af92ddff77b722fd458bc7c84 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -87,12 +87,21 @@ class OpParam { static T *InputXFrom(const VariableNameMap &inputs, const Scope &scope) { return GetVarValue("X", inputs, scope); } + template + static T *InputXFrom1(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue1("addX", inputs, scope); + } template static T *InputYFrom(const VariableNameMap &inputs, const Scope &scope) { return GetVarValue("Y", inputs, scope); } + template + static T *InputYFrom1(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue1("Y", inputs, scope); + } + template static T *InputZFrom(const VariableNameMap &inputs, const Scope &scope) { return GetVarValue("Z", inputs, scope); @@ -222,6 +231,26 @@ class OpParam { } } + static std::string getkey(const string &key, const VariableNameMap &var_map, + int index) { + auto var_vec = var_map.at(key); + return var_vec[index]; + } + + template + static T *GetVarValue1(const string &key, const VariableNameMap &var_map, + const Scope &scope) { + PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0, + "%s is not contained in var_map", key.c_str()) + auto var_vec = var_map.at(key); + if (!var_vec.empty()) { + auto var = scope.FindVar(var_vec[1]); + return var->GetMutable(); + } else { + return nullptr; + } + } + template static vector GetMultiVarValue(const string &key, const VariableNameMap &var_map, @@ -1184,6 +1213,163 @@ class FusionConvAddReluParam : public FusionConvAddParam { }; #endif +#ifdef FUSION_CONVADDPRELU_OP +template +class FusionConvAddPReluParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionConvAddPReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + alpha_ = InputAlphaFrom(inputs, scope); + mode_ = GetAttr("mode", attrs); + framework::DDim dims = alpha_->dims(); + bias_ = InputYFrom(inputs, scope); + axis_ = GetAttr("axis", attrs); + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + } + const RType *InputAlpha() const { return alpha_; } + const std::string &Mode() const { return mode_; } + RType *Bias() const { return bias_; } + + const int &Axis() const { return axis_; } + + const RType *Input() const { return input_; } + +#ifdef PADDLE_MOBILE_FPGA + RType *Filter() const { return filter_; } +#else + const RType *Filter() const { return filter_; } +#endif + + RType *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + protected: + RType *bias_; + int axis_; + RType *input_; + RType *output_; + RType *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + RType *alpha_; + std::string mode_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::ConvArgs fpga_conv_args; + + public: + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } +#endif +}; +#endif + +#ifdef FUSION_CONVADDADDPRELU_OP +template +class FusionConvAddAddPReluParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionConvAddAddPReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + bias1_ = InputYFrom1(inputs, scope); + alpha_ = InputAlphaFrom(inputs, scope); + mode_ = GetAttr("mode", attrs); + framework::DDim dims = alpha_->dims(); + bias_ = InputYFrom(inputs, scope); + axis_ = GetAttr("axis", attrs); + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + keyOutput_ = getkey("addOut", inputs, 0); + keyX1_ = getkey("addX", inputs, 1); + keyY1_ = getkey("Y", inputs, 1); + if (keyX1_ == keyOutput_) { + bias1_ = InputYFrom1(inputs, scope); + } else if (keyY1_ == keyOutput_) { + bias1_ = InputXFrom1(inputs, scope); + } + } + const RType *InputAlpha() const { return alpha_; } + const std::string &Mode() const { return mode_; } + const RType *Bias1() const { return bias1_; } + + RType *Bias() const { return bias_; } + + const int &Axis() const { return axis_; } + + const RType *Input() const { return input_; } + +#ifdef PADDLE_MOBILE_FPGA + RType *Filter() const { return filter_; } +#else + const RType *Filter() const { return filter_; } +#endif + + RType *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + protected: + RType *bias_; + int axis_; + RType *input_; + RType *output_; + RType *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + RType *alpha_; + std::string mode_; + RType *bias1_; + std::string keyOutput_; + std::string keyX1_; + std::string keyY1_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::ConvArgs fpga_conv_args; + + public: + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } +#endif +}; +#endif + #ifdef FUSION_CONVADDBNRELU_OP template class FusionConvAddBNReluParam : public OpParam { diff --git a/tools/op.cmake b/tools/op.cmake index 74f296646e7df7991e59daedc34ecc68895ad5cb..38c511400532dd73de03aeff6686c881a3c0ad26 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -124,6 +124,7 @@ if(NOT FOUND_MATCH) set(DEPTHWISECONV_OP ON) set(ELEMENTWISEADD_OP ON) set(FUSION_CONVADD_OP ON) + set(FUSION_CONVADDPRELU_OP ON) set(FUSION_CONVADDRELU_OP ON) set(FUSION_FC_OP ON) set(LRN_OP ON) @@ -137,6 +138,7 @@ if(NOT FOUND_MATCH) set(SOFTMAX_OP ON) set(TRANSPOSE_OP ON) set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADDADDPRELU_OP ON) set(FUSION_DWCONVBNRELU_OP ON) set(FUSION_CONVBNRELU_OP ON) set(PRELU_OP ON) @@ -192,6 +194,12 @@ endif() if (FUSION_CONVADDRELU_OP) add_definitions(-DFUSION_CONVADDRELU_OP) endif() +if (FUSION_CONVADDPRELU_OP) + add_definitions(-DFUSION_CONVADDPRELU_OP) +endif() +if (FUSION_CONVADDADDPRELU_OP) + add_definitions(-DFUSION_CONVADDADDPRELU_OP) +endif() if (FUSION_FC_OP) add_definitions(-DFUSION_FC_OP) endif()