diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h index 98af2ca6053fe544b49df4510b74ad0ac505b009..e72c55f5f736b81362f461952a706127998f9ade 100644 --- a/src/framework/load_ops.h +++ b/src/framework/load_ops.h @@ -125,10 +125,6 @@ LOAD_OP1(prior_box, CPU); LOAD_OP2(fusion_conv_add_relu, CPU, FPGA); LOAD_FUSION_MATCHER(fusion_conv_add_relu); #endif -#ifdef FUSION_CONVADDADDPRELU_OP -LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA); -LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu); -#endif #ifdef FUSION_CONVADD_OP LOAD_OP2(fusion_conv_add, CPU, MALI_GPU); LOAD_FUSION_MATCHER(fusion_conv_add); @@ -178,10 +174,6 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn); #ifdef DROPOUT_OP LOAD_OP2(dropout, CPU, FPGA); #endif -#ifdef FUSION_CONVADDPRELU_OP -LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA); -LOAD_FUSION_MATCHER(fusion_conv_add_prelu); -#endif #ifdef FUSION_DWCONVBNRELU_OP LOAD_OP1(fusion_dwconv_bn_relu, CPU); LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu); diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp index 2c70f42f56530c2d21252d6b51c228e7c49ca8bf..ad778b1fef7fe400e1df645703cf3ebfb1b22727 100644 --- a/src/operators/conv_op.cpp +++ b/src/operators/conv_op.cpp @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "framework/op_proto_maker.h" #include "framework/op_registry.h" -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -39,9 +39,9 @@ void ConvOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp index 2e7f193c5c9f66668411bb115da9d3cd980f8a6b..0e74654e1f661d55a263f9f9b57a1ba2a32dfd74 100644 --- a/src/operators/depthwise_conv_op.cpp +++ b/src/operators/depthwise_conv_op.cpp @@ -19,7 +19,7 @@ limitations under the License. */ #include "framework/op_proto_maker.h" #include "framework/op_registry.h" #include "operators/conv_op.h" -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -40,9 +40,9 @@ void DepthwiseConvOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); diff --git a/src/operators/fusion_conv_add_add_prelu_op.cpp b/src/operators/fusion_conv_add_add_prelu_op.cpp deleted file mode 100644 index 2f3d29dc74ed3a852b5c41a64d46b8710ebec599..0000000000000000000000000000000000000000 --- a/src/operators/fusion_conv_add_add_prelu_op.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDADDPRELU_OP - -#include "operators/fusion_conv_add_add_prelu_op.h" -#include "operators/math/conv_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvAddAddPReluOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); - } - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu, - ops::FusionConvAddAddPReluOpMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); -#endif - -#endif // FUSION_CONVADDADDPRELU_OP diff --git a/src/operators/fusion_conv_add_add_prelu_op.h b/src/operators/fusion_conv_add_add_prelu_op.h deleted file mode 100644 index 8ecb4aa715e34a2a9e67abf097ab5622a6dacf19..0000000000000000000000000000000000000000 --- a/src/operators/fusion_conv_add_add_prelu_op.h +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDADDPRELU_OP - -#pragma once - -#include -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_add_add_prelu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher { - public: - FusionConvAddAddPReluOpMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_PRELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, - {{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}}, - {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}}, - - removed_nodes); - } - std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; } - - std::vector> NeedCheck() { - DLOG << " conv add add prelu check add X "; - return {{2, "Y"}, {2, "X"}}; - } -}; - -template -class FusionConvAddAddPReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionConvAddAddPReluParam, - operators::ConvAddAddPReluKernel> { - public: - FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionConvAddAddPReluParam, - operators::ConvAddAddPReluKernel>( - type, inputs, outputs, attrs, scope) {} - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/fusion_conv_add_bn_op.cpp b/src/operators/fusion_conv_add_bn_op.cpp index e8daba7e9ba209cf078323ea79dd6f6a9b6e8200..27e3c04d62c29abe69adef7457bc633d294e2cdc 100644 --- a/src/operators/fusion_conv_add_bn_op.cpp +++ b/src/operators/fusion_conv_add_bn_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef FUSION_CONVADDBN_OP #include "operators/fusion_conv_add_bn_op.h" -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -36,9 +36,9 @@ void FusionConvAddBNOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp index b9bc948fe0e77741a36f959e29eb2a4c82e82b72..8f162a2d29de32340b8f7f3fe3094a230212929d 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.cpp +++ b/src/operators/fusion_conv_add_bn_relu_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef FUSION_CONVADDBNRELU_OP #include "operators/fusion_conv_add_bn_relu_op.h" -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -36,9 +36,9 @@ void FusionConvAddBNReluOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); diff --git a/src/operators/fusion_conv_add_op.cpp b/src/operators/fusion_conv_add_op.cpp index 731bb631bb98490d580e0c6fe28c24312f6ccb57..49cf29c38e40f5a55fa0546e988d2860a6842f6b 100644 --- a/src/operators/fusion_conv_add_op.cpp +++ b/src/operators/fusion_conv_add_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef FUSION_CONVADD_OP #include "operators/fusion_conv_add_op.h" -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -36,9 +36,9 @@ void FusionConvAddOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); diff --git a/src/operators/fusion_conv_add_prelu_op.cpp b/src/operators/fusion_conv_add_prelu_op.cpp deleted file mode 100644 index 9273af388c2c0a8644b29e1f40a5238b0e092523..0000000000000000000000000000000000000000 --- a/src/operators/fusion_conv_add_prelu_op.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDPRELU_OP - -#include "operators/fusion_conv_add_prelu_op.h" -#include "operators/math/conv_func.h" - -namespace paddle_mobile { -namespace operators { - -template -void FusionConvAddPReluOp::InferShape() const { - auto in_dims = this->param_.Input()->dims(); - auto filter_dims = this->param_.Filter()->dims(); - const std::vector &strides = this->param_.Strides(); - std::vector paddings = this->param_.Paddings(); - int groups = this->param_.Groups(); - std::vector dilations = this->param_.Dilations(); - - PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && - dilations.size() == paddings.size() && - paddings.size() == strides.size()), - "ConvParam is not suitable"); - - std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); - } - framework::DDim ddim = framework::make_ddim(output_shape); - this->param_.Output()->Resize(ddim); -} - -} // namespace operators -} // namespace paddle_mobile - -namespace ops = paddle_mobile::operators; -REGISTER_FUSION_MATCHER(fusion_conv_add_prelu, - ops::FusionConvAddPReluOpMatcher); - -#ifdef PADDLE_MOBILE_CPU -REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); -#endif -#ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); -#endif - -#endif diff --git a/src/operators/fusion_conv_add_prelu_op.h b/src/operators/fusion_conv_add_prelu_op.h deleted file mode 100644 index 25aab72634da52c1a344d39b3f041531065efc5b..0000000000000000000000000000000000000000 --- a/src/operators/fusion_conv_add_prelu_op.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDPRELU_OP - -#pragma once - -#include -#include -#include "framework/operator.h" -#include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/conv_add_prelu_kernel.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher { - public: - FusionConvAddPReluOpMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_PRELU); - } - - void FolderNodes( - framework::Node *node, - std::vector> *removed_nodes) { - node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}}, - removed_nodes); - } - std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; } -}; - -template -class FusionConvAddPReluOp - : public framework::OperatorWithKernel< - DeviceType, FusionConvAddPReluParam, - operators::ConvAddPReluKernel> { - public: - FusionConvAddPReluOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - framework::Scope *scope) - : framework::OperatorWithKernel< - DeviceType, FusionConvAddPReluParam, - operators::ConvAddPReluKernel>(type, inputs, outputs, - attrs, scope) {} - - void InferShape() const override; - - protected: -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index bb4b6666a881de0989d43840806b9d5d720b3b66..163dfba3cc8706dac96697974ef7224b3f625ae1 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef FUSION_CONVADDRELU_OP #include "operators/fusion_conv_add_relu_op.h" -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -36,9 +36,9 @@ void FusionConvAddReluOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); this->param_.Output()->Resize(ddim); diff --git a/src/operators/fusion_conv_bn_add_relu_op.cpp b/src/operators/fusion_conv_bn_add_relu_op.cpp index 9a3926353319aa267814097d93a6d9b1fa20bd2d..c2bb2c744d5599558f14e2f1d169b00a1492e135 100644 --- a/src/operators/fusion_conv_bn_add_relu_op.cpp +++ b/src/operators/fusion_conv_bn_add_relu_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef FUSION_CONVBNADDRELU_OP #include "operators/fusion_conv_bn_add_relu_op.h" -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -36,9 +36,9 @@ void FusionConvBNAddReluOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); diff --git a/src/operators/fusion_conv_bn_op.cpp b/src/operators/fusion_conv_bn_op.cpp index 7786cd713b5f838e22aa3080697d551609d81036..4939123a77a072ea410bfa96547b8a0ed276c28d 100644 --- a/src/operators/fusion_conv_bn_op.cpp +++ b/src/operators/fusion_conv_bn_op.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef FUSION_CONVBN_OP #include "operators/fusion_conv_bn_op.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -35,9 +36,9 @@ void FusionConvBNOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); diff --git a/src/operators/fusion_conv_bn_relu_op.cpp b/src/operators/fusion_conv_bn_relu_op.cpp index 54c9f85cbb7dc00bd0df5747caac8fd2ee9e2782..0e8eec65f2e46e1314c11b7f6bceade861445ef6 100644 --- a/src/operators/fusion_conv_bn_relu_op.cpp +++ b/src/operators/fusion_conv_bn_relu_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef FUSION_CONVBNRELU_OP #include "operators/fusion_conv_bn_relu_op.h" -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -36,9 +36,9 @@ void FusionConvBNReluOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); diff --git a/src/operators/fusion_dwconv_bn_relu_op.cpp b/src/operators/fusion_dwconv_bn_relu_op.cpp index f5040987e42f9c0b3068d730a9926b9fcff8c8c3..d4c04f67fc637266cf95af2e7fe518682e212d98 100644 --- a/src/operators/fusion_dwconv_bn_relu_op.cpp +++ b/src/operators/fusion_dwconv_bn_relu_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef FUSION_DWCONVBNRELU_OP #include "operators/fusion_dwconv_bn_relu_op.h" -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { @@ -36,9 +36,9 @@ void FusionDWConvBNReluOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back( - math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], - paddings[i], strides[i])); + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); diff --git a/src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp deleted file mode 100644 index 2f6f5f3ac719b3fd32aac54ce36eb534f7d99dd7..0000000000000000000000000000000000000000 --- a/src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDADDPRELU_OP - -#include "operators/kernel/conv_add_add_prelu_kernel.h" -#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddAddPReluKernel::Init( - FusionConvAddAddPReluParam *param) { - return true; -} - -template <> -void ConvAddAddPReluKernel::Compute( - const FusionConvAddAddPReluParam ¶m) { - ConvAddAddPReluCompute(param); -} -template class ConvAddAddPReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp index ae67147ffdc4418b263458be16ecb9d33a89d8a0..f9489330ca33a7b055ff91e9c8e259d1feb1e827 100644 --- a/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "operators/kernel/arm/convolution/conv_common.h" #include "operators/kernel/central-arm-func/conv_arm_func.h" +#include "operators/math/channel_wise.h" namespace paddle_mobile { namespace operators { @@ -62,34 +63,24 @@ void ConvAddBNReluKernel::Compute( const FusionConvAddBNReluParam ¶m) { switch (param.ExecMode()) { case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - math::DepthwiseConv3x3S1(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); - break; case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - math::DepthwiseConv3x3S2(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); + DepthwiseConv3x3(param); break; case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: DepthwiseConv5x5(param); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); break; case ConvParam::EXEC_WINOGRAD3X3_FLOAT: WinogradConv3x3<8, 3>(param); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); break; case ConvParam::EXEC_GEMM_FLOAT: - ConvBNReluBasic>(param); + GemmConv(param); break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); } + math::ScaleAddChannelWise(param.Output(), param.NewScale(), + param.NewBias(), param.Output()); } template class ConvAddBNReluKernel; diff --git a/src/operators/kernel/arm/convolution/conv_add_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_kernel.cpp index 76c2200df3b6607cad51d2430e7b5a2cda41c6cd..5a44b083a37b19637c053655e23196385d432971 100644 --- a/src/operators/kernel/arm/convolution/conv_add_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_add_kernel.cpp @@ -16,8 +16,8 @@ limitations under the License. */ #include "operators/kernel/conv_add_kernel.h" #include "operators/kernel/arm/convolution/conv_common.h" -#include "operators/kernel/central-arm-func/conv_add_arm_func.h" #include "operators/kernel/central-arm-func/conv_arm_func.h" +#include "operators/math/channel_wise.h" namespace paddle_mobile { namespace operators { @@ -32,34 +32,25 @@ template <> void ConvAddKernel::Compute(const FusionConvAddParam ¶m) { switch (param.ExecMode()) { case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - math::DepthwiseConv3x3S1(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::AddChannelWise(param.Output(), param.Bias(), - param.Output()); break; case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: math::DepthwiseConv3x3S2(*param.Input(), *param.Filter(), param.Paddings(), param.Output()); - math::AddChannelWise(param.Output(), param.Bias(), - param.Output()); break; case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: DepthwiseConv5x5(param); - math::AddChannelWise(param.Output(), param.Bias(), - param.Output()); break; case ConvParam::EXEC_WINOGRAD3X3_FLOAT: WinogradConv3x3<8, 3>(param); - math::AddChannelWise(param.Output(), param.Bias(), - param.Output()); break; case ConvParam::EXEC_GEMM_FLOAT: - ConvAddBasic(param); + GemmConv(param); break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); } + math::AddChannelWise(param.Output(), param.Bias(), param.Output()); } template class ConvAddKernel; diff --git a/src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp deleted file mode 100644 index f04a9a7d746f2d970196945707bd05409c5fa340..0000000000000000000000000000000000000000 --- a/src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDPRELU_OP - -#include "operators/kernel/conv_add_prelu_kernel.h" -#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool ConvAddPReluKernel::Init(FusionConvAddPReluParam *param) { - return true; -} - -template <> -void ConvAddPReluKernel::Compute( - const FusionConvAddPReluParam ¶m) { - ConvAddPReluCompute(param); -} -template class ConvAddPReluKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp index e0387f6444bceb3cee8d9be34530fe2d81d56af5..a9efae96e94afa24b48ed46214ff1fdd8ec50d83 100644 --- a/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp @@ -17,6 +17,7 @@ limitations under the License. */ #include "operators/kernel/conv_add_relu_kernel.h" #include "operators/kernel/arm/convolution/conv_common.h" #include "operators/kernel/central-arm-func/conv_arm_func.h" +#include "operators/math/channel_wise.h" namespace paddle_mobile { namespace operators { @@ -32,30 +33,23 @@ void ConvAddReluKernel::Compute( const FusionConvAddReluParam ¶m) { switch (param.ExecMode()) { case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - math::DepthwiseConv3x3S1(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::AddChannelWise(param.Output(), param.Bias(), param.Output()); - break; case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - math::DepthwiseConv3x3S2(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::AddChannelWise(param.Output(), param.Bias(), param.Output()); + DepthwiseConv3x3(param); break; case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: DepthwiseConv5x5(param); - math::AddChannelWise(param.Output(), param.Bias(), param.Output()); break; case ConvParam::EXEC_WINOGRAD3X3_FLOAT: WinogradConv3x3<8, 3>(param); - math::AddChannelWise(param.Output(), param.Bias(), param.Output()); break; case ConvParam::EXEC_GEMM_FLOAT: - ConvAddReluBasic>(param); + GemmConv(param); break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); } + math::AddChannelWise(param.Output(), param.Bias(), param.Output()); } template class ConvAddReluKernel; diff --git a/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp index f591833887960adc37fe741d8a36946474ffcf8e..26e0e343675d8e90e23d38084537bac99793142b 100644 --- a/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "operators/kernel/arm/convolution/conv_common.h" #include "operators/kernel/central-arm-func/conv_arm_func.h" +#include "operators/math/channel_wise.h" namespace paddle_mobile { namespace operators { @@ -62,34 +63,24 @@ void ConvBNAddReluKernel::Compute( const FusionConvBNAddReluParam ¶m) { switch (param.ExecMode()) { case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - math::DepthwiseConv3x3S1(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); - break; case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - math::DepthwiseConv3x3S2(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); + DepthwiseConv3x3(param); break; case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: DepthwiseConv5x5(param); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); break; case ConvParam::EXEC_WINOGRAD3X3_FLOAT: WinogradConv3x3<8, 3>(param); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); break; case ConvParam::EXEC_GEMM_FLOAT: - ConvBNReluBasic>(param); + GemmConv(param); break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); } + math::ScaleAddChannelWise(param.Output(), param.NewScale(), + param.NewBias(), param.Output()); } template class ConvBNAddReluKernel; diff --git a/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp index 352df8a38996900dc53f0483c2ff28133444b066..15129d72895a89a4cba918d7a8da747a17962f58 100644 --- a/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "operators/kernel/arm/convolution/conv_common.h" #include "operators/kernel/central-arm-func/conv_arm_func.h" +#include "operators/math/channel_wise.h" namespace paddle_mobile { namespace operators { @@ -61,34 +62,24 @@ void ConvBNReluKernel::Compute( const FusionConvBNReluParam ¶m) { switch (param.ExecMode()) { case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - math::DepthwiseConv3x3S1(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); - break; case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - math::DepthwiseConv3x3S1(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); + DepthwiseConv3x3(param); break; case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: DepthwiseConv5x5(param); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); break; case ConvParam::EXEC_WINOGRAD3X3_FLOAT: WinogradConv3x3<8, 3>(param); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); break; case ConvParam::EXEC_GEMM_FLOAT: - ConvBNReluBasic>(param); + GemmConv(param); break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); } + math::ScaleAddChannelWise(param.Output(), param.NewScale(), + param.NewBias(), param.Output()); } template class ConvBNReluKernel; diff --git a/src/operators/kernel/arm/convolution/conv_kernel.cpp b/src/operators/kernel/arm/convolution/conv_kernel.cpp index 6771b88d4b981881232c73e2821f97044a008148..1c6ac2015daeab4f7ffe8a3e178222a6f4c4c4e8 100644 --- a/src/operators/kernel/arm/convolution/conv_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_kernel.cpp @@ -32,10 +32,10 @@ bool ConvKernel::Init(ConvParam *param) { template <> void ConvKernel::Compute(const ConvParam ¶m) { switch (param.ExecMode()) { +#ifndef __aarch64__ case ConvParam::EXEC_GEMM_INT8: GemmConv(param); break; -#ifndef __aarch64__ case ConvParam::EXEC_DEPTHWISE3x3_INT8: DepthwiseConv3x3(param); break; @@ -44,12 +44,8 @@ void ConvKernel::Compute(const ConvParam ¶m) { break; #endif // __aarch64__ case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - math::DepthwiseConv3x3S1(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - break; case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - math::DepthwiseConv3x3S2(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); + DepthwiseConv3x3(param); break; case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: DepthwiseConv5x5(param); diff --git a/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp index 9b5f87b1a776b741be17cbca8cae3b15259a7253..748845e23e9290f9e40f7d63137be3b811bafebd 100644 --- a/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "operators/kernel/arm/convolution/conv_common.h" #include "operators/kernel/central-arm-func/conv_arm_func.h" +#include "operators/math/channel_wise.h" namespace paddle_mobile { namespace operators { @@ -61,37 +62,28 @@ void DWConvBNReluKernel::Compute( const FusionDWConvBNReluParam ¶m) { switch (param.ExecMode()) { case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: - math::DepthwiseConv3x3S1(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); - break; case ConvParam::EXEC_DEPTHWISE3x3S2_FLOAT: - math::DepthwiseConv3x3S2(*param.Input(), *param.Filter(), - param.Paddings(), param.Output()); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); + DepthwiseConv3x3(param); break; #ifndef __aarch64__ case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: DepthwiseConv5x5(param); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); break; case ConvParam::EXEC_WINOGRAD3X3_FLOAT: WinogradConv3x3<8, 3>(param); - math::ScaleAddChannelWise(param.Output(), param.NewScale(), - param.NewBias(), param.Output()); break; #endif // __aarch64__ case ConvParam::EXEC_GEMM_FLOAT: - ConvBNReluBasic>(param); + GemmConv(param); break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); } + math::ScaleAddChannelWise(param.Output(), param.NewScale(), + param.NewBias(), param.Output()); } + template class DWConvBNReluKernel; } // namespace operators diff --git a/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h deleted file mode 100644 index 4c9ca6e3e8ef995e9cce6f565aafece17ac51b10..0000000000000000000000000000000000000000 --- a/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDADDPRELU_OP -#pragma once - -#include -#include -#include "operators/math/conv_func.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - Tensor bias1 = *param.Bias1(); - Tensor *output = param.Output(); - output->mutable_data(); - float *biase_data = bias.data(); - - int axis = param.Axis(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - Tensor aa = *param.InputAlpha(); - float *p = aa.data(); - - std::string mode = param.Mode(); - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor bias1_batch = bias1.Slice(i, i + 1).Resize(output_matrix_shape); - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step); - float *biase_data1 = bias1_slice.data(); - math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice, - p, mode, biase_data, biase_data1); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // FUSION_CONVADDADDPRELU_OP diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h deleted file mode 100644 index 24b35229b31151348475a293611cb4402999b3b7..0000000000000000000000000000000000000000 --- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADD_OP -#pragma once - -#include -#include "operators/math/conv_func.h" -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -void ConvAddBasic(const FusionConvAddParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - Tensor *output = param.Output(); - output->mutable_data(); - float *biase_data = bias.data(); - - int axis = param.Axis(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::MatMul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(1), false, biase_data); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h deleted file mode 100644 index d11a8442acdd275c95aaa96b2c3e1855e44746e9..0000000000000000000000000000000000000000 --- a/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDPRELU_OP -#pragma once - -#include -#include -#include "operators/math/conv_func.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void ConvAddPReluCompute(const FusionConvAddPReluParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - Tensor *output = param.Output(); - output->mutable_data(); - float *biase_data = bias.data(); - - int axis = param.Axis(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - Tensor aa = *param.InputAlpha(); - float *p = aa.data(); - std::string mode = param.Mode(); - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice, - p, mode, biase_data, nullptr); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif // FUSION_CONVADDPRELU_OP diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.cpp b/src/operators/kernel/central-arm-func/conv_arm_func.cpp new file mode 100644 index 0000000000000000000000000000000000000000..495963d470457513dd70489bb04f4de46ffdedcc --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_arm_func.cpp @@ -0,0 +1,242 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/central-arm-func/conv_arm_func.h" +#include +#include "operators/math/depthwise_conv3x3.h" +#include "operators/math/depthwise_conv5x5.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/pad.h" +#include "operators/math/vol2col.h" +#include "operators/math/winograd/winograd_transform.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +int ConvOutputSize(int input_size, int filter_size, int dilation, int padding, + int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + return output_size; +} + +bool IsExpand(const std::vector &filter_dim, + const std::vector &strides, const std::vector &paddings, + const std::vector &dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +template +void GemmConv(const ConvParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor *output = param.Output(); + output->mutable_data(); + + int groups = param.Groups(); + const std::vector strides = param.Strides(); + const std::vector paddings = param.Paddings(); + const std::vector dilations = param.Dilations(); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + const int batch_size = static_cast(input->dims()[0]); + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + // col_matrix.ShareDataWith(in_slice); + col_matrix = in_slice; + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + math::MatMul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0), false, + static_cast(nullptr)); + } + } +} + +template +void WinogradConv3x3(const ConvParam ¶m) { + const Tensor *input = param.Input(); + const Tensor *filter = param.transformed_filter_; + Tensor *output = param.Output(); + output->mutable_data(); + int batch_size = input->dims()[0]; + int groups = param.Groups(); + const std::vector &paddings = param.Paddings(); + + auto winograd_pad = [&](int width, int pad) { + int output_tile = tile - kernel + 1; + // int tiles = (width + pad - kernel) / output_tile + 1; + // return (tiles - 1) * output_tile + tile - width; + int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile; + return pad_width + tile - width; + }; + + math::PadFunctor pad; + Tensor input_pad; + framework::Tensor transformed_input; + for (int i = 0; i < batch_size; ++i) { + Tensor in_batch = input->Slice(i, i + 1); + Tensor out_batch = output->Slice(i, i + 1); + // int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]); + // int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]); + int pad_bottom = paddings[0]; + int pad_right = paddings[1]; + if (paddings[0] || paddings[1] || pad_bottom || pad_right) { + framework::DDim pad_shape = in_batch.dims(); + pad_shape[2] += paddings[0] + pad_bottom; + pad_shape[3] += paddings[1] + pad_right; + input_pad.mutable_data(pad_shape); + pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right, + &input_pad); + } else { + input_pad = in_batch; + } + // tile input and transform + math::winograd_transform_input(input_pad, &transformed_input); + // caculate output + math::winograd_transform_output(transformed_input, *filter, + output); + } +} + +template +void DepthwiseConv3x3(const ConvParam ¶m) { + const Tensor *input = param.Input(); + const Tensor *filter = param.Filter(); + const std::vector &paddings = param.Paddings(); + const std::vector &strides = param.Strides(); + const int batch_size = input->dims()[0]; + Tensor *output = param.Output(); + output->mutable_data(); + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1); + Tensor out_batch = output->Slice(i, i + 1); + if (strides[0] == 1) { + math::DepthwiseConv3x3S1(in_batch, *filter, paddings, + &out_batch); + } else if (strides[0] == 2) { + math::DepthwiseConv3x3S2(in_batch, *filter, paddings, + &out_batch); + } else { + GemmConv(param); + } + } +} + +template +void DepthwiseConv5x5(const ConvParam ¶m) { + const Tensor *input = param.Input(); + const Tensor *filter = param.Filter(); + const std::vector &paddings = param.Paddings(); + const std::vector &strides = param.Strides(); + const int batch_size = input->dims()[0]; + Tensor *output = param.Output(); + output->mutable_data(); + + // if (strides[0] == 1) { + // for (int i = 0; i < batch_size; i++) { + // Tensor in_batch = input->Slice(i, i + 1); + // Tensor out_batch = output->Slice(i, i + 1); + // math::DepthwiseConv5x5S1(in_batch, *filter, paddings, + // &out_batch); + // } + // } else { + GemmConv(param); + // } +} + +template void GemmConv(const ConvParam ¶m); +template void WinogradConv3x3<8, 3>(const ConvParam ¶m); +template void DepthwiseConv3x3(const ConvParam ¶m); +template void DepthwiseConv5x5(const ConvParam ¶m); + +#ifndef __aarch64__ +template void GemmConv(const ConvParam ¶m); +template void DepthwiseConv3x3(const ConvParam ¶m); +template void DepthwiseConv5x5(const ConvParam ¶m); +#endif + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h index b527c0bad9ad295d76d11be683a34492d4b0d5d9..52bcbbb7c6f76e7e68da4c8a10271bb1bac35adf 100644 --- a/src/operators/kernel/central-arm-func/conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -15,386 +15,31 @@ limitations under the License. */ #ifdef CONV_OP #pragma once + #include -#include "operators/math/conv_func.h" -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/depthwise_conv5x5.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/pad.h" -#include "operators/math/vol2col.h" -#include "operators/math/winograd/winograd_transform.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { -template -inline void GemmConv(const ConvParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor *output = param.Output(); - output->mutable_data(); - - int groups = param.Groups(); - const std::vector strides = param.Strides(); - const std::vector paddings = param.Paddings(); - const std::vector dilations = param.Dilations(); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - const int batch_size = static_cast(input->dims()[0]); - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - // col_matrix.ShareDataWith(in_slice); - col_matrix = in_slice; - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } +int ConvOutputSize(int input_size, int filter_size, int dilation, int padding, + int stride); - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); +bool IsExpand(const std::vector &filter_dim, + const std::vector &strides, const std::vector &paddings, + const std::vector &dilations); - math::MatMul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0), false, - static_cast(nullptr)); - } - } -} +template +void GemmConv(const ConvParam ¶m); template -inline void WinogradConv3x3(const ConvParam ¶m) { - const Tensor *input = param.Input(); - const Tensor *filter = param.transformed_filter_; - Tensor *output = param.Output(); - output->mutable_data(); - int batch_size = input->dims()[0]; - int groups = param.Groups(); - const std::vector &paddings = param.Paddings(); - - auto winograd_pad = [&](int width, int pad) { - int output_tile = tile - kernel + 1; - // int tiles = (width + pad - kernel) / output_tile + 1; - // return (tiles - 1) * output_tile + tile - width; - int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile; - return pad_width + tile - width; - }; +void WinogradConv3x3(const ConvParam ¶m); - math::PadFunctor pad; - Tensor input_pad; - framework::Tensor transformed_input; - for (int i = 0; i < batch_size; ++i) { - Tensor in_batch = input->Slice(i, i + 1); - Tensor out_batch = output->Slice(i, i + 1); - // int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]); - // int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]); - int pad_bottom = paddings[0]; - int pad_right = paddings[1]; - if (paddings[0] || paddings[1] || pad_bottom || pad_right) { - framework::DDim pad_shape = in_batch.dims(); - pad_shape[2] += paddings[0] + pad_bottom; - pad_shape[3] += paddings[1] + pad_right; - input_pad.mutable_data(pad_shape); - pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right, - &input_pad); - } else { - input_pad = in_batch; - } - // tile input and transform - math::winograd_transform_input(input_pad, &transformed_input); - // caculate output - math::winograd_transform_output(transformed_input, *filter, - output); - } -} - -#ifndef __aarch64__ -// int8 DepthwiseConv3x3 template -inline void DepthwiseConv3x3(const ConvParam ¶m) { - const Tensor *input = param.Input(); - const Tensor *filter = param.Filter(); - const std::vector &paddings = param.Paddings(); - const std::vector &strides = param.Strides(); - const int batch_size = input->dims()[0]; - Tensor *output = param.Output(); - output->mutable_data(); - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1); - Tensor out_batch = output->Slice(i, i + 1); - if (strides[0] == 1) { - math::DepthwiseConv3x3S1(in_batch, *filter, paddings, - &out_batch); - } else if (strides[0] == 2) { - math::DepthwiseConv3x3S2(in_batch, *filter, paddings, - &out_batch); - } else { - GemmConv(param); - } - } -} -#endif // __aarch64__ +void DepthwiseConv3x3(const ConvParam ¶m); template -inline void DepthwiseConv5x5(const ConvParam ¶m) { - const Tensor *input = param.Input(); - const Tensor *filter = param.Filter(); - const std::vector &paddings = param.Paddings(); - const std::vector &strides = param.Strides(); - const int batch_size = input->dims()[0]; - Tensor *output = param.Output(); - output->mutable_data(); - - // if (strides[0] == 1) { - // for (int i = 0; i < batch_size; i++) { - // Tensor in_batch = input->Slice(i, i + 1); - // Tensor out_batch = output->Slice(i, i + 1); - // math::DepthwiseConv5x5S1(in_batch, *filter, paddings, - // &out_batch); - // } - // } else { - GemmConv(param); - // } -} - -template -void ConvAddReluBasic(const ParamType ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - - Tensor *output = param.Output(); - output->mutable_data(); - - float alpha = 1.0f; - float beta = 1.0f; - int32_t groups = param.Groups(); - int32_t axis = param.Axis(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int32_t batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int32_t in_step = static_cast(input->dims()[1]) / groups; - int32_t out_step = static_cast(output->dims()[1]) / groups; - - float *bias_data = bias.data(); - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int32_t i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int32_t g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col_matrix = in_slice; - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - math::MatMul(filter_slice, false, col_matrix, false, alpha, - &out_slice, beta, true, bias_data); - } - } -} - -template -void ConvBNReluBasic(const ParamType ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor new_bias = *param.NewBias(); - Tensor new_scale = *param.NewScale(); - Tensor *output = param.Output(); - output->mutable_data(); - - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col_matrix = in_slice; - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - math::MatMulWithBn(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0), true, &new_scale, &new_bias, g); - } - } -} +void DepthwiseConv5x5(const ConvParam ¶m); } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h deleted file mode 100644 index 9e32d20291a7aa364eb003225de7a6d9ff45d03e..0000000000000000000000000000000000000000 --- a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVBNADDRELU_OP - -#pragma once - -#include -#include "operators/math/depthwise_conv3x3.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { -void ConvBNAddReluBasic(const FusionConvBNAddReluParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor new_bias = *param.NewBias(); - Tensor new_scale = *param.NewScale(); - Tensor *bias1 = param.Bias(); - Tensor *output = param.Output(); - output->mutable_data(); - - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape); - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step); - math::MatMulWithBn(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(1), true, &new_scale, &new_bias, g, - bias_data.data()); - } - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/conv_add_add_prelu_kernel.h b/src/operators/kernel/conv_add_add_prelu_kernel.h deleted file mode 100644 index fadaf7564ceeb7a52215dc335135016be02bc1ab..0000000000000000000000000000000000000000 --- a/src/operators/kernel/conv_add_add_prelu_kernel.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef FUSION_CONVADDADDPRELU_OP - -#include -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/conv_func.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class ConvAddAddPReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvAddAddPReluParam ¶m); - bool Init(FusionConvAddAddPReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/conv_add_bn_kernel.h b/src/operators/kernel/conv_add_bn_kernel.h index 7a921ecc7d0f4498cae80fbb9cea1b13e4c94101..757664eb536f871811964608c8ad709c416d126c 100644 --- a/src/operators/kernel/conv_add_bn_kernel.h +++ b/src/operators/kernel/conv_add_bn_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "framework/ddim.h" #include "framework/operator.h" -#include "operators/math/conv_func.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h index 3f088528fc901987873038c7e1dd779dcc2019e7..919c66106eda1159f14c40e768325f1f5dcf5ff6 100644 --- a/src/operators/kernel/conv_add_bn_relu_kernel.h +++ b/src/operators/kernel/conv_add_bn_relu_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "framework/ddim.h" #include "framework/operator.h" -#include "operators/math/conv_func.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h index 140d0475a8ee2f017a7c587c38429ccbb2edd387..fd3f279a7829a5803da6e08c0280435443425ad0 100644 --- a/src/operators/kernel/conv_add_kernel.h +++ b/src/operators/kernel/conv_add_kernel.h @@ -23,7 +23,6 @@ limitations under the License. */ #include "common/common.h" #include "framework/ddim.h" #include "framework/operator.h" -#include "operators/math/conv_func.h" #include "operators/math/depthwise_conv3x3.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" diff --git a/src/operators/kernel/conv_add_prelu_kernel.h b/src/operators/kernel/conv_add_prelu_kernel.h deleted file mode 100644 index 631982789b09c57d0d21186d0a30df7368d2955f..0000000000000000000000000000000000000000 --- a/src/operators/kernel/conv_add_prelu_kernel.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef FUSION_CONVADDPRELU_OP - -#include -#include "framework/ddim.h" -#include "framework/operator.h" -#include "operators/math/conv_func.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -using framework::DDim; -using framework::OpKernelBase; - -template -class ConvAddPReluKernel - : public OpKernelBase> { - public: - void Compute(const FusionConvAddPReluParam ¶m); - bool Init(FusionConvAddPReluParam *param); -}; - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/conv_add_relu_kernel.h b/src/operators/kernel/conv_add_relu_kernel.h index e001926b361da96ec3ff76e120bc3d1ad13714fa..8cfc92ef19937650f1835e16eb26c1bf59f2d345 100644 --- a/src/operators/kernel/conv_add_relu_kernel.h +++ b/src/operators/kernel/conv_add_relu_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "framework/ddim.h" #include "framework/operator.h" -#include "operators/math/conv_func.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" diff --git a/src/operators/kernel/conv_bn_add_relu_kernel.h b/src/operators/kernel/conv_bn_add_relu_kernel.h index dcd8fecf07fbb4ea75b382f5315e24e64e26e939..63a86b56538a259b783a6a99536b6c5be15d915a 100644 --- a/src/operators/kernel/conv_bn_add_relu_kernel.h +++ b/src/operators/kernel/conv_bn_add_relu_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "framework/ddim.h" #include "framework/operator.h" -#include "operators/math/conv_func.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" diff --git a/src/operators/kernel/conv_bn_kernel.h b/src/operators/kernel/conv_bn_kernel.h index e669f3bdd85dbd89e3a48d417dcd0cd6b9706062..1fb0d680cf4584e2433af254cca25bc52a3b9e03 100644 --- a/src/operators/kernel/conv_bn_kernel.h +++ b/src/operators/kernel/conv_bn_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "framework/ddim.h" #include "framework/operator.h" -#include "operators/math/conv_func.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" diff --git a/src/operators/kernel/conv_bn_relu_kernel.h b/src/operators/kernel/conv_bn_relu_kernel.h index 91b3413116ae22a8e212cf149c4e0c2a8924664a..f63b61ab09f90c8c40738cbe94ec6ebcff9420ff 100644 --- a/src/operators/kernel/conv_bn_relu_kernel.h +++ b/src/operators/kernel/conv_bn_relu_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "framework/ddim.h" #include "framework/operator.h" -#include "operators/math/conv_func.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" diff --git a/src/operators/kernel/dwconv_bn_relu_kernel.h b/src/operators/kernel/dwconv_bn_relu_kernel.h index f2e4c0afbd0aaafff5339816764f9e30592f122c..3bd8093adb539d8fc0f6ea4b400b9ff864e1b664 100644 --- a/src/operators/kernel/dwconv_bn_relu_kernel.h +++ b/src/operators/kernel/dwconv_bn_relu_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "framework/ddim.h" #include "framework/operator.h" -#include "operators/math/conv_func.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" diff --git a/src/operators/kernel/lrn_kernel.h b/src/operators/kernel/lrn_kernel.h index 99dbfe2d658cde17e6399f8ea4bc5b945092cde5..486c828acab6d24741baae5804f09bc3e850b02f 100644 --- a/src/operators/kernel/lrn_kernel.h +++ b/src/operators/kernel/lrn_kernel.h @@ -15,24 +15,21 @@ limitations under the License. */ #pragma once #ifdef LRN_OP + +#include #ifdef _OPENMP #include #endif -#include "framework/operator.h" -#include "operators/op_param.h" - -#include - #ifdef __ARM_NEON -#include "arm_neon.h" -#include "operators/math/math_func_neon.h" +#include +#include "operators/math/math.h" #endif +#include "framework/operator.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { -using namespace framework; - template struct LRNFunctor { void operator()(const framework::Tensor &input, framework::Tensor *out, int N, diff --git a/src/operators/math/activation.h b/src/operators/math/activation.h index 90b9ab4c3a558a994370ea80693e1d31687bb44e..fb90a35516d8c461a05328f65bce24a2b8aa519f 100644 --- a/src/operators/math/activation.h +++ b/src/operators/math/activation.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "common/types.h" #if defined(__ARM_NEON__) || defined(__ARM_NEON) #include -#include "operators/math/math_func_neon.h" +#include "operators/math/math.h" #endif namespace paddle_mobile { diff --git a/src/operators/math/conv_func.h b/src/operators/math/channel_wise.h similarity index 65% rename from src/operators/math/conv_func.h rename to src/operators/math/channel_wise.h index 4debd2e105856905b13bee55c455e9d263d26fe9..796ea6d2b97d31d3091b225601065ee4670316e8 100644 --- a/src/operators/math/conv_func.h +++ b/src/operators/math/channel_wise.h @@ -14,91 +14,16 @@ limitations under the License. */ #pragma once -#include +#include "framework/tensor.h" +#include "operators/math/activation.h" #ifdef __ARM_NEON #include #endif -#include "framework/ddim.h" -#include "framework/tensor.h" -#include "operators/math/activation.h" namespace paddle_mobile { namespace operators { namespace math { -using framework::DDim; -using framework::Tensor; - -inline int ConvOutputSize(int input_size, int filter_size, int dilation, - int padding, int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - return output_size; -} - -inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) { // NOLINT - const auto bias_ptr = bias.data(); - const DDim bias_ddim = bias.dims(); - PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1, - "the bias tensor's dims size != 1") - DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1); - DDim inner_ddim = - paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size()); - int outer_size = paddle_mobile::framework::product(outer_ddim); - int inner_size = paddle_mobile::framework::product(inner_ddim); - bias.Resize(dDim); - auto new_ptr = bias.mutable_data(); - int axis_size = dDim[axis]; - -#ifdef __ARM_NEON - for (int i = 0; i < outer_size; ++i) { - int inner_num = inner_size >> 4; - int remain = inner_size - (inner_num << 4); - float v_bias = bias_ptr[i * axis_size / outer_size]; - for (; inner_num > 0; inner_num--) { - float32x4_t v_newptr1 = vdupq_n_f32(v_bias); - float32x4_t v_newptr2 = vdupq_n_f32(v_bias); - float32x4_t v_newptr3 = vdupq_n_f32(v_bias); - float32x4_t v_newptr4 = vdupq_n_f32(v_bias); - vst1q_f32(new_ptr, v_newptr1); - new_ptr += 4; - vst1q_f32(new_ptr, v_newptr2); - new_ptr += 4; - vst1q_f32(new_ptr, v_newptr3); - new_ptr += 4; - vst1q_f32(new_ptr, v_newptr4); - new_ptr += 4; - } - for (; remain > 0; remain--) { - *new_ptr = v_bias; - new_ptr++; - } - } -#else - for (int i = 0; i < outer_size; ++i) { - float v_bias = bias_ptr[i * axis_size / outer_size]; - for (int j = 0; j < inner_size; ++j) { - new_ptr[i * inner_size + j] = v_bias; - } - } -#endif -} - -inline bool IsExpand(const std::vector &filter_dim, - const std::vector &strides, - const std::vector &paddings, - const std::vector &dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - template void AddChannelWise(const framework::Tensor *input, const framework::Tensor *bias, framework::Tensor *output) { diff --git a/src/operators/math/depthwise_conv3x3.h b/src/operators/math/depthwise_conv3x3.h index 9b9c5c0a6d13dece6b3d8462de6a0c2630688cdf..1f145c4f94bf2061fb9db74aec84684387809854 100644 --- a/src/operators/math/depthwise_conv3x3.h +++ b/src/operators/math/depthwise_conv3x3.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include "framework/tensor.h" -#include "operators/math/conv_func.h" namespace paddle_mobile { namespace operators { diff --git a/src/operators/math/depthwise_conv3x3_int8.cpp b/src/operators/math/depthwise_conv3x3_int8.cpp index 91e682c14590a10fc393aaefb5d37c015065fc0a..76262c76fb4e6687340985c75d08b688dded1cff 100644 --- a/src/operators/math/depthwise_conv3x3_int8.cpp +++ b/src/operators/math/depthwise_conv3x3_int8.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(__ARM_NEON__) && !defined(__aarch64__) +#if defined(__ARM_NEON__) || defined(__ARM_NEON) #include #include "operators/math/depthwise_conv3x3.h" @@ -70,7 +70,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter, DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start) // middle int remain_start = valid_w_start; -#ifdef __ARM_NEON__ int output_tiles = (valid_w_end - valid_w_start) / 6; remain_start = valid_w_start + output_tiles * 6; int32x4_t _sum0, _sum1; @@ -94,7 +93,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter, vst1q_s32(output_ptr + output_offset, _sum0); vst1_s32(output_ptr + output_offset + 4, vget_low_s32(_sum1)); } -#endif // __ARM_NEON__ for (int w = remain_start; w < valid_w_end; ++w) { int32_t value = 0; int input_start = -padding_w + w * Stride_w; @@ -215,6 +213,8 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, output_ptr2 += valid_w_start; output_ptr3 += valid_w_start; } +#if __aarch64__ +#else // valid int loop = output_w_tiles; asm volatile( @@ -525,6 +525,7 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); +#endif // __aarch64__ // pad right if (padding_w) { int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2))); @@ -618,7 +619,9 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, output_ptr0 += valid_w_start; output_ptr1 += valid_w_start; } - // valid + // valid +#if __aarch64__ +#else int loop = output_w_tiles; asm volatile( "cmp %[loop], #0 \n" @@ -804,6 +807,7 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); +#endif // __aarch64__ // pad right if (padding_w) { int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2))); @@ -869,7 +873,9 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, } output_ptr0 += valid_w_start; } - // valid + // valid +#if __aarch64__ +#else int loop = output_w_tiles; asm volatile( "cmp %[loop], #0 \n" @@ -993,6 +999,7 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); +#endif // __aarch64__ // pad right if (padding_w) { int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2))); @@ -1152,7 +1159,9 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, output_ptr1 += valid_w_start; output_ptr2 += valid_w_start; } - // valid + // valid +#if __aarch64__ +#else int loop = output_w_tiles; asm volatile( "cmp %[loop], #0 \n" @@ -1411,6 +1420,7 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); +#endif // __aarch64__ // pad right if (padding_w > 0) { int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); @@ -1490,7 +1500,9 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, input_ptr2 += valid_input_w_start; output_ptr0 += valid_w_start; } - // valid + // valid +#if __aarch64__ +#else int loop = output_w_tiles; asm volatile( "cmp %[loop], #0 \n" @@ -1608,6 +1620,7 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1) : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); +#endif // __aarch64__ // pad right if (padding_w > 0) { int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0))); @@ -1645,4 +1658,4 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, } // namespace operators } // namespace paddle_mobile -#endif +#endif // __ARM_NEON__ diff --git a/src/operators/math/depthwise_conv3x3_int8_arm64.cpp b/src/operators/math/depthwise_conv3x3_int8_arm64.cpp deleted file mode 100644 index e2c01838442b01dee10cd8d85126429277d8c672..0000000000000000000000000000000000000000 --- a/src/operators/math/depthwise_conv3x3_int8_arm64.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#if defined(__ARM_NEON__) && defined(__aarch64__) - -#include "operators/math/depthwise_conv3x3.h" -#ifdef __ARM_NEON__ -#include -#endif - -namespace paddle_mobile { -namespace operators { -namespace math { - -// template<> -// void DepthwiseConv3x3( -// const framework::Tensor *input, const framework::Tensor *filter, -// const std::vector &strides, framework::Tensor *output) { -// PADDLE_MOBILE_THROW_EXCEPTION( -// "Depthwise conv with generic strides has not been implemented."); -// } - -template <> -void DepthwiseConv3x3S1(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) { - PADDLE_MOBILE_THROW_EXCEPTION( - "Depthwise conv3x3 with stride 1 for arm v8 has not been implemented."); -} - -template <> -void DepthwiseConv3x3S2(const framework::Tensor &input, - const framework::Tensor &filter, - const std::vector &paddings, - framework::Tensor *output) { - PADDLE_MOBILE_THROW_EXCEPTION( - "Depthwise conv3x3 with stride 2 for arm v8 has not been implemented."); -} - -} // namespace math -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/math/depthwise_conv5x5.h b/src/operators/math/depthwise_conv5x5.h index d047bbfa1ac179e0ef0b1b6705e349890b25e800..11d96b078ac7314ef0f3de98614c1e4ebd4dbc95 100644 --- a/src/operators/math/depthwise_conv5x5.h +++ b/src/operators/math/depthwise_conv5x5.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include "framework/tensor.h" -#include "operators/math/conv_func.h" namespace paddle_mobile { namespace operators { diff --git a/src/operators/math/gemm/pack_kernel.h b/src/operators/math/gemm/pack_kernel.h index 31342ec1b7d504869c16a6a2a9a0f1491be4a3c3..598bf3248d2cb56c3324efa6858b3d045e7a2a3c 100644 --- a/src/operators/math/gemm/pack_kernel.h +++ b/src/operators/math/gemm/pack_kernel.h @@ -31,345 +31,239 @@ inline float32x4_t vandq_f32_u32(float32x4_t x, uint32x4_t mask) { void pack_lhs_6r(const int m, const int k, const float *A, const int lda, float *output, const bool unroll) { - float *zero = new float[k]; - memset(zero, 0, k * sizeof(float)); + uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5}; + int remain_k = k & 0x3; + uint32x4_t vzero = vdupq_n_u32(0); + uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k)); - const int m_tail = m % 6; - const int i_length = m - m_tail; - for (int i = 0; i < i_length; i += 6) { + #pragma omp parallel for if (unroll) + for (int i = 0; i < m - 5; i += 6) { const float *a0 = A + i * lda; const float *a1 = A + (i + 1) * lda; const float *a2 = A + (i + 2) * lda; const float *a3 = A + (i + 3) * lda; const float *a4 = A + (i + 4) * lda; const float *a5 = A + (i + 5) * lda; - float *local_buffer = output + i * k; - for (int j = 0; j < k; ++j) { - *local_buffer++ = *a0++; - *local_buffer++ = *a1++; - *local_buffer++ = *a2++; - *local_buffer++ = *a3++; - *local_buffer++ = *a4++; - *local_buffer++ = *a5++; + float *out_ptr = output + i * k; + + int loops = k >> 2; + if (loops > 0) { +#if __aarch64__ + for (int l = 0; l < loops; ++l) { + float32x4_t _d0 = vld1q_f32(a0); + float32x4_t _d1 = vld1q_f32(a1); + float32x4_t _d2 = vld1q_f32(a2); + float32x4_t _d3 = vld1q_f32(a3); + float32x4_t _d4 = vld1q_f32(a4); + float32x4_t _d5 = vld1q_f32(a5); + + float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); + float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); + float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); + _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); + _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); + _d2 = + vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); + _d3 = + vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1])); + + vst1q_f32(out_ptr, _d0); + vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); + vst1q_f32(out_ptr + 6, _d1); + vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); + vst1q_f32(out_ptr + 12, _d2); + vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); + vst1q_f32(out_ptr + 18, _d3); + vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1])); + + a0 += 4; + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + a5 += 4; + out_ptr += 24; + } +#else + asm volatile( + "loop_4k_%=: \n" + "vld1.32 {d0-d1}, [%[a0]]! \n" + "vld1.32 {d2-d3}, [%[a1]]! \n" + "vld1.32 {d4-d5}, [%[a2]]! \n" + "vld1.32 {d6-d7}, [%[a3]]! \n" + "vld1.32 {d8-d9}, [%[a4]]! \n" + "vld1.32 {d10-d11}, [%[a5]]! \n" + "vtrn.32 q0, q1 \n" + "vtrn.32 q2, q3 \n" + "vtrn.32 q4, q5 \n" + "vswp.32 d1, d4 \n" + "vswp.32 d3, d6 \n" + + "vst1.32 {q0}, [%[out]]! \n" + "vst1.32 {d8}, [%[out]]! \n" + "vst1.32 {q1}, [%[out]]! \n" + "vst1.32 {d10}, [%[out]]! \n" + "vst1.32 {q2}, [%[out]]! \n" + "vst1.32 {d9}, [%[out]]! \n" + "vst1.32 {q3}, [%[out]]! \n" + "vst1.32 {d11}, [%[out]]! \n" + + "subs %[loops], #1 \n" + "bne loop_4k_%= \n" + : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2), + [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); +#endif + } + + if (remain_k > 0) { + float32x4_t _d0 = vld1q_f32(a0); + float32x4_t _d1 = vld1q_f32(a1); + float32x4_t _d2 = vld1q_f32(a2); + float32x4_t _d3 = vld1q_f32(a3); + float32x4_t _d4 = vld1q_f32(a4); + float32x4_t _d5 = vld1q_f32(a5); + + _d0 = vandq_f32_u32(_d0, vmask1); + _d1 = vandq_f32_u32(_d1, vmask1); + _d2 = vandq_f32_u32(_d2, vmask1); + _d3 = vandq_f32_u32(_d3, vmask1); + _d4 = vandq_f32_u32(_d4, vmask1); + _d5 = vandq_f32_u32(_d5, vmask1); + + float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); + float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); + float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); + _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); + _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); + _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); + + switch (remain_k) { + case 3: + vst1q_f32(out_ptr + 12, _d2); + vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); + case 2: + vst1q_f32(out_ptr + 6, _d1); + vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); + case 1: + vst1q_f32(out_ptr, _d0); + vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); + default: + break; + } } } - if (m_tail != 0) { - const float *a0 = A + i_length * lda; + + int remain_m = m % 6; + if (remain_m) { + int remain_m_start = m - remain_m; + const float *a0 = A + remain_m_start * lda; const float *a1 = a0 + lda; const float *a2 = a0 + 2 * lda; const float *a3 = a0 + 3 * lda; const float *a4 = a0 + 4 * lda; const float *a5 = a0 + 5 * lda; - float *local_buffer = output + i_length * k; - switch (m_tail) { - case 1: - a1 = zero; - case 2: - a2 = zero; - case 3: - a3 = zero; - case 4: - a4 = zero; - case 5: - a5 = zero; - break; - default: - break; + float *out_ptr = output + remain_m_start * k; + + uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m)); + uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(remain_m)); + const float zerobuff[4] = {0.f, 0.f, 0.f, 0.f}; + + int lk = 0; + for (; lk < k - 3; lk += 4) { + switch (remain_m) { + case 1: + a1 = zerobuff; + case 2: + a2 = zerobuff; + case 3: + a3 = zerobuff; + case 4: + a4 = zerobuff; + case 5: + a5 = zerobuff; + default: + break; + } +#if __aarch64__ + float32x4_t _d0 = vld1q_f32(a0); + float32x4_t _d1 = vld1q_f32(a1); + float32x4_t _d2 = vld1q_f32(a2); + float32x4_t _d3 = vld1q_f32(a3); + float32x4_t _d4 = vld1q_f32(a4); + float32x4_t _d5 = vld1q_f32(a5); + + float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); + float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); + float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); + _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0])); + _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); + _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); + _d3 = vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1])); + + _d0 = vandq_f32_u32(_d0, vmask2); + _d1 = vandq_f32_u32(_d1, vmask2); + _d2 = vandq_f32_u32(_d2, vmask2); + _d3 = vandq_f32_u32(_d3, vmask2); + _d4 = vandq_f32_u32(_q3.val[0], vmask3); + _d5 = vandq_f32_u32(_q3.val[1], vmask3); + + vst1q_f32(out_ptr, _d0); + vst1_f32(out_ptr + 4, vget_low_f32(_d4)); + vst1q_f32(out_ptr + 6, _d1); + vst1_f32(out_ptr + 10, vget_low_f32(_d5)); + vst1q_f32(out_ptr + 12, _d2); + vst1_f32(out_ptr + 16, vget_high_f32(_d4)); + vst1q_f32(out_ptr + 18, _d3); + vst1_f32(out_ptr + 22, vget_high_f32(_d5)); + + out_ptr += 24; +#else + asm volatile( + "vld1.32 {d0-d1}, [%[a0]] \n" + "vld1.32 {d2-d3}, [%[a1]] \n" + "vld1.32 {d4-d5}, [%[a2]] \n" + "vld1.32 {d6-d7}, [%[a3]] \n" + "vld1.32 {d8-d9}, [%[a4]] \n" + "vld1.32 {d10-d11}, [%[a5]] \n" + "vtrn.32 q0, q1 \n" + "vtrn.32 q2, q3 \n" + "vtrn.32 q4, q5 \n" + "vswp.32 d1, d4 \n" + "vswp.32 d3, d6 \n" + + "vbif q0, %q[vzero], %q[vmask2] \n" + "vbif q1, %q[vzero], %q[vmask2] \n" + "vbif q2, %q[vzero], %q[vmask2] \n" + "vbif q3, %q[vzero], %q[vmask2] \n" + "vbif q4, %q[vzero], %q[vmask3] \n" + "vbif q5, %q[vzero], %q[vmask3] \n" + + "vst1.32 {q0}, [%[out]]! \n" + "vst1.32 {d8}, [%[out]]! \n" + "vst1.32 {q1}, [%[out]]! \n" + "vst1.32 {d10}, [%[out]]! \n" + "vst1.32 {q2}, [%[out]]! \n" + "vst1.32 {d9}, [%[out]]! \n" + "vst1.32 {q3}, [%[out]]! \n" + "vst1.32 {d11}, [%[out]]! \n" + : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2), + [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5) + : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); +#endif } - for (int j = 0; j < k; ++j) { - *local_buffer++ = *a0++; - *local_buffer++ = *a1++; - *local_buffer++ = *a2++; - *local_buffer++ = *a3++; - *local_buffer++ = *a4++; - *local_buffer++ = *a5++; + // remain k + for (; lk < k; ++lk) { + *out_ptr++ = *a0++; + *out_ptr++ = *a1++; + *out_ptr++ = *a2++; + *out_ptr++ = *a3++; + *out_ptr++ = *a4++; + *out_ptr++ = *a5++; } - delete[] zero; } - - // uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5}; - // int remain_k = k & 0x3; - // uint32x4_t vzero = vdupq_n_u32(0); - // uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k)); - // - // std::cout << "m: " << m << ", k: " << k << std::endl; - // #pragma omp parallel for if (unroll) - // for (int i = 0; i < m - 5; i += 6) { - // std::cout << "i: " << i << std::endl; - // const float *a0 = A + i * lda; - // const float *a1 = A + (i + 1) * lda; - // const float *a2 = A + (i + 2) * lda; - // const float *a3 = A + (i + 3) * lda; - // const float *a4 = A + (i + 4) * lda; - // const float *a5 = A + (i + 5) * lda; - // float *out_ptr = output + i * k; - // - // int loops = k >> 2; - // if (loops > 0) { - // #if __aarch64__ - // for (int l = 0; l < loops; ++l) { - // float32x4_t _d0 = vld1q_f32(a0); - // float32x4_t _d1 = vld1q_f32(a1); - // float32x4_t _d2 = vld1q_f32(a2); - // float32x4_t _d3 = vld1q_f32(a3); - // float32x4_t _d4 = vld1q_f32(a4); - // float32x4_t _d5 = vld1q_f32(a5); - // - // float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - // float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - // float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - // _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), - // vget_low_f32(_q1.val[0])); _d1 = - // vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - // _d2 = - // vcombine_f32(vget_high_f32(_q0.val[0]), - // vget_high_f32(_q1.val[0])); - // _d3 = - // vcombine_f32(vget_high_f32(_q0.val[1]), - // vget_high_f32(_q1.val[1])); - // - // vst1q_f32(out_ptr, _d0); - // vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); - // vst1q_f32(out_ptr + 6, _d1); - // vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); - // vst1q_f32(out_ptr + 12, _d2); - // vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); - // vst1q_f32(out_ptr + 18, _d3); - // vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1])); - // - // a0 += 4; - // a1 += 4; - // a2 += 4; - // a3 += 4; - // a4 += 4; - // a5 += 4; - // out_ptr += 24; - // } - // #else - // asm volatile( - // "loop_4k_%=: \n" - // "vld1.32 {d0-d1}, [%[a0]]! \n" - // "vld1.32 {d2-d3}, [%[a1]]! \n" - // "vld1.32 {d4-d5}, [%[a2]]! \n" - // "vld1.32 {d6-d7}, [%[a3]]! \n" - // "vld1.32 {d8-d9}, [%[a4]]! \n" - // "vld1.32 {d10-d11}, [%[a5]]! \n" - // "vtrn.32 q0, q1 \n" - // "vtrn.32 q2, q3 \n" - // "vtrn.32 q4, q5 \n" - // "vswp.32 d1, d4 \n" - // "vswp.32 d3, d6 \n" - // - // "vst1.32 {q0}, [%[out]]! \n" - // "vst1.32 {d8}, [%[out]]! \n" - // "vst1.32 {q1}, [%[out]]! \n" - // "vst1.32 {d10}, [%[out]]! \n" - // "vst1.32 {q2}, [%[out]]! \n" - // "vst1.32 {d9}, [%[out]]! \n" - // "vst1.32 {q3}, [%[out]]! \n" - // "vst1.32 {d11}, [%[out]]! \n" - // - // "subs %[loops], #1 \n" - // "bne loop_4k_%= \n" - // : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] - // "+r"(a2), - // [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) - // : - // : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); - // #endif - // } - // - // if (remain_k > 0) { - // float32x4_t _d0 = vld1q_f32(a0); - // float32x4_t _d1 = vld1q_f32(a1); - // float32x4_t _d2 = vld1q_f32(a2); - // float32x4_t _d3 = vld1q_f32(a3); - // float32x4_t _d4 = vld1q_f32(a4); - // float32x4_t _d5 = vld1q_f32(a5); - // - // _d0 = vandq_f32_u32(_d0, vmask1); - // _d1 = vandq_f32_u32(_d1, vmask1); - // _d2 = vandq_f32_u32(_d2, vmask1); - // _d3 = vandq_f32_u32(_d3, vmask1); - // _d4 = vandq_f32_u32(_d4, vmask1); - // _d5 = vandq_f32_u32(_d5, vmask1); - // - // float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - // float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - // float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - // _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), - // vget_low_f32(_q1.val[0])); _d1 = - // vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2 - // = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - // - // switch (remain_k) { - // case 3: - // vst1q_f32(out_ptr + 12, _d2); - // vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0])); - // case 2: - // vst1q_f32(out_ptr + 6, _d1); - // vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1])); - // case 1: - // vst1q_f32(out_ptr, _d0); - // vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0])); - // default: - // break; - // } - // } - // } - // - // int remain_m = m % 6; - // if (remain_m) { - // int remain_m_start = m - remain_m; - // std::cout << "remain_m_start: " << remain_m_start << std::endl; - // const float *a0 = A + remain_m_start * lda; - // const float *a1 = a0 + lda; - // const float *a2 = a0 + 2 * lda; - // const float *a3 = a0 + 3 * lda; - // const float *a4 = a0 + 4 * lda; - // const float *a5 = a0 + 5 * lda; - // float *out_ptr = output + remain_m_start * k; - // - // uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m)); - // uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4), - // vdupq_n_u32(remain_m)); - // - // int loops = k >> 2; - // if (loops > 0) { - // #if __aarch64__ - // for (int l = 0; l < loops; ++l) { - // float32x4_t _d0 = vld1q_f32(a0); - // float32x4_t _d1 = vld1q_f32(a1); - // float32x4_t _d2 = vld1q_f32(a2); - // float32x4_t _d3 = vld1q_f32(a3); - // float32x4_t _d4 = vld1q_f32(a4); - // float32x4_t _d5 = vld1q_f32(a5); - // - // float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - // float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - // float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - // _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), - // vget_low_f32(_q1.val[0])); _d1 = - // vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); - // _d2 = - // vcombine_f32(vget_high_f32(_q0.val[0]), - // vget_high_f32(_q1.val[0])); - // _d3 = - // vcombine_f32(vget_high_f32(_q0.val[1]), - // vget_high_f32(_q1.val[1])); - // - // _d0 = vandq_f32_u32(_d0, vmask2); - // _d1 = vandq_f32_u32(_d1, vmask2); - // _d2 = vandq_f32_u32(_d2, vmask2); - // _d3 = vandq_f32_u32(_d3, vmask2); - // _d4 = vandq_f32_u32(_q3.val[0], vmask3); - // _d5 = vandq_f32_u32(_q3.val[1], vmask3); - // - // vst1q_f32(out_ptr, _d0); - // vst1_f32(out_ptr + 4, vget_low_f32(_d4)); - // vst1q_f32(out_ptr + 6, _d1); - // vst1_f32(out_ptr + 10, vget_low_f32(_d5)); - // vst1q_f32(out_ptr + 12, _d2); - // vst1_f32(out_ptr + 16, vget_high_f32(_d4)); - // vst1q_f32(out_ptr + 18, _d3); - // vst1_f32(out_ptr + 22, vget_high_f32(_d5)); - // - // a0 += 4; - // a1 += 4; - // a2 += 4; - // a3 += 4; - // a4 += 4; - // a5 += 4; - // out_ptr += 24; - // } - // #else - // asm volatile( - // "loop_4k_%=: \n" - // "vld1.32 {d0-d1}, [%[a0]]! \n" - // "vld1.32 {d2-d3}, [%[a1]]! \n" - // "vld1.32 {d4-d5}, [%[a2]]! \n" - // "vld1.32 {d6-d7}, [%[a3]]! \n" - // "vld1.32 {d8-d9}, [%[a4]]! \n" - // "vld1.32 {d10-d11}, [%[a5]]! \n" - // "vtrn.32 q0, q1 \n" - // "vtrn.32 q2, q3 \n" - // "vtrn.32 q4, q5 \n" - // "vswp.32 d1, d4 \n" - // "vswp.32 d3, d6 \n" - // - // "vbif q0, %q[vzero], %q[vmask2] \n" - // "vbif q1, %q[vzero], %q[vmask2] \n" - // "vbif q2, %q[vzero], %q[vmask2] \n" - // "vbif q3, %q[vzero], %q[vmask2] \n" - // "vbif q4, %q[vzero], %q[vmask3] \n" - // "vbif q5, %q[vzero], %q[vmask3] \n" - // - // "vst1.32 {q0}, [%[out]]! \n" - // "vst1.32 {d8}, [%[out]]! \n" - // "vst1.32 {q1}, [%[out]]! \n" - // "vst1.32 {d10}, [%[out]]! \n" - // "vst1.32 {q2}, [%[out]]! \n" - // "vst1.32 {d9}, [%[out]]! \n" - // "vst1.32 {q3}, [%[out]]! \n" - // "vst1.32 {d11}, [%[out]]! \n" - // - // "subs %[loops], #1 \n" - // "bne loop_4k_%= \n" - // : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] - // "+r"(a2), - // [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops) - // : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero) - // : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5"); - // #endif - // } - // - // if (remain_k > 0) { - // float32x4_t _d0 = vld1q_f32(a0); - // float32x4_t _d1 = vld1q_f32(a1); - // float32x4_t _d2 = vld1q_f32(a2); - // float32x4_t _d3 = vld1q_f32(a3); - // float32x4_t _d4 = vld1q_f32(a4); - // float32x4_t _d5 = vld1q_f32(a5); - // - // _d0 = vandq_f32_u32(_d0, vmask1); - // _d1 = vandq_f32_u32(_d1, vmask1); - // _d2 = vandq_f32_u32(_d2, vmask1); - // _d3 = vandq_f32_u32(_d3, vmask1); - // _d4 = vandq_f32_u32(_d4, vmask1); - // _d5 = vandq_f32_u32(_d5, vmask1); - // - // float32x4x2_t _q0 = vtrnq_f32(_d0, _d1); - // float32x4x2_t _q1 = vtrnq_f32(_d2, _d3); - // float32x4x2_t _q3 = vtrnq_f32(_d4, _d5); - // _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), - // vget_low_f32(_q1.val[0])); _d1 = - // vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2 - // = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0])); - // // _d3 = vcombine_f32(vget_high_f32(_q0.val[1]), - // // vget_high_f32(_q1.val[1])); - // - // _d0 = vandq_f32_u32(_d0, vmask2); - // _d1 = vandq_f32_u32(_d1, vmask2); - // _d2 = vandq_f32_u32(_d2, vmask2); - // // _d3 = vandq_f32_u32(_d3, vmask2); - // _d4 = vandq_f32_u32(_q3.val[0], vmask3); - // _d5 = vandq_f32_u32(_q3.val[1], vmask3); - // - // switch (remain_k) { - // case 3: - // vst1q_f32(out_ptr + 12, _d2); - // vst1_f32(out_ptr + 16, vget_high_f32(_d4)); - // case 2: - // vst1q_f32(out_ptr + 6, _d1); - // vst1_f32(out_ptr + 10, vget_low_f32(_d5)); - // case 1: - // vst1q_f32(out_ptr, _d0); - // vst1_f32(out_ptr + 4, vget_low_f32(_d4)); - // default: - // break; - // } - // } - // } } #if __aarch64__ diff --git a/src/operators/math/math_func_neon.h b/src/operators/math/math.h similarity index 100% rename from src/operators/math/math_func_neon.h rename to src/operators/math/math.h diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp index 6b34f522ff6caf32c20971d9cf38f93730fdb727..e066b0cccddf9a43953182788508aca4769fcd27 100644 --- a/src/operators/math/softmax.cpp +++ b/src/operators/math/softmax.cpp @@ -19,7 +19,7 @@ limitations under the License. */ #include #include #include "common/types.h" -#include "operators/math/math_func_neon.h" +#include "operators/math/math.h" namespace paddle_mobile { namespace operators {