Commit 1d475a2c authored by hjchen2

update

Parent e0f97f83
......@@ -125,10 +125,6 @@ LOAD_OP1(prior_box, CPU);
LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_relu);
#endif
#ifdef FUSION_CONVADDADDPRELU_OP
LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu);
#endif
#ifdef FUSION_CONVADD_OP
LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
LOAD_FUSION_MATCHER(fusion_conv_add);
......@@ -178,10 +174,6 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn);
#ifdef DROPOUT_OP
LOAD_OP2(dropout, CPU, FPGA);
#endif
#ifdef FUSION_CONVADDPRELU_OP
LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_prelu);
#endif
#ifdef FUSION_DWCONVBNRELU_OP
LOAD_OP1(fusion_dwconv_bn_relu, CPU);
LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <vector>
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -39,9 +39,9 @@ void ConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
#include "operators/conv_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -40,9 +40,9 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#include "operators/fusion_conv_add_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu,
ops::FusionConvAddAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif
#endif // FUSION_CONVADDADDPRELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher {
public:
FusionConvAddAddPReluOpMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD,
{{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}},
{G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; }
std::vector<std::pair<int, std::string>> NeedCheck() {
DLOG << " conv add add prelu check add X ";
return {{2, "Y"}, {2, "X"}};
}
};
template <typename DeviceType, typename T>
class FusionConvAddAddPReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionConvAddAddPReluParam<DeviceType>,
operators::ConvAddAddPReluKernel<DeviceType, T>> {
public:
FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddAddPReluParam<DeviceType>,
operators::ConvAddAddPReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBN_OP
#include "operators/fusion_conv_add_bn_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/fusion_conv_add_bn_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/fusion_conv_add_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#include "operators/fusion_conv_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddPReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_prelu,
ops::FusionConvAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher {
public:
FusionConvAddPReluOpMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; }
};
template <typename DeviceType, typename T>
class FusionConvAddPReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionConvAddPReluParam<DeviceType>,
operators::ConvAddPReluKernel<DeviceType, T>> {
public:
FusionConvAddPReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddPReluParam<DeviceType>,
operators::ConvAddPReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDRELU_OP
#include "operators/fusion_conv_add_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#include "operators/fusion_conv_bn_add_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBN_OP
#include "operators/fusion_conv_bn_op.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -35,9 +36,9 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#include "operators/fusion_dwconv_bn_relu_op.h"
#include "operators/math/conv_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -36,9 +36,9 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddAddPReluKernel<CPU, float>::Init(
FusionConvAddAddPReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddAddPReluKernel<CPU, float>::Compute(
const FusionConvAddAddPReluParam<CPU> &param) {
ConvAddAddPReluCompute<float>(param);
}
template class ConvAddAddPReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <cmath>
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace paddle_mobile {
namespace operators {
......@@ -62,34 +63,24 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
const FusionConvAddBNReluParam<CPU> &param) {
switch (param.ExecMode()) {
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
DepthwiseConv3x3<float, float>(param);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
DepthwiseConv5x5<float, float>(param);
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param);
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
ConvBNReluBasic<FusionConvAddBNReluParam<CPU>>(param);
GemmConv<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
}
template class ConvAddBNReluKernel<CPU, float>;
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include "operators/kernel/conv_add_kernel.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_add_arm_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace paddle_mobile {
namespace operators {
......@@ -32,34 +32,25 @@ template <>
void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
switch (param.ExecMode()) {
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
param.Output());
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
param.Output());
break;
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
DepthwiseConv5x5<float, float>(param);
math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
param.Output());
break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param);
math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
param.Output());
break;
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
ConvAddBasic(param);
GemmConv<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(), param.Output());
}
template class ConvAddKernel<CPU, float>;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddPReluKernel<CPU, float>::Init(FusionConvAddPReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddPReluKernel<CPU, float>::Compute(
const FusionConvAddPReluParam<CPU> &param) {
ConvAddPReluCompute<float>(param);
}
template class ConvAddPReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace paddle_mobile {
namespace operators {
......@@ -32,30 +33,23 @@ void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam<CPU> &param) {
switch (param.ExecMode()) {
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
DepthwiseConv3x3<float, float>(param);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
DepthwiseConv5x5<float, float>(param);
math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param);
math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
break;
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
ConvAddReluBasic<FusionConvAddReluParam<CPU>>(param);
GemmConv<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
}
template class ConvAddReluKernel<CPU, float>;
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <cmath>
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace paddle_mobile {
namespace operators {
......@@ -62,34 +63,24 @@ void ConvBNAddReluKernel<CPU, float>::Compute(
const FusionConvBNAddReluParam<CPU> &param) {
switch (param.ExecMode()) {
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
DepthwiseConv3x3<float, float>(param);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
DepthwiseConv5x5<float, float>(param);
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param);
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
ConvBNReluBasic<FusionConvBNAddReluParam<CPU>>(param);
GemmConv<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
}
template class ConvBNAddReluKernel<CPU, float>;
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <cmath>
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace paddle_mobile {
namespace operators {
......@@ -61,34 +62,24 @@ void ConvBNReluKernel<CPU, float>::Compute(
const FusionConvBNReluParam<CPU> &param) {
switch (param.ExecMode()) {
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
DepthwiseConv3x3<float, float>(param);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
DepthwiseConv5x5<float, float>(param);
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param);
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
ConvBNReluBasic<FusionConvBNReluParam<CPU>>(param);
GemmConv<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
}
template class ConvBNReluKernel<CPU, float>;
......
......@@ -32,10 +32,10 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
template <>
void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
switch (param.ExecMode()) {
#ifndef __aarch64__
case ConvParam<CPU>::EXEC_GEMM_INT8:
GemmConv<int8_t, int32_t>(param);
break;
#ifndef __aarch64__
case ConvParam<CPU>::EXEC_DEPTHWISE3x3_INT8:
DepthwiseConv3x3<int8_t, int32_t>(param);
break;
......@@ -44,12 +44,8 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
break;
#endif // __aarch64__
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
DepthwiseConv3x3<float, float>(param);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
DepthwiseConv5x5<float, float>(param);
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <cmath>
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace paddle_mobile {
namespace operators {
......@@ -61,37 +62,28 @@ void DWConvBNReluKernel<CPU, float>::Compute(
const FusionDWConvBNReluParam<CPU> &param) {
switch (param.ExecMode()) {
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
param.Paddings(), param.Output());
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
DepthwiseConv3x3<float, float>(param);
break;
#ifndef __aarch64__
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
DepthwiseConv5x5<float, float>(param);
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param);
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
break;
#endif // __aarch64__
case ConvParam<CPU>::EXEC_GEMM_FLOAT:
ConvBNReluBasic<FusionDWConvBNReluParam<CPU>>(param);
GemmConv<float, float>(param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
param.NewBias(), param.Output());
}
template class DWConvBNReluKernel<CPU, float>;
} // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
Tensor bias1 = *param.Bias1();
Tensor *output = param.Output();
output->mutable_data<float>();
float *biase_data = bias.data<float>();
int axis = param.Axis();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
Tensor aa = *param.InputAlpha();
float *p = aa.data<float>();
std::string mode = param.Mode();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
Tensor bias1_batch = bias1.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step);
float *biase_data1 = bias1_slice.data<float>();
math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
p, mode, biase_data, biase_data1);
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDADDPRELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
Tensor *output = param.Output();
output->mutable_data<float>();
float *biase_data = bias.data<float>();
int axis = param.Axis();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
Tensor aa = *param.InputAlpha();
float *p = aa.data<float>();
std::string mode = param.Mode();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
p, mode, biase_data, nullptr);
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDPRELU_OP
......@@ -12,38 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#pragma once
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv5x5.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h"
#include "operators/math/winograd/winograd_transform.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
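// Worked example of the formula above (numbers are illustrative, not taken
// from this commit): for input_size = 224, filter_size = 3, dilation = 1,
// padding = 1, stride = 2 the dilated kernel is
//   dkernel = 1 * (3 - 1) + 1 = 3
// and the per-dimension output size is
//   output_size = (224 + 2 * 1 - 3) / 2 + 1 = 112,
// which is exactly what the InferShape() methods push into output_shape
// now that they call this helper.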
bool IsExpand(const std::vector<int64_t> &filter_dim,
const std::vector<int> &strides, const std::vector<int> &paddings,
const std::vector<int> &dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
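// IsExpand() is the im2col fast-path test: only when every filter spatial
// dimension is 1, every stride is 1, every padding is 0 and every dilation
// is 1 can the input slice be handed to the GEMM directly (the
// `if (!is_expand)` branch in GemmConv below); any other configuration
// requires the explicit im2col/vol2col expansion.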
template <typename Itype, typename Otype>
void GemmConv(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
Tensor *output = param.Output();
output->mutable_data<float>();
float *biase_data = bias.data<float>();
output->mutable_data<Otype>();
int axis = param.Axis();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
const std::vector<int> strides = param.Strides();
const std::vector<int> paddings = param.Paddings();
const std::vector<int> dilations = param.Dilations();
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
......@@ -57,12 +73,11 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col.mutable_data<Itype>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
......@@ -81,9 +96,10 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
math::Vol2ColFunctor<CPU, Itype> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
const int batch_size = static_cast<int>(input->dims()[0]);
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
......@@ -92,8 +108,8 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
// col_matrix.ShareDataWith(in_slice);
col_matrix = in_slice;
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
......@@ -105,17 +121,122 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::MatMul<float, float>(filter_slice, false, col_matrix, false,
math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), false, biase_data);
static_cast<float>(0), false,
static_cast<Otype *>(nullptr));
}
}
}
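// Shape bookkeeping in GemmConv, spelled out with illustrative numbers
// (assumed for the sketch, not from this commit): with in_channels = 64,
// out_channels = 128, groups = 1, a 3x3 filter and a 56x56 output, the
// filter is reshaped to a 128 x 576 matrix (576 = 64 * 3 * 3), im2col
// produces a 576 x 3136 col_matrix (3136 = 56 * 56), and each per-group
// MatMul writes a 128 x 3136 out_slice, i.e. one output channel per row.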
} // namespace operators
} // namespace paddle_mobile
template <int tile, int kernel>
void WinogradConv3x3(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
const Tensor *filter = param.transformed_filter_;
Tensor *output = param.Output();
output->mutable_data<float>();
int batch_size = input->dims()[0];
int groups = param.Groups();
const std::vector<int> &paddings = param.Paddings();
auto winograd_pad = [&](int width, int pad) {
int output_tile = tile - kernel + 1;
// int tiles = (width + pad - kernel) / output_tile + 1;
// return (tiles - 1) * output_tile + tile - width;
int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile;
return pad_width + tile - width;
};
math::PadFunctor<CPU, float> pad;
Tensor input_pad;
framework::Tensor transformed_input;
for (int i = 0; i < batch_size; ++i) {
Tensor in_batch = input->Slice(i, i + 1);
Tensor out_batch = output->Slice(i, i + 1);
// int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
// int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
int pad_bottom = paddings[0];
int pad_right = paddings[1];
if (paddings[0] || paddings[1] || pad_bottom || pad_right) {
framework::DDim pad_shape = in_batch.dims();
pad_shape[2] += paddings[0] + pad_bottom;
pad_shape[3] += paddings[1] + pad_right;
input_pad.mutable_data<float>(pad_shape);
pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right,
&input_pad);
} else {
input_pad = in_batch;
}
// tile input and transform
math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
// calculate output
math::winograd_transform_output<tile, kernel>(transformed_input, *filter,
output);
}
}
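// With the <tile = 8, kernel = 3> instantiation used by the float kernels
// (WinogradConv3x3<8, 3>), output_tile = 8 - 3 + 1 = 6, i.e. the
// F(6x6, 3x3) Winograd variant: each 8x8 tile of the padded input is
// transformed once and yields a 6x6 block of the output feature map.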
template <typename Itype, typename Otype>
void DepthwiseConv3x3(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
const Tensor *filter = param.Filter();
const std::vector<int> &paddings = param.Paddings();
const std::vector<int> &strides = param.Strides();
const int batch_size = input->dims()[0];
Tensor *output = param.Output();
output->mutable_data<Otype>();
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1);
Tensor out_batch = output->Slice(i, i + 1);
if (strides[0] == 1) {
math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
&out_batch);
} else if (strides[0] == 2) {
math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
&out_batch);
} else {
GemmConv<Itype, Otype>(param);
}
}
}
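// Dispatch note: stride 1 and stride 2 hit the hand-written
// DepthwiseConv3x3S1 / DepthwiseConv3x3S2 kernels per batch; any other
// stride falls back to the generic GemmConv<Itype, Otype> path.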
template <typename Itype, typename Otype>
void DepthwiseConv5x5(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
const Tensor *filter = param.Filter();
const std::vector<int> &paddings = param.Paddings();
const std::vector<int> &strides = param.Strides();
const int batch_size = input->dims()[0];
Tensor *output = param.Output();
output->mutable_data<Otype>();
// if (strides[0] == 1) {
// for (int i = 0; i < batch_size; i++) {
// Tensor in_batch = input->Slice(i, i + 1);
// Tensor out_batch = output->Slice(i, i + 1);
// math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
// &out_batch);
// }
// } else {
GemmConv<Itype, Otype>(param);
// }
}
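// Note: the dedicated 5x5 stride-1 path is left commented out above, so at
// this point DepthwiseConv5x5 always falls back to the generic
// GemmConv<Itype, Otype> implementation.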
template void GemmConv<float, float>(const ConvParam<CPU> &param);
template void WinogradConv3x3<8, 3>(const ConvParam<CPU> &param);
template void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param);
template void DepthwiseConv5x5<float, float>(const ConvParam<CPU> &param);
#ifndef __aarch64__
template void GemmConv<int8_t, int32_t>(const ConvParam<CPU> &param);
template void DepthwiseConv3x3<int8_t, int32_t>(const ConvParam<CPU> &param);
template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param);
#endif
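// Rough dispatch sketch, mirroring the fused kernels updated in this
// commit (`param` is a ConvParam<CPU> whose ExecMode() has already been
// resolved; the exact case list varies per kernel):
//
//   switch (param.ExecMode()) {
//     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
//     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
//       DepthwiseConv3x3<float, float>(param); break;
//     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
//       DepthwiseConv5x5<float, float>(param); break;
//     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
//       WinogradConv3x3<8, 3>(param); break;
//     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
//       GemmConv<float, float>(param); break;
//     default:
//       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
//                                     param.ExecMode());
//   }
//
// The fused kernels then apply their bias/BN/activation epilogue
// (math::AddChannelWise / math::ScaleAddChannelWise) once, after the
// switch, as the kernel diffs above show.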
} // namespace operators
} // namespace paddle_mobile
......@@ -15,386 +15,31 @@ limitations under the License. */
#ifdef CONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv5x5.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h"
#include "operators/math/winograd/winograd_transform.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename Itype, typename Otype>
inline void GemmConv(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<Otype>();
int groups = param.Groups();
const std::vector<int> strides = param.Strides();
const std::vector<int> paddings = param.Paddings();
const std::vector<int> dilations = param.Dilations();
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<Itype>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, Itype> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
const int batch_size = static_cast<int>(input->dims()[0]);
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
// col_matrix.ShareDataWith(in_slice);
col_matrix = in_slice;
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
int stride);
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
bool IsExpand(const std::vector<int64_t> &filter_dim,
const std::vector<int> &strides, const std::vector<int> &paddings,
const std::vector<int> &dilations);
math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0), false,
static_cast<Otype *>(nullptr));
}
}
}
template <typename Itype, typename Otype>
void GemmConv(const ConvParam<CPU> &param);
template <int tile, int kernel>
inline void WinogradConv3x3(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
const Tensor *filter = param.transformed_filter_;
Tensor *output = param.Output();
output->mutable_data<float>();
int batch_size = input->dims()[0];
int groups = param.Groups();
const std::vector<int> &paddings = param.Paddings();
auto winograd_pad = [&](int width, int pad) {
int output_tile = tile - kernel + 1;
// int tiles = (width + pad - kernel) / output_tile + 1;
// return (tiles - 1) * output_tile + tile - width;
int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile;
return pad_width + tile - width;
};
void WinogradConv3x3(const ConvParam<CPU> &param);
math::PadFunctor<CPU, float> pad;
Tensor input_pad;
framework::Tensor transformed_input;
for (int i = 0; i < batch_size; ++i) {
Tensor in_batch = input->Slice(i, i + 1);
Tensor out_batch = output->Slice(i, i + 1);
// int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
// int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
int pad_bottom = paddings[0];
int pad_right = paddings[1];
if (paddings[0] || paddings[1] || pad_bottom || pad_right) {
framework::DDim pad_shape = in_batch.dims();
pad_shape[2] += paddings[0] + pad_bottom;
pad_shape[3] += paddings[1] + pad_right;
input_pad.mutable_data<float>(pad_shape);
pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right,
&input_pad);
} else {
input_pad = in_batch;
}
// tile input and transform
math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
// calculate output
math::winograd_transform_output<tile, kernel>(transformed_input, *filter,
output);
}
}
#ifndef __aarch64__
// int8 DepthwiseConv3x3
template <typename Itype, typename Otype>
inline void DepthwiseConv3x3(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
const Tensor *filter = param.Filter();
const std::vector<int> &paddings = param.Paddings();
const std::vector<int> &strides = param.Strides();
const int batch_size = input->dims()[0];
Tensor *output = param.Output();
output->mutable_data<Otype>();
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1);
Tensor out_batch = output->Slice(i, i + 1);
if (strides[0] == 1) {
math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
&out_batch);
} else if (strides[0] == 2) {
math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
&out_batch);
} else {
GemmConv<Itype, Otype>(param);
}
}
}
#endif // __aarch64__
void DepthwiseConv3x3(const ConvParam<CPU> &param);
template <typename Itype, typename Otype>
inline void DepthwiseConv5x5(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
const Tensor *filter = param.Filter();
const std::vector<int> &paddings = param.Paddings();
const std::vector<int> &strides = param.Strides();
const int batch_size = input->dims()[0];
Tensor *output = param.Output();
output->mutable_data<Otype>();
// if (strides[0] == 1) {
// for (int i = 0; i < batch_size; i++) {
// Tensor in_batch = input->Slice(i, i + 1);
// Tensor out_batch = output->Slice(i, i + 1);
// math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
// &out_batch);
// }
// } else {
GemmConv<Itype, Otype>(param);
// }
}
template <typename ParamType>
void ConvAddReluBasic(const ParamType &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
Tensor *output = param.Output();
output->mutable_data<float>();
float alpha = 1.0f;
float beta = 1.0f;
int32_t groups = param.Groups();
int32_t axis = param.Axis();
std::vector<int32_t> strides = param.Strides();
std::vector<int32_t> paddings = param.Paddings();
std::vector<int32_t> dilations = param.Dilations();
const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int32_t>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
float *bias_data = bias.data<float>();
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int32_t i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int32_t g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix = in_slice;
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::MatMul<float, float>(filter_slice, false, col_matrix, false, alpha,
&out_slice, beta, true, bias_data);
}
}
}
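// The "im2col (or vol2col) + gemm" bookkeeping above is easier to see with
// concrete numbers. The snippet below is a hypothetical, standalone
// illustration (plain C++, no paddle_mobile types) of how col_shape_vec and
// col_matrix_shape are derived for an assumed 3x3 convolution with 16 input
// channels, 32 output channels and a 112x112 output.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t groups = 1;
  const std::vector<int64_t> input_dims = {1, 16, 224, 224};   // [n, c, h, w]
  const std::vector<int64_t> filter_dims = {32, 16, 3, 3};     // [oc, c/g, kh, kw]
  const std::vector<int64_t> output_dims = {1, 32, 112, 112};  // [n, oc, oh, ow]

  const size_t data_dim = filter_dims.size() - 2;  // 2 for a 2-D convolution
  std::vector<int64_t> col_shape(1 + 2 * data_dim);
  col_shape[0] = input_dims[1] / groups;  // channels per group
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape[j + 1] = filter_dims[j + 2];             // kernel extent
    col_shape[j + 1 + data_dim] = output_dims[j + 2];  // output extent
  }
  // flatten_to_2d(col_shape, data_dim + 1): rows cover {c/g, kh, kw}, columns
  // cover {oh, ow}; this is the matrix the GEMM multiplies with the filter.
  int64_t rows = 1, cols = 1;
  for (size_t j = 0; j < 1 + data_dim; ++j) rows *= col_shape[j];
  for (size_t j = 1 + data_dim; j < col_shape.size(); ++j) cols *= col_shape[j];
  std::cout << "col matrix: " << rows << " x " << cols << "\n";  // 144 x 12544
  return 0;
}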
template <typename ParamType>
void ConvBNReluBasic(const ParamType &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix = in_slice;
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::MatMulWithBn(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
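// ConvBNReluBasic follows the same im2col/vol2col + gemm structure as
// ConvAddReluBasic above; the visible difference is the GEMM epilogue, where
// MatMulWithBn is given the folded batch-norm tensors new_scale / new_bias
// (indexed by the group g) instead of a plain bias vector.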
void DepthwiseConv5x5(const ConvParam<CPU> &param);
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
Tensor *bias1 = param.Bias();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
math::MatMulWithBn(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), true, &new_scale, &new_bias, g,
bias_data.data<float>());
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvAddAddPReluKernel
: public OpKernelBase<DeviceType, FusionConvAddAddPReluParam<DeviceType>> {
public:
void Compute(const FusionConvAddAddPReluParam<DeviceType> &param);
bool Init(FusionConvAddAddPReluParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
......
......@@ -23,7 +23,6 @@ limitations under the License. */
#include "common/common.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvAddPReluKernel
: public OpKernelBase<DeviceType, FusionConvAddPReluParam<DeviceType>> {
public:
void Compute(const FusionConvAddPReluParam<DeviceType> &param);
bool Init(FusionConvAddPReluParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
......
......@@ -15,24 +15,21 @@ limitations under the License. */
#pragma once
#ifdef LRN_OP
#include <cmath>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/operator.h"
#include "operators/op_param.h"
#include <cmath>
#ifdef __ARM_NEON
#include "arm_neon.h"
#include "operators/math/math_func_neon.h"
#include <arm_neon.h>
#include "operators/math/math.h"
#endif
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename T>
struct LRNFunctor {
void operator()(const framework::Tensor &input, framework::Tensor *out, int N,
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "operators/math/math_func_neon.h"
#include "operators/math/math.h"
#endif
namespace paddle_mobile {
......
......@@ -14,91 +14,16 @@ limitations under the License. */
#pragma once
#include <vector>
#include "framework/tensor.h"
#include "operators/math/activation.h"
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include "framework/ddim.h"
#include "framework/tensor.h"
#include "operators/math/activation.h"
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::DDim;
using framework::Tensor;
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
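// Worked example (assumed values, for illustration only): a 224x224 input with
// a 3x3 filter, padding 1, stride 2 and dilation 1 gives
//   dkernel = 1 * (3 - 1) + 1 = 3
//   output  = (224 + 2 * 1 - 3) / 2 + 1 = 112
// i.e. ConvOutputSize(224, 3, 1, 1, 2) == 112.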
inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) { // NOLINT
const auto bias_ptr = bias.data<float>();
const DDim bias_ddim = bias.dims();
PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
"the bias tensor's dims size != 1")
DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
DDim inner_ddim =
paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
int outer_size = paddle_mobile::framework::product(outer_ddim);
int inner_size = paddle_mobile::framework::product(inner_ddim);
bias.Resize(dDim);
auto new_ptr = bias.mutable_data<float>();
int axis_size = dDim[axis];
#ifdef __ARM_NEON
for (int i = 0; i < outer_size; ++i) {
int inner_num = inner_size >> 4;
int remain = inner_size - (inner_num << 4);
float v_bias = bias_ptr[i * axis_size / outer_size];
for (; inner_num > 0; inner_num--) {
float32x4_t v_newptr1 = vdupq_n_f32(v_bias);
float32x4_t v_newptr2 = vdupq_n_f32(v_bias);
float32x4_t v_newptr3 = vdupq_n_f32(v_bias);
float32x4_t v_newptr4 = vdupq_n_f32(v_bias);
vst1q_f32(new_ptr, v_newptr1);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr2);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr3);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr4);
new_ptr += 4;
}
for (; remain > 0; remain--) {
*new_ptr = v_bias;
new_ptr++;
}
}
#else
for (int i = 0; i < outer_size; ++i) {
float v_bias = bias_ptr[i * axis_size / outer_size];
for (int j = 0; j < inner_size; ++j) {
new_ptr[i * inner_size + j] = v_bias;
}
}
#endif
}
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
const std::vector<int> &strides,
const std::vector<int> &paddings,
const std::vector<int> &dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
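// Illustration: only a 1x1 filter with stride 1, padding 0 and dilation 1 lets
// the input be fed to the GEMM directly (IsExpand returns false); any 3x3
// filter, non-unit stride/dilation or non-zero padding makes IsExpand return
// true and routes the data through im2col/vol2col first.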
template <ActivationType Act>
void AddChannelWise(const framework::Tensor *input,
const framework::Tensor *bias, framework::Tensor *output) {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "framework/tensor.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) && !defined(__aarch64__)
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "operators/math/depthwise_conv3x3.h"
......@@ -70,7 +70,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start)
// middle
int remain_start = valid_w_start;
#ifdef __ARM_NEON__
int output_tiles = (valid_w_end - valid_w_start) / 6;
remain_start = valid_w_start + output_tiles * 6;
int32x4_t _sum0, _sum1;
......@@ -94,7 +93,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
vst1q_s32(output_ptr + output_offset, _sum0);
vst1_s32(output_ptr + output_offset + 4, vget_low_s32(_sum1));
}
#endif // __ARM_NEON__
for (int w = remain_start; w < valid_w_end; ++w) {
int32_t value = 0;
int input_start = -padding_w + w * Stride_w;
......@@ -215,6 +213,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
output_ptr2 += valid_w_start;
output_ptr3 += valid_w_start;
}
#if __aarch64__
#else
// valid
int loop = output_w_tiles;
asm volatile(
......@@ -525,6 +525,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
: [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
: "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w) {
int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
......@@ -618,7 +619,9 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
output_ptr0 += valid_w_start;
output_ptr1 += valid_w_start;
}
// valid
// valid
#if __aarch64__
#else
int loop = output_w_tiles;
asm volatile(
"cmp %[loop], #0 \n"
......@@ -804,6 +807,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
: [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
: "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w) {
int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
......@@ -869,7 +873,9 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
}
output_ptr0 += valid_w_start;
}
// valid
// valid
#if __aarch64__
#else
int loop = output_w_tiles;
asm volatile(
"cmp %[loop], #0 \n"
......@@ -993,6 +999,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
: [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
: "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w) {
int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
......@@ -1152,7 +1159,9 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
output_ptr1 += valid_w_start;
output_ptr2 += valid_w_start;
}
// valid
// valid
#if __aarch64__
#else
int loop = output_w_tiles;
asm volatile(
"cmp %[loop], #0 \n"
......@@ -1411,6 +1420,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
: [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
: "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w > 0) {
int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0)));
......@@ -1490,7 +1500,9 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
input_ptr2 += valid_input_w_start;
output_ptr0 += valid_w_start;
}
// valid
// valid
#if __aarch64__
#else
int loop = output_w_tiles;
asm volatile(
"cmp %[loop], #0 \n"
......@@ -1608,6 +1620,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
: [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
: "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w > 0) {
int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0)));
......@@ -1645,4 +1658,4 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // __ARM_NEON__
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) && defined(__aarch64__)
#include "operators/math/depthwise_conv3x3.h"
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
// template<>
// void DepthwiseConv3x3<int8_t, int32_t>(
// const framework::Tensor *input, const framework::Tensor *filter,
// const std::vector<int> &strides, framework::Tensor *output) {
// PADDLE_MOBILE_THROW_EXCEPTION(
// "Depthwise conv with generic strides has not been implemented.");
// }
template <>
void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
const framework::Tensor &filter,
const std::vector<int> &paddings,
framework::Tensor *output) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Depthwise conv3x3 with stride 1 for arm v8 has not been implemented.");
}
template <>
void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
const framework::Tensor &filter,
const std::vector<int> &paddings,
framework::Tensor *output) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Depthwise conv3x3 with stride 2 for arm v8 has not been implemented.");
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "framework/tensor.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
......
......@@ -31,345 +31,239 @@ inline float32x4_t vandq_f32_u32(float32x4_t x, uint32x4_t mask) {
void pack_lhs_6r(const int m, const int k, const float *A, const int lda,
float *output, const bool unroll) {
float *zero = new float[k];
memset(zero, 0, k * sizeof(float));
uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5};
int remain_k = k & 0x3;
uint32x4_t vzero = vdupq_n_u32(0);
uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k));
const int m_tail = m % 6;
const int i_length = m - m_tail;
for (int i = 0; i < i_length; i += 6) {
#pragma omp parallel for if (unroll)
for (int i = 0; i < m - 5; i += 6) {
const float *a0 = A + i * lda;
const float *a1 = A + (i + 1) * lda;
const float *a2 = A + (i + 2) * lda;
const float *a3 = A + (i + 3) * lda;
const float *a4 = A + (i + 4) * lda;
const float *a5 = A + (i + 5) * lda;
float *local_buffer = output + i * k;
for (int j = 0; j < k; ++j) {
*local_buffer++ = *a0++;
*local_buffer++ = *a1++;
*local_buffer++ = *a2++;
*local_buffer++ = *a3++;
*local_buffer++ = *a4++;
*local_buffer++ = *a5++;
float *out_ptr = output + i * k;
int loops = k >> 2;
if (loops > 0) {
#if __aarch64__
for (int l = 0; l < loops; ++l) {
float32x4_t _d0 = vld1q_f32(a0);
float32x4_t _d1 = vld1q_f32(a1);
float32x4_t _d2 = vld1q_f32(a2);
float32x4_t _d3 = vld1q_f32(a3);
float32x4_t _d4 = vld1q_f32(a4);
float32x4_t _d5 = vld1q_f32(a5);
float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
_d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0]));
_d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
_d2 =
vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
_d3 =
vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1]));
vst1q_f32(out_ptr, _d0);
vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
vst1q_f32(out_ptr + 6, _d1);
vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
vst1q_f32(out_ptr + 12, _d2);
vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
vst1q_f32(out_ptr + 18, _d3);
vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1]));
a0 += 4;
a1 += 4;
a2 += 4;
a3 += 4;
a4 += 4;
a5 += 4;
out_ptr += 24;
}
#else
asm volatile(
"loop_4k_%=: \n"
"vld1.32 {d0-d1}, [%[a0]]! \n"
"vld1.32 {d2-d3}, [%[a1]]! \n"
"vld1.32 {d4-d5}, [%[a2]]! \n"
"vld1.32 {d6-d7}, [%[a3]]! \n"
"vld1.32 {d8-d9}, [%[a4]]! \n"
"vld1.32 {d10-d11}, [%[a5]]! \n"
"vtrn.32 q0, q1 \n"
"vtrn.32 q2, q3 \n"
"vtrn.32 q4, q5 \n"
"vswp.32 d1, d4 \n"
"vswp.32 d3, d6 \n"
"vst1.32 {q0}, [%[out]]! \n"
"vst1.32 {d8}, [%[out]]! \n"
"vst1.32 {q1}, [%[out]]! \n"
"vst1.32 {d10}, [%[out]]! \n"
"vst1.32 {q2}, [%[out]]! \n"
"vst1.32 {d9}, [%[out]]! \n"
"vst1.32 {q3}, [%[out]]! \n"
"vst1.32 {d11}, [%[out]]! \n"
"subs %[loops], #1 \n"
"bne loop_4k_%= \n"
: [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2),
[a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
#endif
}
if (remain_k > 0) {
float32x4_t _d0 = vld1q_f32(a0);
float32x4_t _d1 = vld1q_f32(a1);
float32x4_t _d2 = vld1q_f32(a2);
float32x4_t _d3 = vld1q_f32(a3);
float32x4_t _d4 = vld1q_f32(a4);
float32x4_t _d5 = vld1q_f32(a5);
_d0 = vandq_f32_u32(_d0, vmask1);
_d1 = vandq_f32_u32(_d1, vmask1);
_d2 = vandq_f32_u32(_d2, vmask1);
_d3 = vandq_f32_u32(_d3, vmask1);
_d4 = vandq_f32_u32(_d4, vmask1);
_d5 = vandq_f32_u32(_d5, vmask1);
float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
_d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0]));
_d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
_d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
switch (remain_k) {
case 3:
vst1q_f32(out_ptr + 12, _d2);
vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
case 2:
vst1q_f32(out_ptr + 6, _d1);
vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
case 1:
vst1q_f32(out_ptr, _d0);
vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
default:
break;
}
}
}
if (m_tail != 0) {
const float *a0 = A + i_length * lda;
int remain_m = m % 6;
if (remain_m) {
int remain_m_start = m - remain_m;
const float *a0 = A + remain_m_start * lda;
const float *a1 = a0 + lda;
const float *a2 = a0 + 2 * lda;
const float *a3 = a0 + 3 * lda;
const float *a4 = a0 + 4 * lda;
const float *a5 = a0 + 5 * lda;
float *local_buffer = output + i_length * k;
switch (m_tail) {
case 1:
a1 = zero;
case 2:
a2 = zero;
case 3:
a3 = zero;
case 4:
a4 = zero;
case 5:
a5 = zero;
break;
default:
break;
float *out_ptr = output + remain_m_start * k;
uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m));
uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(remain_m));
const float zerobuff[4] = {0.f, 0.f, 0.f, 0.f};
int lk = 0;
for (; lk < k - 3; lk += 4) {
switch (remain_m) {
case 1:
a1 = zerobuff;
case 2:
a2 = zerobuff;
case 3:
a3 = zerobuff;
case 4:
a4 = zerobuff;
case 5:
a5 = zerobuff;
default:
break;
}
#if __aarch64__
float32x4_t _d0 = vld1q_f32(a0);
float32x4_t _d1 = vld1q_f32(a1);
float32x4_t _d2 = vld1q_f32(a2);
float32x4_t _d3 = vld1q_f32(a3);
float32x4_t _d4 = vld1q_f32(a4);
float32x4_t _d5 = vld1q_f32(a5);
float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
_d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0]));
_d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
_d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
_d3 = vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1]));
_d0 = vandq_f32_u32(_d0, vmask2);
_d1 = vandq_f32_u32(_d1, vmask2);
_d2 = vandq_f32_u32(_d2, vmask2);
_d3 = vandq_f32_u32(_d3, vmask2);
_d4 = vandq_f32_u32(_q3.val[0], vmask3);
_d5 = vandq_f32_u32(_q3.val[1], vmask3);
vst1q_f32(out_ptr, _d0);
vst1_f32(out_ptr + 4, vget_low_f32(_d4));
vst1q_f32(out_ptr + 6, _d1);
vst1_f32(out_ptr + 10, vget_low_f32(_d5));
vst1q_f32(out_ptr + 12, _d2);
vst1_f32(out_ptr + 16, vget_high_f32(_d4));
vst1q_f32(out_ptr + 18, _d3);
vst1_f32(out_ptr + 22, vget_high_f32(_d5));
out_ptr += 24;
#else
asm volatile(
"vld1.32 {d0-d1}, [%[a0]] \n"
"vld1.32 {d2-d3}, [%[a1]] \n"
"vld1.32 {d4-d5}, [%[a2]] \n"
"vld1.32 {d6-d7}, [%[a3]] \n"
"vld1.32 {d8-d9}, [%[a4]] \n"
"vld1.32 {d10-d11}, [%[a5]] \n"
"vtrn.32 q0, q1 \n"
"vtrn.32 q2, q3 \n"
"vtrn.32 q4, q5 \n"
"vswp.32 d1, d4 \n"
"vswp.32 d3, d6 \n"
"vbif q0, %q[vzero], %q[vmask2] \n"
"vbif q1, %q[vzero], %q[vmask2] \n"
"vbif q2, %q[vzero], %q[vmask2] \n"
"vbif q3, %q[vzero], %q[vmask2] \n"
"vbif q4, %q[vzero], %q[vmask3] \n"
"vbif q5, %q[vzero], %q[vmask3] \n"
"vst1.32 {q0}, [%[out]]! \n"
"vst1.32 {d8}, [%[out]]! \n"
"vst1.32 {q1}, [%[out]]! \n"
"vst1.32 {d10}, [%[out]]! \n"
"vst1.32 {q2}, [%[out]]! \n"
"vst1.32 {d9}, [%[out]]! \n"
"vst1.32 {q3}, [%[out]]! \n"
"vst1.32 {d11}, [%[out]]! \n"
: [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2),
[a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5)
: [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
#endif
}
for (int j = 0; j < k; ++j) {
*local_buffer++ = *a0++;
*local_buffer++ = *a1++;
*local_buffer++ = *a2++;
*local_buffer++ = *a3++;
*local_buffer++ = *a4++;
*local_buffer++ = *a5++;
// remain k
for (; lk < k; ++lk) {
*out_ptr++ = *a0++;
*out_ptr++ = *a1++;
*out_ptr++ = *a2++;
*out_ptr++ = *a3++;
*out_ptr++ = *a4++;
*out_ptr++ = *a5++;
}
delete[] zero;
}
// uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5};
// int remain_k = k & 0x3;
// uint32x4_t vzero = vdupq_n_u32(0);
// uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k));
//
// std::cout << "m: " << m << ", k: " << k << std::endl;
// #pragma omp parallel for if (unroll)
// for (int i = 0; i < m - 5; i += 6) {
// std::cout << "i: " << i << std::endl;
// const float *a0 = A + i * lda;
// const float *a1 = A + (i + 1) * lda;
// const float *a2 = A + (i + 2) * lda;
// const float *a3 = A + (i + 3) * lda;
// const float *a4 = A + (i + 4) * lda;
// const float *a5 = A + (i + 5) * lda;
// float *out_ptr = output + i * k;
//
// int loops = k >> 2;
// if (loops > 0) {
// #if __aarch64__
// for (int l = 0; l < loops; ++l) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
// _d2 =
// vcombine_f32(vget_high_f32(_q0.val[0]),
// vget_high_f32(_q1.val[0]));
// _d3 =
// vcombine_f32(vget_high_f32(_q0.val[1]),
// vget_high_f32(_q1.val[1]));
//
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
// vst1q_f32(out_ptr + 18, _d3);
// vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1]));
//
// a0 += 4;
// a1 += 4;
// a2 += 4;
// a3 += 4;
// a4 += 4;
// a5 += 4;
// out_ptr += 24;
// }
// #else
// asm volatile(
// "loop_4k_%=: \n"
// "vld1.32 {d0-d1}, [%[a0]]! \n"
// "vld1.32 {d2-d3}, [%[a1]]! \n"
// "vld1.32 {d4-d5}, [%[a2]]! \n"
// "vld1.32 {d6-d7}, [%[a3]]! \n"
// "vld1.32 {d8-d9}, [%[a4]]! \n"
// "vld1.32 {d10-d11}, [%[a5]]! \n"
// "vtrn.32 q0, q1 \n"
// "vtrn.32 q2, q3 \n"
// "vtrn.32 q4, q5 \n"
// "vswp.32 d1, d4 \n"
// "vswp.32 d3, d6 \n"
//
// "vst1.32 {q0}, [%[out]]! \n"
// "vst1.32 {d8}, [%[out]]! \n"
// "vst1.32 {q1}, [%[out]]! \n"
// "vst1.32 {d10}, [%[out]]! \n"
// "vst1.32 {q2}, [%[out]]! \n"
// "vst1.32 {d9}, [%[out]]! \n"
// "vst1.32 {q3}, [%[out]]! \n"
// "vst1.32 {d11}, [%[out]]! \n"
//
// "subs %[loops], #1 \n"
// "bne loop_4k_%= \n"
// : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2]
// "+r"(a2),
// [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
// :
// : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
// #endif
// }
//
// if (remain_k > 0) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// _d0 = vandq_f32_u32(_d0, vmask1);
// _d1 = vandq_f32_u32(_d1, vmask1);
// _d2 = vandq_f32_u32(_d2, vmask1);
// _d3 = vandq_f32_u32(_d3, vmask1);
// _d4 = vandq_f32_u32(_d4, vmask1);
// _d5 = vandq_f32_u32(_d5, vmask1);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2
// = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
//
// switch (remain_k) {
// case 3:
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
// case 2:
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
// case 1:
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
// default:
// break;
// }
// }
// }
//
// int remain_m = m % 6;
// if (remain_m) {
// int remain_m_start = m - remain_m;
// std::cout << "remain_m_start: " << remain_m_start << std::endl;
// const float *a0 = A + remain_m_start * lda;
// const float *a1 = a0 + lda;
// const float *a2 = a0 + 2 * lda;
// const float *a3 = a0 + 3 * lda;
// const float *a4 = a0 + 4 * lda;
// const float *a5 = a0 + 5 * lda;
// float *out_ptr = output + remain_m_start * k;
//
// uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m));
// uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4),
// vdupq_n_u32(remain_m));
//
// int loops = k >> 2;
// if (loops > 0) {
// #if __aarch64__
// for (int l = 0; l < loops; ++l) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
// _d2 =
// vcombine_f32(vget_high_f32(_q0.val[0]),
// vget_high_f32(_q1.val[0]));
// _d3 =
// vcombine_f32(vget_high_f32(_q0.val[1]),
// vget_high_f32(_q1.val[1]));
//
// _d0 = vandq_f32_u32(_d0, vmask2);
// _d1 = vandq_f32_u32(_d1, vmask2);
// _d2 = vandq_f32_u32(_d2, vmask2);
// _d3 = vandq_f32_u32(_d3, vmask2);
// _d4 = vandq_f32_u32(_q3.val[0], vmask3);
// _d5 = vandq_f32_u32(_q3.val[1], vmask3);
//
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_d4));
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_d5));
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_d4));
// vst1q_f32(out_ptr + 18, _d3);
// vst1_f32(out_ptr + 22, vget_high_f32(_d5));
//
// a0 += 4;
// a1 += 4;
// a2 += 4;
// a3 += 4;
// a4 += 4;
// a5 += 4;
// out_ptr += 24;
// }
// #else
// asm volatile(
// "loop_4k_%=: \n"
// "vld1.32 {d0-d1}, [%[a0]]! \n"
// "vld1.32 {d2-d3}, [%[a1]]! \n"
// "vld1.32 {d4-d5}, [%[a2]]! \n"
// "vld1.32 {d6-d7}, [%[a3]]! \n"
// "vld1.32 {d8-d9}, [%[a4]]! \n"
// "vld1.32 {d10-d11}, [%[a5]]! \n"
// "vtrn.32 q0, q1 \n"
// "vtrn.32 q2, q3 \n"
// "vtrn.32 q4, q5 \n"
// "vswp.32 d1, d4 \n"
// "vswp.32 d3, d6 \n"
//
// "vbif q0, %q[vzero], %q[vmask2] \n"
// "vbif q1, %q[vzero], %q[vmask2] \n"
// "vbif q2, %q[vzero], %q[vmask2] \n"
// "vbif q3, %q[vzero], %q[vmask2] \n"
// "vbif q4, %q[vzero], %q[vmask3] \n"
// "vbif q5, %q[vzero], %q[vmask3] \n"
//
// "vst1.32 {q0}, [%[out]]! \n"
// "vst1.32 {d8}, [%[out]]! \n"
// "vst1.32 {q1}, [%[out]]! \n"
// "vst1.32 {d10}, [%[out]]! \n"
// "vst1.32 {q2}, [%[out]]! \n"
// "vst1.32 {d9}, [%[out]]! \n"
// "vst1.32 {q3}, [%[out]]! \n"
// "vst1.32 {d11}, [%[out]]! \n"
//
// "subs %[loops], #1 \n"
// "bne loop_4k_%= \n"
// : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2]
// "+r"(a2),
// [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
// : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero)
// : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
// #endif
// }
//
// if (remain_k > 0) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// _d0 = vandq_f32_u32(_d0, vmask1);
// _d1 = vandq_f32_u32(_d1, vmask1);
// _d2 = vandq_f32_u32(_d2, vmask1);
// _d3 = vandq_f32_u32(_d3, vmask1);
// _d4 = vandq_f32_u32(_d4, vmask1);
// _d5 = vandq_f32_u32(_d5, vmask1);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2
// = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
// // _d3 = vcombine_f32(vget_high_f32(_q0.val[1]),
// // vget_high_f32(_q1.val[1]));
//
// _d0 = vandq_f32_u32(_d0, vmask2);
// _d1 = vandq_f32_u32(_d1, vmask2);
// _d2 = vandq_f32_u32(_d2, vmask2);
// // _d3 = vandq_f32_u32(_d3, vmask2);
// _d4 = vandq_f32_u32(_q3.val[0], vmask3);
// _d5 = vandq_f32_u32(_q3.val[1], vmask3);
//
// switch (remain_k) {
// case 3:
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_d4));
// case 2:
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_d5));
// case 1:
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_d4));
// default:
// break;
// }
// }
// }
}
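// For reference, a scalar sketch of the layout pack_lhs_6r produces
// (hypothetical helper, not part of the library): six consecutive rows of A
// are interleaved column by column, so for each k the packed buffer holds
// {A[i][k], A[i+1][k], ..., A[i+5][k]} contiguously and the 6-row GEMM
// micro-kernel can load one small block per k step. Rows past m are padded
// with zeros, matching the masked NEON tail above, so `output` must hold
// 6 * ((m + 5) / 6) * k floats.
void pack_lhs_6r_reference(const int m, const int k, const float *A,
                           const int lda, float *output) {
  for (int i = 0; i < m; i += 6) {
    float *out_ptr = output + i * k;
    for (int j = 0; j < k; ++j) {
      for (int r = 0; r < 6; ++r) {
        *out_ptr++ = (i + r < m) ? A[(i + r) * lda + j] : 0.f;
      }
    }
  }
}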
#if __aarch64__
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include <algorithm>
#include <limits>
#include "common/types.h"
#include "operators/math/math_func_neon.h"
#include "operators/math/math.h"
namespace paddle_mobile {
namespace operators {
......