提交 d4c0395f 编写于 作者: L liuruilong

add conv_add_relu op

上级 e691f022
......@@ -12,9 +12,13 @@ option(FPGA "fpga" OFF)
if (CPU)
add_definitions(-DPADDLE_MOBILE_CPU)
elseif (MALI_GPU)
endif()
if (MALI_GPU)
add_definitions(-DPADDLE_MOBILE_MALI_GPU)
elseif(FPGA)
endif()
if(FPGA)
add_definitions(-DPADDLE_MOBILE_FPGA)
endif()
......@@ -94,6 +98,7 @@ if (googlenet)
add_definitions(-DPOOL_OP)
add_definitions(-DRELU_OP)
add_definitions(-DFUSION_CONVADD_OP)
add_definitions(-DFUSION_CONVADD_RELU_OP)
elseif (mobilenet)
add_definitions(-DCONV_OP)
add_definitions(-DELEMENTWISEADD_OP)
......@@ -145,6 +150,7 @@ else ()
add_definitions(-DSIGMOID_OP)
add_definitions(-DSOFTMAX_OP)
add_definitions(-DTRANSPOSE_OP)
add_definitions(-DFUSION_CONVADD_RELU_OP)
endif()
if (IS_IOS)
......
......@@ -63,6 +63,7 @@ std::unordered_map<
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}};
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}};
} // namespace paddle_mobile
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "operators/conv_op.h"
#include <vector>
#include "framework/op_proto_maker.h"
#include "operators/math/conv_func.h"
#include "framework/op_registry.h"
namespace paddle_mobile {
......@@ -38,7 +39,7 @@ void ConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
......
......@@ -43,13 +43,6 @@ class ConvOp
private:
};
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "operators/depthwise_conv_op.h"
#include <vector>
#include "framework/op_proto_maker.h"
#include "operators/math/conv_func.h"
#include "framework/op_registry.h"
#include "operators/conv_op.h"
......@@ -39,7 +40,7 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/math/conv_func.h"
#include "operators/fusion_conv_add.h"
namespace paddle_mobile {
......@@ -35,7 +36,7 @@ void FushionConvAddOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
......
......@@ -18,10 +18,10 @@ limitations under the License. */
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "framework/operator.h"
#include "operators/kernel/conv_add_kernel.h"
#include "framework/program/program-optimize/fusion_op_register.h"
namespace paddle_mobile {
namespace operators {
......@@ -67,13 +67,6 @@ class FushionConvAddOp : public framework::OperatorWithKernel<
protected:
};
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
#ifdef PADDLE_MOBILE_CPU
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
......
......@@ -15,5 +15,46 @@ limitations under the License. */
#ifdef CONVADDRELU_OP
#include "fusion_conv_add_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
}
}
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_relu);
REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -17,6 +17,8 @@ limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "framework/program/program-optimize/fusion_op_register.h"
namespace paddle_mobile {
......@@ -33,22 +35,34 @@ class FushionConvAddReluOpMatcher : public framework::FusionOpMatcher {
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
std::vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}, removed_nodes);
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
};
class ConvAddReluOp {
template <typename DeviceType, typename T>
class FusionConvAddReluOp: public framework::OperatorWithKernel<
DeviceType, FushionConvAddReluParam,
operators::ConvAddReluKernel<DeviceType, T>> {
public:
private:
FusionConvAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FushionConvAddReluParam,
operators::ConvAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FushionConvAddReluParam,
operators::ConvAddReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
// new FushionConvAddReluOpMatcher());
//static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new FushionConvAddReluOpMatcher());
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
......
......@@ -70,7 +70,7 @@ class FushionFcOp : public framework::OperatorWithKernel<
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
//static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
......@@ -26,7 +26,7 @@ void ConvAddKernel<CPU, float>::Compute(
Tensor bias = *param.Bias();
int axis = param.Axis();
Tensor *output = param.Output();
expand_bias(bias, axis, output->dims());
math::expand_bias(bias, axis, output->dims());
output->ShareDataWith(bias);
int groups = param.Groups();
std::vector<int> strides = param.Strides();
......@@ -50,7 +50,7 @@ void ConvAddKernel<CPU, float>::Compute(
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
bool is_expand = math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#include "operators/kernel/conv_add_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
void ConvAddReluKernel<CPU, float>::Compute(
const FushionConvAddReluParam &param) const {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
int axis = param.Axis();
Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
output->ShareDataWith(bias);
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), true);
}
}
}
template class ConvAddReluKernel<CPU, float>;
}
}
#endif
......@@ -21,11 +21,12 @@ limitations under the License. */
#include <arm_neon.h>
#endif
#include "framework/ddim.h"
#include "operators/op_param.h"
#include "framework/operator.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
#include "operators/math/conv_func.h"
#include "operators/math/math_function.h"
namespace paddle_mobile {
namespace operators {
......@@ -33,75 +34,13 @@ namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvAddKernel : public OpKernelBase<DeviceType, FushionConvAddParam> {
public:
void Compute(const FushionConvAddParam &param) const;
};
inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
auto bias_ptr = bias.data<float>();
const DDim bias_ddim = bias.dims();
PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
"the bias tensor's dims size != 1")
DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
DDim inner_ddim =
paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
int outer_size = paddle_mobile::framework::product(outer_ddim);
int inner_size = paddle_mobile::framework::product(inner_ddim);
bias.Resize(dDim);
auto new_ptr = bias.mutable_data<float>();
int axis_size = dDim[axis];
#if __ARM_NEON
for (int i = 0; i < outer_size; ++i) {
int inner_num = inner_size >> 4;
int remain = inner_size - (inner_num << 4);
float v_bias = bias_ptr[i * axis_size / outer_size];
for (; inner_num > 0; inner_num--) {
float32x4_t v_newptr1 = vdupq_n_f32(v_bias);
float32x4_t v_newptr2 = vdupq_n_f32(v_bias);
float32x4_t v_newptr3 = vdupq_n_f32(v_bias);
float32x4_t v_newptr4 = vdupq_n_f32(v_bias);
vst1q_f32(new_ptr, v_newptr1);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr2);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr3);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr4);
new_ptr += 4;
}
for (; remain > 0; remain--) {
*new_ptr = v_bias;
new_ptr++;
}
}
#else
for (int i = 0; i < outer_size; ++i) {
float v_bias = bias_ptr[i * axis_size / outer_size];
for (int j = 0; j < inner_size; ++j) {
new_ptr[i * inner_size + j] = v_bias;
}
}
#endif
}
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
const std::vector<int> &strides,
const std::vector<int> &paddings,
const std::vector<int> &dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADD_RELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/op_param.h"
#include "operators/math/im2col.h"
#include "operators/math/vol2col.h"
#include "operators/math/conv_func.h"
#include "operators/math/math_function.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvAddReluKernel : public OpKernelBase<DeviceType, FushionConvAddReluParam> {
public:
void Compute(const FushionConvAddReluParam &param) const;
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include "operators/kernel/batchnorm_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
void BatchNormKernel<GPU_MALI, float>::Compute(const BatchNormParam &param) const {
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -23,6 +23,7 @@ template <>
void ConvKernel<GPU_MALI, float>::Compute(const ConvParam &param) const {
// ArmConvImplement imp;
// imp.Compute(param);
param.Output()->mutable_data<float>()[0] = 100.0;
}
template class ConvKernel<GPU_MALI, float>;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if __ARM_NEON
#include <arm_neon.h>
#endif
#include "framework/ddim.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::DDim;
using framework::Tensor;
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
auto bias_ptr = bias.data<float>();
const DDim bias_ddim = bias.dims();
PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
"the bias tensor's dims size != 1")
DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
DDim inner_ddim =
paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
int outer_size = paddle_mobile::framework::product(outer_ddim);
int inner_size = paddle_mobile::framework::product(inner_ddim);
bias.Resize(dDim);
auto new_ptr = bias.mutable_data<float>();
int axis_size = dDim[axis];
#if __ARM_NEON
for (int i = 0; i < outer_size; ++i) {
int inner_num = inner_size >> 4;
int remain = inner_size - (inner_num << 4);
float v_bias = bias_ptr[i * axis_size / outer_size];
for (; inner_num > 0; inner_num--) {
float32x4_t v_newptr1 = vdupq_n_f32(v_bias);
float32x4_t v_newptr2 = vdupq_n_f32(v_bias);
float32x4_t v_newptr3 = vdupq_n_f32(v_bias);
float32x4_t v_newptr4 = vdupq_n_f32(v_bias);
vst1q_f32(new_ptr, v_newptr1);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr2);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr3);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr4);
new_ptr += 4;
}
for (; remain > 0; remain--) {
*new_ptr = v_bias;
new_ptr++;
}
}
#else
for (int i = 0; i < outer_size; ++i) {
float v_bias = bias_ptr[i * axis_size / outer_size];
for (int j = 0; j < inner_size; ++j) {
new_ptr[i * inner_size + j] = v_bias;
}
}
#endif
}
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
const std::vector<int> &strides,
const std::vector<int> &paddings,
const std::vector<int> &dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
}
}
}
......@@ -175,7 +175,48 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
}
}
// 计算一个更小的 4 * 4 的 C 矩阵分块
// 分块矩阵乘法
void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
int first_time, bool relu = false) {
int Buff_A_M = m;
int Buff_B_N = n;
int _mc = m % MR;
int _nc = n % NR;
if (_mc != 0) {
Buff_A_M = m + (MR - _mc);
}
if (_nc != 0) {
Buff_B_N = n + (NR - _nc);
}
float packedA[MC * KC];
static float packedB[KC * NC];
if (first_time) {
PackMatrixB_(k, n, _nc, B, ldb, packedB);
}
PackMatrixA_(m, k, _mc, A, lda, packedA);
int i, j, mc, nc;
// B 取 4 列, 打包预热
for (j = 0; j < Buff_B_N; j += NR) {
nc = (n - j) < NR ? _nc : NR;
// A 取 4 行,打包预热
for (i = 0; i < Buff_A_M; i += MR) {
mc = (m - i) < MR ? _mc : MR;
AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
&C(i, j), ldc, mc, nc, relu);
}
}
}
//计算一个更小的 4 * 4 的 C 矩阵分块
#if defined(IOS)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
......@@ -226,6 +267,60 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc, bool relu = false) {
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
float32x4_t cv2 = vdupq_n_f32(0.0);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv;
float32x2_t av01;
float32x2_t av23;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
bv = vld1q_f32(b);
av01 = vget_low_f32(av);
cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
av23 = vget_high_f32(av);
cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
a += MR;
b += NR;
}
float32x4x4_t cv = {cv0, cv1, cv2, cv3};
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (j == 0) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0);
} else if (j == 1) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1);
} else if (j == 2) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2);
} else if (j == 3) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
}
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
}
#elif defined(ARMV7)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
......@@ -361,6 +456,155 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc, bool relu = false) {
int kc1 = k / 2, kc2 = k % 2;
int bytes_ldc = 4 * ldc;
int flag_alpha = (alpha == 1.0) ? 1 : 2;
int flag_beta;
if (beta == 0.0) {
flag_beta = 0;
} else if (beta == 1.0) {
flag_beta = 1;
} else {
flag_beta = 2;
}
asm volatile(
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"vmov.f32 q14, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
"vmla.f32 q13, q1, d1[1] \n\t"
"end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t"
"bne temp_%= \n\t"
"cmp %[nc], #4 \n\t"
"bne temp_%= \n\t"
"vmov.f32 d8[0], %[alpha] \n\t"
"vmov.f32 d8[1], %[beta] \n\t"
"cmp %[flag_alpha], #1 \n\t"
"bne alpha_%= \n\t"
"alpha_%=: \n\t"
"vmul.f32 q10, q10, d8[0] \n\t"
"vmul.f32 q11, q11, d8[0] \n\t"
"vmul.f32 q12, q12, d8[0] \n\t"
"vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t"
"mov r4, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vld1.32 {q0}, [r4], r6 \n\t"
"vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t"
"beta_eq1_%=: \n\t"
"vadd.f32 q10, q10, q0 \n\t"
"vadd.f32 q11, q11, q1 \n\t"
"vadd.f32 q12, q12, q2 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"b memory_%= \n\t"
"beta_ne1_%=: \n\t"
"vmla.f32 q10, q0, d8[1] \n\t"
"vmla.f32 q11, q1, d8[1] \n\t"
"vmla.f32 q12, q2, d8[1] \n\t"
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t"
"vst1.32 {q13}, [r5] \n\t"
"b end_%= \n\t"
"temp_%=: \n\t"
"vst1.32 {q10, q11}, [%[ab]]!\n\t"
"vst1.32 {q12, q13}, [%[ab]] \n\t"
"end_%=: \n\t"
:
: [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13", "q14");
if (mc != MR || nc != NR) {
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
if (alpha != 1.0) {
C(i, j) = alpha * ab[i * MR + j];
} else {
C(i, j) = ab[i * MR + j];
}
} else {
if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * ab[i * MR + j];
} else {
C(i, j) += ab[i * MR + j];
}
}
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
}
}
}
#else
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
......@@ -418,6 +662,70 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc, bool relu) {
float c[16] = {0};
float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
for (int p = 0; p < k; p += 1) {
reg_b0 = *b++;
reg_b1 = *b++;
reg_b2 = *b++;
reg_b3 = *b++;
reg_a0 = *a++;
reg_a1 = *a++;
reg_a2 = *a++;
reg_a3 = *a++;
// first row
c[0] += reg_a0 * reg_b0;
c[1] += reg_a0 * reg_b1;
c[2] += reg_a0 * reg_b2;
c[3] += reg_a0 * reg_b3;
// second row
c[4] += reg_a1 * reg_b0;
c[5] += reg_a1 * reg_b1;
c[6] += reg_a1 * reg_b2;
c[7] += reg_a1 * reg_b3;
// third row
c[8] += reg_a2 * reg_b0;
c[9] += reg_a2 * reg_b1;
c[10] += reg_a2 * reg_b2;
c[11] += reg_a2 * reg_b3;
// fourth row
c[12] += reg_a3 * reg_b0;
c[13] += reg_a3 * reg_b1;
c[14] += reg_a3 * reg_b2;
c[15] += reg_a3 * reg_b3;
}
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * c[i * MR + j];
} else {
C(i, j) += c[i * MR + j];
}
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
}
}
#endif
// 32位 float 矩阵乘法
......@@ -443,6 +751,34 @@ void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
}
}
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) {
int i, j, p, mc, nc, kc;
float beta_;
for (j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
for (p = 0; p < k; p += KC) {
kc = s_min(k - p, KC);
for (i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
if (p != 0) {
beta_ = 1.0;
} else {
beta_ = beta;
}
if (p + KC >= k) {
InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
&C(i, j), ldc, i == 0, true);
} else {
InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
&C(i, j), ldc, i == 0);
}
}
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -57,10 +57,16 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
int ldb, float beta, float *C, int ldc, int mc, int nc);
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc, bool relu);
// 32位 float 矩阵乘法
void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
// 64位 double 矩阵乘法
void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
const double *B, int ldb, float beta, double *C, int ldc);
......
......@@ -22,7 +22,7 @@ namespace math {
template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta) {
framework::Tensor *matrix_out, float beta, bool relu) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -41,14 +41,20 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N);
if (relu) {
sgemm_relu(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N);
} else {
sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N);
}
}
template <>
void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
double alpha, framework::Tensor *matrix_out, double beta) {
double alpha, framework::Tensor *matrix_out, double beta, bool relu) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -68,6 +74,8 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -25,7 +25,7 @@ namespace math {
template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta);
framework::Tensor *matrix_out, T beta, bool relu = false);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -823,7 +823,7 @@ class FushionConvAddParam : public OpParam {
const int &Groups() const { return groups; }
private:
protected:
Tensor *bias_;
int axis_;
Tensor *input_;
......@@ -838,5 +838,15 @@ class FushionConvAddParam : public OpParam {
Print &operator<<(Print &printer, const FushionConvAddParam &conv_param);
#endif
#ifdef FUSION_CONVADD_RELU_OP
class FushionConvAddReluParam: public FushionConvAddParam {
public:
FushionConvAddReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
const Scope &scope): FushionConvAddParam(inputs, outputs, attrs, scope) {
}
};
#endif
} // namespace operators
} // namespace paddle_mobile
......@@ -141,6 +141,10 @@ else ()
ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif()
......@@ -42,8 +42,9 @@ using std::vector;
template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> {
public:
Executor4Test(Program<DeviceType> p, string op_type)
Executor4Test(Program<DeviceType> p, string op_type, bool use_optimize = false)
: Executor<DeviceType>() {
this->use_optimize_ = use_optimize;
this->program_ = p;
if (this->use_optimize_) {
this->to_predict_program_ = this->program_.optimizeProgram;
......@@ -61,10 +62,13 @@ class Executor4Test : public Executor<DeviceType> {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (std::shared_ptr<OpDesc> op : ops) {
if (op->Type() == op_type) {
DLOG << "匹配到: " << op->Type();
/// test first meeting op in program
std::shared_ptr<paddle_mobile::framework::OperatorBase<DeviceType>>
op_ptr = paddle_mobile::framework::OpRegistry<
paddle_mobile::CPU>::CreateOp(op->Type(), op->GetInputs(),
DeviceType>::CreateOp(op->Type(), op->GetInputs(),
op->GetOutputs(),
op->GetAttrMap(),
this->program_.scope);
......
......@@ -20,8 +20,9 @@ int main() {
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto program = loader.Load(g_mobilenet_ssd, false, false);
// loader.Load(g_googlenet_combine + "/model", g_googlenet_combine +
// "/params", true);
program.originProgram->Description("program desc: ");
// auto program = loader.Load(g_googlenet_combine + "/model", g_googlenet_combine +
// "/params", true);
// program.originProgram->Description("program desc: ");
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/fusion_conv_add_relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_googlenet, true);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::FusionConvAddReluOp<paddle_mobile::CPU, float>>
executor(program, "fusion_conv_add_relu", true);
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
// // use SetupTensor if not has local input image .
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
auto output = executor.Predict(input, "data", "conv2d_0.tmp_2", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < 25; ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
......@@ -16,15 +16,15 @@ limitations under the License. */
#include "operators/conv_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
paddle_mobile::Loader<paddle_mobile::GPU_MALI> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::ConvOp<paddle_mobile::CPU, float>>
Executor4Test<paddle_mobile::GPU_MALI,
paddle_mobile::operators::ConvOp<paddle_mobile::GPU_MALI, float>>
executor(program, "conv2d");
paddle_mobile::framework::Tensor input;
......@@ -37,7 +37,7 @@ int main() {
auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
for (int j = 0; j < 20; ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册