diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp
index dc18c983bbbc104e11b91333868cc6479ed4c69b..1eba4cd3304b945a09c2f48131abe24f5c07ab07 100644
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -77,15 +77,15 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT:
       math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
-                                 nullptr, false);
+                                 nullptr, false, false);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P1_FLOAT:
       math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
-                                   param.Output(), nullptr, false);
+                                   param.Output(), nullptr, false, false);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P0_FLOAT:
       math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
-                                 nullptr, false);
+                                 nullptr, false, false);
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
index ebc014da4f841bb90ccd6a1582a3d3043141c151..c012f74b2b66d3b191e9a3d9dcef3d0c0988b8f6 100644
--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -122,7 +122,7 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
       param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
-                               param.Bias(), true);
+                               param.Bias(), true, false);
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
@@ -133,10 +133,10 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
     //                        param.Output(), false);
     if (param.Paddings()[0] == 0) {
       math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
-                                 param.Bias(), true);
+                                 param.Bias(), true, false);
     } else {
       math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
-                                   param.Output(), param.Bias(), true);
+                                   param.Output(), param.Bias(), true, false);
     }
   } else {
     ConvAddBasic(param);
diff --git a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
index 9f251b3d7189b36b13ae9ccc27c55b136d8ab511..04a84fc9765bf233e331bcbd8186117c104fac31 100644
--- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDRELU_OP
 
 #pragma once
+#include <operators/math/depthwise_conv3x3.h>
 #include <vector>
 #include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
@@ -26,7 +27,7 @@ namespace paddle_mobile {
 namespace operators {
 
 template <typename P>
-void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
+void ConvAddReluBasic(const FusionConvAddReluParam<CPU> &param) {
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
   Tensor bias = *param.Bias();
@@ -118,6 +119,34 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
   }
 }
 
+template <typename P>
+void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
+  param.Output()->mutable_data<float>();
+  if (param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
+    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
+                               param.Bias(), true, true);
+  } else if (param.Groups() == param.Input()->dims()[1] &&
+             param.Input()->dims()[1] == param.Output()->dims()[1] &&
+             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
+    // math::DepthwiseConv3x3(param.Input(), param.Strides(),
+    //                        param.Paddings(),
+    //                        param.Filter(), param.Bias(),
+    //                        param.Output(), false);
+    if (param.Paddings()[0] == 0) {
+      math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
+                                 param.Bias(), true, true);
+    } else {
+      math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
+                                   param.Output(), param.Bias(), true, true);
+    }
+  } else {
+    ConvAddReluBasic<P>(param);
+  }
+}
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/depthwise_conv3x3.cpp b/src/operators/math/depthwise_conv3x3.cpp
index a4466a52fac228812e8df205a61bdb594775d327..3187c6c1411d861ef27d2daae19f8a9667ba1519 100644
--- a/src/operators/math/depthwise_conv3x3.cpp
+++ b/src/operators/math/depthwise_conv3x3.cpp
@@ -251,7 +251,7 @@ void DepthwiseConv3x3(const framework::Tensor *input,
 void DepthwiseConv3x3s1p1(const framework::Tensor *input,
                           const framework::Tensor *filter,
                           framework::Tensor *output, framework::Tensor *bias,
-                          bool if_bias) {
+                          bool if_bias, bool if_relu) {
 #if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
@@ -268,6 +268,15 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
   const int c = static_cast<int>(input->dims()[1]);
   const int hxw = h * w;
   float32x4_t vbias = vdupq_n_f32(0.0);
+
+  // leftTop, rightTop, leftBottom, rightBottom
+  int lt = 0;
+  int rt = w - 1;
+  int lb = (h - 1) * w;
+  int rb = h * w - 1;
+
+  float32x4_t zero = vdupq_n_f32(0.0);
+
   for (int b = 0; b < batch_size; ++b) {
     const float *filter_data_tmp = filter_data;
 
@@ -287,39 +296,51 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
       float w21 = filter_data_tmp[7];
       float w22 = filter_data_tmp[8];
 
-      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
-                       w21 * input_data[w] + w22 * input_data[w + 1];
-      output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
-                           w20 * input_data[2 * w - 2] +
-                           w21 * input_data[2 * w - 1];
-      output_data[(h - 1) * w] =
+      output_data[lt] = w11 * input_data[0] + w12 * input_data[1] +
+                        w21 * input_data[w] + w22 * input_data[w + 1];
+      output_data[rt] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
+                        w20 * input_data[2 * w - 2] +
+                        w21 * input_data[2 * w - 1];
+      output_data[lb] =
           w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
           w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
-      output_data[h * w - 1] =
+      output_data[rb] =
           w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
           w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
       if (if_bias) {
-        output_data[0] += bias_data[j];
-        output_data[w - 1] += bias_data[j];
-        output_data[(h - 1) * w] += bias_data[j];
-        output_data[h * w - 1] += bias_data[j];
+        output_data[lt] += bias_data[j];
+        output_data[rt] += bias_data[j];
+        output_data[lb] += bias_data[j];
+        output_data[rb] += bias_data[j];
+      }
+      if (if_relu) {
+        output_data[lt] = output_data[lt] < 0 ? 0 : output_data[lt];
+        output_data[rt] = output_data[rt] < 0 ? 0 : output_data[rt];
+        output_data[lb] = output_data[lb] < 0 ? 0 : output_data[lb];
+        output_data[rb] = output_data[rb] < 0 ? 0 : output_data[rb];
       }
       for (int i = 1; i < h - 1; ++i) {
-        output_data[i * w] =
+        int left = i * w;
+        int right = i * w + w - 1;
+        output_data[left] =
             w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
             w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
             w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
-        output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] +
-                                     w01 * input_data[i * w + w - 1 - w] +
-                                     w10 * input_data[i * w + w - 1 - 1] +
-                                     w11 * input_data[i * w + w - 1] +
-                                     w20 * input_data[i * w + w - 1 + w - 1] +
-                                     w21 * input_data[i * w + w - 1 + w];
+        output_data[right] = w00 * input_data[i * w + w - 1 - w - 1] +
+                             w01 * input_data[i * w + w - 1 - w] +
+                             w10 * input_data[i * w + w - 1 - 1] +
+                             w11 * input_data[i * w + w - 1] +
+                             w20 * input_data[i * w + w - 1 + w - 1] +
+                             w21 * input_data[i * w + w - 1 + w];
         if (if_bias) {
-          output_data[i * w] += bias_data[j];
-          output_data[i * w + w - 1] += bias_data[j];
+          output_data[left] += bias_data[j];
+          output_data[right] += bias_data[j];
+        }
+        if (if_relu) {
+          output_data[left] = output_data[left] < 0 ? 0 : output_data[left];
+          output_data[right] = output_data[right] < 0 ? 0 : output_data[right];
         }
       }
 
@@ -352,7 +373,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
       out0 = vmlaq_n_f32(out0, tmp2, w21);
       out0 = vmlaq_n_f32(out0, tmp3, w22);
       out0 = vaddq_f32(out0, vbias);
-
+      if (if_relu) {
+        out0 = vmaxq_f32(out0, zero);
+      }
       vst1q_f32(output_ptr, out0);
 
       in5 = vld1q_f32(input_tmp_end + 4);
@@ -370,7 +393,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
       out0 = vmlaq_n_f32(out0, tmp2, w11);
      out0 = vmlaq_n_f32(out0, tmp3, w12);
      out0 = vaddq_f32(out0, vbias);
-
+      if (if_relu) {
+        out0 = vmaxq_f32(out0, zero);
+      }
       vst1q_f32(output_ptr + (h - 1) * w, out0);
 
       // can optimize to each 8 stride.
@@ -399,6 +424,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
       out0 = vmlaq_n_f32(out0, tmp2, w21);
       out0 = vmlaq_n_f32(out0, tmp3, w22);
       out0 = vaddq_f32(out0, vbias);
+      if (if_relu) {
+        out0 = vmaxq_f32(out0, zero);
+      }
 
       for (int i = 0; i < c_mid; ++i) {
         if (i == 0) {
@@ -428,6 +456,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
       out0 = vmlaq_n_f32(out0, tmp2, w11);
       out0 = vmlaq_n_f32(out0, tmp3, w12);
       out0 = vaddq_f32(out0, vbias);
+      if (if_relu) {
+        out0 = vmaxq_f32(out0, zero);
+      }
 
       for (int i = 0; i < c_mid; ++i) {
         if (i == 0) {
@@ -471,6 +502,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
         out0 = vmlaq_n_f32(out0, tmp4, w21);
         out0 = vmlaq_n_f32(out0, tmp5, w22);
         out0 = vaddq_f32(out0, vbias);
+        if (if_relu) {
+          out0 = vmaxq_f32(out0, zero);
+        }
 
         vst1q_f32(output_ptr, out0);
 
@@ -502,6 +536,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
         out0 = vmlaq_n_f32(out0, tmp4, w21);
         out0 = vmlaq_n_f32(out0, tmp5, w22);
         out0 = vaddq_f32(out0, vbias);
+        if (if_relu) {
+          out0 = vmaxq_f32(out0, zero);
+        }
 
         for (int i = 0; i < c_mid; ++i) {
           if (i == 0) {
@@ -1273,7 +1310,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
 void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                             const framework::Tensor *filter,
                             framework::Tensor *output, framework::Tensor *bias,
-                            bool if_bias) {
+                            bool if_bias, bool if_relu) {
 #if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
@@ -1361,6 +1398,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
         res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
                          vaddq_f32(elewise_res0, elewise_res1));
         res3 = vaddq_f32(res3, vbias);
+        if (if_relu) {
+          res3 = vmaxq_f32(res3, zero);
+        }
         vst1q_f32(output_row_ptr, res3);
 
         input_row_ptr += 6;
@@ -1395,6 +1435,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
       res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
                        vaddq_f32(elewise_res0, elewise_res1));
       res3 = vaddq_f32(res3, vbias);
+      if (if_relu) {
+        res3 = vmaxq_f32(res3, zero);
+      }
 
       if ((w4 != w_times)) {
         vst1q_f32(output_row_ptr, res3);
@@ -1410,12 +1453,18 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
         output_row_ptr += 3;
       }
 
-      output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
-                           input_const[in_w] * w21 +
-                           input_const[in_w + 1] * w22;
+      // leftTop, rightTop, leftBottom, rightBottom
+      int lt = 0;
+      int rt = out_w - 1;
+      int lb = out_w * (out_h - 1);
+      int rb = out_h * out_w - 1;
+
+      output_data_tmp[lt] = input_const[0] * w11 + input_const[1] * w12 +
+                            input_const[in_w] * w21 +
+                            input_const[in_w + 1] * w22;
 
       out2in_mid = (out_w - 1) * 2;
-      output_data_tmp[out_w - 1] =
+      output_data_tmp[rt] =
           w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
           w20 * input_const[out2in_mid + in_w - 1] +
           w21 * input_const[out2in_mid + in_w] +
@@ -1424,7 +1473,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
 
       out2in_mid = (out_h - 1) * 2 * in_w;
 
-      output_data_tmp[out_w * (out_h - 1)] =
+      output_data_tmp[lb] =
           w01 * input_const[out2in_mid - in_w] +
           w02 * input_const[out2in_mid - in_w + 1] +
           w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
@@ -1432,7 +1481,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                             w22 * input_const[out2in_mid + in_w + 1]);
 
       out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
-      output_data_tmp[out_h * out_w - 1] =
+      output_data_tmp[rb] =
           w00 * input_const[out2in_mid - in_w - 1] +
           w01 * input_const[out2in_mid - in_w] +
           w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1443,22 +1492,30 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
           (1 - if_pad_r) * (1 - if_pad_b) * w22 *
               input_const[out2in_mid + in_w + 1];
 
       if (if_bias) {
-        output_data_tmp[0] += bias_data[j];
-        output_data_tmp[out_w - 1] += bias_data[j];
-        output_data_tmp[out_w * (out_h - 1)] += bias_data[j];
-        output_data_tmp[out_h * out_w - 1] += bias_data[j];
+        output_data_tmp[lt] += bias_data[j];
+        output_data_tmp[rt] += bias_data[j];
+        output_data_tmp[lb] += bias_data[j];
+        output_data_tmp[rb] += bias_data[j];
+      }
+      if (if_relu) {
+        output_data_tmp[lt] = output_data_tmp[lt] < 0 ? 0 : output_data_tmp[lt];
+        output_data_tmp[rt] = output_data_tmp[rt] < 0 ? 0 : output_data_tmp[rt];
+        output_data_tmp[lb] = output_data_tmp[lb] < 0 ? 0 : output_data_tmp[lb];
+        output_data_tmp[rb] = output_data_tmp[rb] < 0 ? 0 : output_data_tmp[rb];
       }
       for (int i = 1; i < out_h - 1; i++) {
         out2in_mid = i * 2 * in_w;
-        output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] +
-                                     w02 * input_const[out2in_mid - in_w + 1] +
-                                     w11 * input_const[out2in_mid] +
-                                     w12 * input_const[out2in_mid + 1] +
-                                     w21 * input_const[out2in_mid + in_w] +
-                                     w22 * input_const[out2in_mid + in_w + 1];
+        int left = i * out_w;
+        output_data_tmp[left] = w01 * input_const[out2in_mid - in_w] +
+                                w02 * input_const[out2in_mid - in_w + 1] +
+                                w11 * input_const[out2in_mid] +
+                                w12 * input_const[out2in_mid + 1] +
+                                w21 * input_const[out2in_mid + in_w] +
+                                w22 * input_const[out2in_mid + in_w + 1];
         out2in_mid = i * 2 * in_w + (out_w - 1) * 2;
-        output_data_tmp[i * out_w + out_w - 1] =
+        int right = i * out_w + out_w - 1;
+        output_data_tmp[right] =
             w00 * input_const[out2in_mid - in_w - 1] +
             w01 * input_const[out2in_mid - in_w] +
             w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1468,8 +1525,14 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
             w12 * input_const[out2in_mid + 1] +
             w22 * input_const[out2in_mid + in_w + 1]);
         if (if_bias) {
-          output_data_tmp[i * out_w] += bias_data[j];
-          output_data_tmp[i * out_w + out_w - 1] += bias_data[j];
+          output_data_tmp[left] += bias_data[j];
+          output_data_tmp[right] += bias_data[j];
+        }
+        if (if_relu) {
+          output_data_tmp[left] =
+              output_data_tmp[left] < 0 ? 0 : output_data_tmp[left];
+          output_data_tmp[right] =
+              output_data_tmp[right] < 0 ? 0 : output_data_tmp[right];
         }
       }
       filter_data_tmp += 9;
@@ -1909,7 +1972,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
 void DepthwiseConv3x3s2p0(const framework::Tensor *input,
                           const framework::Tensor *filter,
                           framework::Tensor *output, framework::Tensor *bias,
-                          bool if_bias) {
+                          bool if_bias, bool if_relu) {
 #if __ARM_NEON
 
   const int batch_size = static_cast<int>(input->dims()[0]);
@@ -1977,6 +2040,9 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
           if (if_bias) {
             out0 = vaddq_f32(out0, biasv);
           }
+          if (if_relu) {
+            out0 = vmaxq_f32(out0, zero);
+          }
           vst1q_lane_f32(output_ptr, out0, 0);
           vst1q_lane_f32(output_ptr + 1, out0, 1);
           vst1q_lane_f32(output_ptr + 2, out0, 2);
@@ -1985,7 +2051,8 @@
         for (m = 0; m < output_width - 2; m += 3) {
         }
         for (int j = m; j < output_width; j++) {
-          output_data[i * output_width + j] =
+          int index = i * output_width + j;
+          output_data[index] =
               input_data[(2 * i) * input_width + 2 * j] * w00 +
               input_data[(2 * i) * input_width + 2 * j + 1] * w01 +
               input_data[(2 * i) * input_width + 2 * j + 2] * w02 +
@@ -1996,7 +2063,11 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
               input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 +
               input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22;
           if (if_bias) {
-            output_data[i * output_width + j] += *bias_data;
+            output_data[index] += *bias_data;
+          }
+          if (if_relu) {
+            output_data[index] =
+                output_data[index] < 0 ? 0 : output_data[index];
           }
         }
       }
diff --git a/src/operators/math/depthwise_conv3x3.h b/src/operators/math/depthwise_conv3x3.h
index ca8f45fa5186fc1a2642a53f27526c6898bfb8e3..fde5d878c8a62c167af7a3359a991f77d3d3fce5 100644
--- a/src/operators/math/depthwise_conv3x3.h
+++ b/src/operators/math/depthwise_conv3x3.h
@@ -32,7 +32,7 @@ void DepthwiseConv3x3(const framework::Tensor *input,
 void DepthwiseConv3x3s1p1(const framework::Tensor *input,
                           const framework::Tensor *filter,
                           framework::Tensor *output, framework::Tensor *bias,
-                          bool if_bias);
+                          bool if_bias, bool if_relu);
 
 void DepthwiseConvAddBNRelu3x3s1p1(const framework::Tensor *input,
                                    const framework::Tensor *filter,
@@ -51,7 +51,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
 void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                             const framework::Tensor *filter,
                             framework::Tensor *output, framework::Tensor *bias,
-                            bool if_bias);
+                            bool if_bias, bool if_relu);
 
 void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
                                      const framework::Tensor *filter,
@@ -63,7 +63,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
 void DepthwiseConv3x3s2p0(const framework::Tensor *input,
                           const framework::Tensor *filter,
                           framework::Tensor *output, framework::Tensor *bias,
-                          bool if_bias);
+                          bool if_bias, bool if_relu);
 
 // TODO(hjchen2) need to be implemented
 // template <typename Itype, typename Otype>
diff --git a/tools/build.sh b/tools/build.sh
index 6e96404c3eac36de53b810d563720d485816f0f9..f83a7fe1876b014a12f0acfa8ddbef61d953ceda 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -162,7 +162,7 @@ build_for_ios() {
     fi
     cd "${BUILD_DIR}"
     make -j 8
-    cp ../../../src/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
+    cp ../../../src/io/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
     cd ./build
     # generate the symbol table
     ranlib *.a
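
Note: every change above applies the same fused epilogue — optionally add the bias, then optionally clamp at zero — written as `x < 0 ? 0 : x` on the scalar border paths and `vmaxq_f32(out, zero)` on the NEON paths. For reference, a minimal standalone sketch of that epilogue; the helper name `FuseBiasRelu` and its signature are hypothetical and not part of this patch, and it assumes an ARM NEON target like the kernels themselves:

#include <arm_neon.h>  // NEON intrinsics, as used by the kernels above

// Illustrative only: the optional bias + ReLU tail that the patched
// depthwise-conv kernels fuse into their inner loops.
void FuseBiasRelu(float *data, int len, float bias, bool if_bias,
                  bool if_relu) {
  float32x4_t zero = vdupq_n_f32(0.0);
  float32x4_t vbias = vdupq_n_f32(if_bias ? bias : 0.0f);
  int i = 0;
  for (; i + 4 <= len; i += 4) {
    float32x4_t v = vld1q_f32(data + i);
    v = vaddq_f32(v, vbias);   // mirrors vaddq_f32(out0, vbias)
    if (if_relu) {
      v = vmaxq_f32(v, zero);  // mirrors vmaxq_f32(out0, zero)
    }
    vst1q_f32(data + i, v);
  }
  for (; i < len; ++i) {  // scalar tail, mirrors the border-pixel code
    if (if_bias) data[i] += bias;
    if (if_relu) data[i] = data[i] < 0 ? 0 : data[i];
  }
}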