diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index c12f1ce02c8ab32d04d00d76cad5dc7d6ce45bc2..30581abb2e42f9f28bbf9e9c3ba01be0964d4d56 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -231,6 +231,13 @@ void Executor<Dtype, P>::InitMemory() {
             Get_binary_data(program_.model_path + "/" + var_desc->Name());
         char *data = origin_data;
         LoadMemory(*var_desc, tensor, &data);
+
+        // DLOG << "----- " << var_desc->Name();
+        // DLOG << "----- " << tensor->dims();
+        // float *pDouble = tensor->template data<float>();
+        // for (int i = 0; i < tensor->numel() && i < 30; ++i) {
+        //   std::cout << pDouble[i] << std::endl;
+        // }
         delete origin_data;
       } else {
         if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
index 643ee84529e01aebc33a144b4c7a8181ff39a1c9..d71bc235977236fbd0dd332df556ea4bd41eacf4 100644
--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -129,10 +129,13 @@ void ConvAddCompute(const FusionConvAddParam &param) {
     //    param.Paddings(),
     //    param.Filter(), param.Bias(),
     //    param.Output(), false);
-
-    math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(),
+    if (param.Paddings()[0] == 0) {
+      math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
                                  *param.Bias(), true);
-
+    } else {
+      math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
+                                   param.Output(), *param.Bias(), true);
+    }
   } else {
     ConvAddBasic(param);
   }
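Note on the dispatch above: a 3x3 stride-2 depthwise convolution places its first window at column 0 when padding is 0 but at column -1 when padding is 1, and the two cases produce different output extents, which is why a dedicated p0 kernel is introduced rather than reusing DepthwiseConv3x3s2p1v2. A minimal sketch of the standard size arithmetic, assuming the usual convolution formula (ConvOutSize is an illustrative name, not part of this patch):

#include <cstdio>

// out = (in + 2 * pad - kernel) / stride + 1, with kernel = 3, stride = 2.
static int ConvOutSize(int in, int pad) { return (in + 2 * pad - 3) / 2 + 1; }

int main() {
  std::printf("pad=0 -> %d\n", ConvOutSize(224, 0));  // 111
  std::printf("pad=1 -> %d\n", ConvOutSize(224, 1));  // 112
  return 0;
}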
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
index 402b187f8f5e9d2fbb70fa6bcfb72c88aa53e3d3..716256a376a50f2ec1c4c62fa25703cabf3a0c66 100644
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -1881,6 +1881,105 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
 #endif
 }
 
+void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
+                          Tensor *output, Tensor bias, bool if_bias) {
+#if __ARM_NEON
+
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  const int input_channel = static_cast<int>(input->dims()[1]);
+
+  const int input_height = static_cast<int>(input->dims()[2]);
+  const int input_width = static_cast<int>(input->dims()[3]);
+  const int output_height = static_cast<int>(output->dims()[2]);
+  const int output_width = static_cast<int>(output->dims()[3]);
+  const int inhxw = input_height * input_width;
+  const int outhxw = output_height * output_width;
+
+  float32x4_t zero = vdupq_n_f32(0.0);
+  for (int b = 0; b < batch_size; b++) {
+#pragma omp parallel for
+    for (int c = 0; c < input_channel; c++) {
+      const float *filter_data = filter->data<float>() + c * 9;
+      const float *input_data = input->data<float>() + c * inhxw;
+      const float *bias_data = bias.data<float>() + c;
+      float *output_data = output->data<float>() + c * outhxw;
+      float w00 = filter_data[0];
+      float w01 = filter_data[1];
+      float w02 = filter_data[2];
+      float w10 = filter_data[3];
+      float w11 = filter_data[4];
+      float w12 = filter_data[5];
+      float w20 = filter_data[6];
+      float w21 = filter_data[7];
+      float w22 = filter_data[8];
+
+      float32x4_t biasv = vld1q_dup_f32(bias_data);
+
+      for (int i = 0; i < output_height; i += 1) {
+        for (int m = 0; m < output_width - 2; m += 3) {
+          float *output_ptr = output_data + i * output_width + m;
+          float32x4x2_t input_buff_top{}, input_buff_mid{}, input_buff_bottom{};
+          float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3,
+              tmp4, tmp5, out0;
+          input_buff_top =
+              vld2q_f32(input_data + (2 * i) * input_width + (2 * m));
+          input_buff_mid =
+              vld2q_f32(input_data + (2 * i + 1) * input_width + (2 * m));
+          input_buff_bottom =
+              vld2q_f32(input_data + (2 * i + 2) * input_width + (2 * m));
+
+          in0 = input_buff_top.val[0];
+          tmp0 = input_buff_top.val[1];
+          tmp1 = vextq_f32(in0, zero, 1);
+
+          in2 = input_buff_mid.val[0];
+          tmp2 = input_buff_mid.val[1];
+          tmp3 = vextq_f32(in2, zero, 1);
+
+          in4 = input_buff_bottom.val[0];
+          tmp4 = input_buff_bottom.val[1];
+          tmp5 = vextq_f32(in4, zero, 1);
+
+          out0 = vmulq_n_f32(in0, w00);
+          out0 = vmlaq_n_f32(out0, tmp0, w01);
+          out0 = vmlaq_n_f32(out0, tmp1, w02);
+          out0 = vmlaq_n_f32(out0, in2, w10);
+          out0 = vmlaq_n_f32(out0, tmp2, w11);
+          out0 = vmlaq_n_f32(out0, tmp3, w12);
+          out0 = vmlaq_n_f32(out0, in4, w20);
+          out0 = vmlaq_n_f32(out0, tmp4, w21);
+          out0 = vmlaq_n_f32(out0, tmp5, w22);
+          out0 = vaddq_f32(out0, biasv);
+
+          vst1q_lane_f32(output_ptr, out0, 0);
+          vst1q_lane_f32(output_ptr + 1, out0, 1);
+          vst1q_lane_f32(output_ptr + 2, out0, 2);
+        }
+        // Recompute where the vectorized loop stopped, then finish the
+        // trailing columns (and narrow outputs) in scalar code.
+        int m;
+        for (m = 0; m < output_width - 2; m += 3) {
+        }
+        for (int j = m; j < output_width; j++) {
+          output_data[i * output_width + j] =
+              input_data[(2 * i) * input_width + 2 * j] * w00 +
+              input_data[(2 * i) * input_width + 2 * j + 1] * w01 +
+              input_data[(2 * i) * input_width + 2 * j + 2] * w02 +
+              input_data[(2 * i + 1) * input_width + 2 * j] * w10 +
+              input_data[(2 * i + 1) * input_width + 2 * j + 1] * w11 +
+              input_data[(2 * i + 1) * input_width + 2 * j + 2] * w12 +
+              input_data[(2 * i + 2) * input_width + 2 * j] * w20 +
+              input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 +
+              input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22;
+          output_data[i * output_width + j] += *bias_data;
+        }
+      }
+    }
+  }
+
+#endif
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/depthwise_conv_3x3.h b/src/operators/math/depthwise_conv_3x3.h
index 60e979648f871e640924a3373c625c311c3dd067..b146b88e737a07ea08250315fc94653f63d2ad05 100644
--- a/src/operators/math/depthwise_conv_3x3.h
+++ b/src/operators/math/depthwise_conv_3x3.h
@@ -43,6 +43,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                                      Tensor *output, const Tensor *new_scale,
                                      const Tensor *new_bias, bool if_relu);
+
+void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
+                          Tensor *output, Tensor bias, bool if_bias);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
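Note on the NEON scheme in DepthwiseConv3x3s2p0 above: vld2q_f32 loads eight consecutive floats and deinterleaves them into even lanes (val[0]) and odd lanes (val[1]), so for stride 2 the even vector carries the taps at column offset 0, the odd vector the taps at offset +1, and shifting the even vector left one lane with vextq_f32 gives offset +2. Lane 3 of the shifted vector is filled from `zero`, which is why the kernel stores only lanes 0..2 per iteration. A minimal self-contained sketch of that trick for one filter row, under the same assumptions (RowMac3x3s2 is an illustrative name, not part of this patch):

#include <arm_neon.h>

// Accumulate one 3-tap filter row into four stride-2 outputs at once.
// Lane 3 of the returned vector is not valid, mirroring the kernel above,
// which stores only lanes 0..2 of each result.
static inline float32x4_t RowMac3x3s2(float32x4_t acc, const float *row,
                                      float w0, float w1, float w2) {
  float32x4x2_t v = vld2q_f32(row);  // v.val[0]: cols 0,2,4,6
                                     // v.val[1]: cols 1,3,5,7
  float32x4_t shifted =
      vextq_f32(v.val[0], vdupq_n_f32(0.f), 1);  // cols 2,4,6,(zero)
  acc = vmlaq_n_f32(acc, v.val[0], w0);  // taps at column offset 0
  acc = vmlaq_n_f32(acc, v.val[1], w1);  // taps at column offset +1
  return vmlaq_n_f32(acc, shifted, w2);  // taps at column offset +2
}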