diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index 3779989fd8908ac5277261e58f287158e2d2d19c..ea7e611e8eba8359de66b4a3e62bca39f25d82f4 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -1466,9 +1466,6 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, const Tensor *new_bias, bool if_relu) { #if __ARM_NEON #ifdef _OPENMP - const float *input_data = input->data(); - const float *filter_data = filter->data(); - float *output_data = output->data(); const float *newscale_data = new_scale->data(); const float *newbias_data = new_bias->data(); @@ -1482,14 +1479,15 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, const int inhxw = input_height * input_width; const int outhxw = output_height * output_width; - float32x4_t vnewbias = vdupq_n_f32(0.0); - float32x4_t vnewscale = vdupq_n_f32(1.0); float32x4_t zero = vdupq_n_f32(0.0); for (int b = 0; b < batch_size; b++) { - filter_data = filter->data(); + #pragma omp parallel for for (int c = 0; c < input_channel; c++) { - vnewbias = vdupq_n_f32(newbias_data[c]); - vnewscale = vdupq_n_f32(newscale_data[c]); + const float *filter_data = filter->data() + c * 9; + const float *input_data = input->data() + c * inhxw; + float *output_data = output->data() + c * outhxw; + float32x4_t vnewbias = vdupq_n_f32(newbias_data[c]); + float32x4_t vnewscale = vdupq_n_f32(newscale_data[c]); float w00 = filter_data[0]; float w01 = filter_data[1]; @@ -1527,7 +1525,9 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, if (if_relu) { out0 = vmaxq_f32(out0, zero); } - vst1q_f32(output_ptr, out0); + vst1q_lane_f32(output_ptr, out0, 0); + vst1q_lane_f32(output_ptr + 1, out0, 1); + vst1q_lane_f32(output_ptr + 2, out0, 2); } for (m = 1; m < output_width - 2; m += 3) { } @@ -1543,8 +1543,6 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, } } -#pragma omp parallel for - for (int i = 1; i < output_height; i += 1) { for (int m = 1; m < output_width - 2; m += 3) { float *output_ptr = output_data + i * output_width + m; @@ -1583,7 +1581,9 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, if (if_relu) { out0 = vmaxq_f32(out0, zero); } - vst1q_f32(output_ptr, out0); + vst1q_lane_f32(output_ptr, out0, 0); + vst1q_lane_f32(output_ptr + 1, out0, 1); + vst1q_lane_f32(output_ptr + 2, out0, 2); } int m; for (m = 1; m < output_width - 2; m += 3) { @@ -1635,10 +1635,6 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, : output_data[i * output_width]; } } - - input_data = input_data + inhxw; - output_data = output_data + outhxw; - filter_data = filter_data + 9; } }