diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h index 13fe50bf74ee164c2cc663f5a6a9eeddbfa3804b..f184a59a3abbe463f75778c4db216a306c315e8d 100644 --- a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h +++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h @@ -108,7 +108,7 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam ¶m) { Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); math::matmul(filter_slice, false, col_matrix, false, static_cast(1), &out_slice, - static_cast(1)); + static_cast(0)); } } /// todo : use neon in special case instead of 2for(300ms) @@ -131,15 +131,16 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam ¶m) { param.Input()->dims()[1] == param.Output()->dims()[1] && param.Filter()->dims()[2] == param.Filter()->dims()[3] && param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { - math::DepthwiseConvAddBNRelu3x3s1p1( - param.Input(), param.Filter(), param.Output(), &Bias, 1, - param.NewScale(), param.NewBias(), 1, 1); - } else if (0 && param.Groups() == param.Input()->dims()[1] && + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), 1); + } else if (param.Groups() == param.Input()->dims()[1] && param.Input()->dims()[1] == param.Output()->dims()[1] && param.Filter()->dims()[2] == param.Filter()->dims()[3] && param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { - math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), - param.Filter(), &Bias, param.Output(), false); + math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), 1); } else { ConvAddBNReluBasic(param); } diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index f23affb45107b0d2414c49843cdfbd70c953c95c..d1faa1fd641ffd771b5970165d51794a42655670 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -514,14 +514,11 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, } void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, - Tensor *output, Tensor *bias, bool if_bias, - const Tensor *new_scale, - const Tensor *new_bias, bool if_bn, - bool if_relu) { + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu) { const float *input_data = input->data(); const float *filter_data = filter->data(); float *output_data = output->data(); - const float *bias_data = bias->data(); const float *newscale_data = new_scale->data(); const float *newbias_data = new_bias->data(); @@ -532,7 +529,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, const int batch_size = static_cast(input->dims()[0]); const int c = static_cast(input->dims()[1]); const int hxw = h * w; - float32x4_t vbias = vdupq_n_f32(0.0); float32x4_t vnewbias = vdupq_n_f32(0.0); float32x4_t vnewscale = vdupq_n_f32(1.0); float32x4_t vzero = vdupq_n_f32(0); @@ -541,13 +537,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, const float *filter_data_tmp = filter_data; for (int j = 0; j < c; ++j) { - if (if_bias) { - vbias = vdupq_n_f32(bias_data[j]); - } - if (if_bn) { - vnewbias = vdupq_n_f32(newbias_data[j]); - vnewscale = vdupq_n_f32(newscale_data[j]); - } + vnewbias = vdupq_n_f32(newbias_data[j]); + vnewscale = vdupq_n_f32(newscale_data[j]); + int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 float w00 = filter_data_tmp[0]; float w01 = filter_data_tmp[1]; @@ -573,21 +565,14 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, w01 * input_data[(l - 2) * (l + 1) + 1] + w10 * input_data[l * l - 2] + w11 * input_data[l * l - 1]; - if (if_bias) { - output_data[0] += bias_data[j]; - output_data[l - 1] += bias_data[j]; - output_data[(l - 1) * l] += bias_data[j]; - output_data[l * l - 1] += bias_data[j]; - } - if (if_bn) { - output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j]; - output_data[l - 1] = - output_data[l - 1] * newscale_data[j] + newbias_data[j]; - output_data[(l - 1) * l] = - output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j]; - output_data[l * l - 1] = - output_data[l * l - 1] * newscale_data[j] + newbias_data[j]; - } + output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j]; + output_data[l - 1] = + output_data[l - 1] * newscale_data[j] + newbias_data[j]; + output_data[(l - 1) * l] = + output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j]; + output_data[l * l - 1] = + output_data[l * l - 1] * newscale_data[j] + newbias_data[j]; + if (if_relu) { output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1]; @@ -607,16 +592,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, w11 * input_data[i * l + l - 1] + w20 * input_data[i * l + l - 1 + l - 1] + w21 * input_data[i * l + l - 1 + l]; - if (if_bias) { - output_data[i * l] += bias_data[j]; - output_data[i * l + l - 1] += bias_data[j]; - } - if (if_bn) { - output_data[i * l] = - output_data[i * l] * newscale_data[j] + newbias_data[j]; - output_data[i * l + l - 1] = - output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j]; - } + output_data[i * l] = + output_data[i * l] * newscale_data[j] + newbias_data[j]; + output_data[i * l + l - 1] = + output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j]; + if (if_relu) { output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l]; output_data[i * l + l - 1] = @@ -652,7 +632,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, out0 = vmlaq_n_f32(out0, in2, w20); out0 = vmlaq_n_f32(out0, tmp2, w21); out0 = vmlaq_n_f32(out0, tmp3, w22); - out0 = vaddq_f32(out0, vbias); out0 = vmlaq_f32(vnewbias, vnewscale, out0); if (if_relu) { out0 = vmaxq_f32(out0, vzero); @@ -673,7 +652,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, out0 = vmlaq_n_f32(out0, in6, w10); out0 = vmlaq_n_f32(out0, tmp2, w11); out0 = vmlaq_n_f32(out0, tmp3, w12); - out0 = vaddq_f32(out0, vbias); out0 = vmlaq_f32(vnewbias, vnewscale, out0); if (if_relu) { out0 = vmaxq_f32(out0, vzero); @@ -705,7 +683,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, out0 = vmlaq_n_f32(out0, in2, w20); out0 = vmlaq_n_f32(out0, tmp2, w21); out0 = vmlaq_n_f32(out0, tmp3, w22); - out0 = vaddq_f32(out0, vbias); out0 = vmlaq_f32(vnewbias, vnewscale, out0); if (if_relu) { out0 = vmaxq_f32(out0, vzero); @@ -737,7 +714,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, out0 = vmlaq_n_f32(out0, in6, w10); out0 = vmlaq_n_f32(out0, tmp2, w11); out0 = vmlaq_n_f32(out0, tmp3, w12); - out0 = vaddq_f32(out0, vbias); out0 = vmlaq_f32(vnewbias, vnewscale, out0); if (if_relu) { out0 = vmaxq_f32(out0, vzero); @@ -783,7 +759,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, out0 = vmlaq_n_f32(out0, in4_tmp, w20); out0 = vmlaq_n_f32(out0, tmp4, w21); out0 = vmlaq_n_f32(out0, tmp5, w22); - out0 = vaddq_f32(out0, vbias); out0 = vmlaq_f32(vnewbias, vnewscale, out0); if (if_relu) { out0 = vmaxq_f32(out0, vzero); @@ -817,7 +792,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, out0 = vmlaq_n_f32(out0, in4_tmp, w20); out0 = vmlaq_n_f32(out0, tmp4, w21); out0 = vmlaq_n_f32(out0, tmp5, w22); - out0 = vaddq_f32(out0, vbias); out0 = vmlaq_f32(vnewbias, vnewscale, out0); if (if_relu) { out0 = vmaxq_f32(out0, vzero); @@ -840,6 +814,202 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, } } } + +void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu) { + const int batch_size = input->dims()[0]; + + const int input_height = input->dims()[2]; + + const int input_width = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int _kernel_size = 3; + const int stride_height = 2; + const int stride_width = 2; + const int padding_height = 1; + const int padding_width = 1; + const float zero = 0; + const int input_channel_stride = input_height * input_width; + const int output_channel_stride = output_height * output_width; + const int filter_channel_stride = 9; + const float *newscale_data = new_scale->data(); + const float *newbias_data = new_bias->data(); + + const float *input_data = input->data(); + const float *filter_data = filter->data(); + + float *output_data = output->mutable_data(); + + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + const int filter_batch_stride = output_channels * output_channel_stride; + const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr; + int hstart, wstart, hend, wend; + float result; + for (int i = 0; i < batch_size; ++i) { + for (int c = 0; c < output_channels; ++c) { + filter1 = filter_data; + filter2 = filter1 + 3; + filter3 = filter2 + 3; + + for (int ph = 0; ph < output_height; ph++) { + for (int pw = 0; pw < output_width; pw++) { + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + hend = min(hstart + _kernel_size, input_height + padding_height); + wend = min(wstart + _kernel_size, input_width + padding_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, input_height); + wend = min(wend, input_width); + pos1 = input_data + hstart * input_width + wstart; + pos2 = input_data + (hstart + 1) * input_width + wstart; + pos3 = input_data + (hstart + 2) * input_width + wstart; + output_ptr = output_data + ph * output_width + pw; + + if (hend - hstart != 3 || wend - wstart != 3) { + result = 0; + float fake_input[9] = {0}; + if (hstart == 0 && wstart == 0) { + // 左上角 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j >= 3 - hend && k >= 3 - wend) { + fake_input[3 * j + k] = + input_data[(j - (3 - hend)) * input_width + k - + (3 - wend)]; + } + } + } + } else if (hstart == 0 && wend == input_width) { + // 右上角 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j >= 3 - hend && k <= input_width - wstart - 1) { + fake_input[3 * j + k] = + input_data[(j - (3 - hend)) * input_width + k + wstart]; + } + } + } + + } else if (hend == input_height && wstart == 0) { + // 左下角 + + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j <= input_height - 1 - hstart && k >= 3 - wend) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k - (3 - wend)]; + } + } + } + } else if (hend == input_height && wend == input_width) { + // 右下角 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j <= input_height - hstart - 1 && + k <= input_width - wstart - 1) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k + wstart]; + } + } + } + } else if (hstart == 0) { + // 顶部 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j >= 3 - hend) { + fake_input[3 * j + k] = + input_data[(j - (3 - hend)) * input_width + k + wstart]; + } + } + } + + } else if (hend == input_height) { + // 底部 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j <= input_height - hstart - 1) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k + wstart]; + } + } + } + + } else if (wstart == 0) { + // 左侧 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (k >= 3 - wend) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + + (k - (3 - wend))]; + } + } + } + + } else if (wend == input_width) { + // 右侧 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (k <= input_width - wstart - 1) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k + wstart]; + } + } + } + } + for (int l = 0; l < 9; ++l) { + result += fake_input[l] * filter1[l]; + } + output_data[ph * output_width + pw] = + newscale_data[c] * result + newbias_data[c]; + + if (if_relu) { + output_data[ph * output_width + pw] = + output_data[ph * output_width + pw] < 0 + ? 0 + : output_data[ph * output_width + pw]; + } + } else { + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos2); + const float32x4_t data3 = vld1q_f32(pos3); + + const float32x4_t v_filter1 = vld1q_f32(filter1); + const float32x4_t v_filter2 = vld1q_f32(filter2); + const float32x4_t v_filter3 = vld1q_f32(filter3); + float32x4_t mula = vmulq_f32(data1, v_filter1); + mula = vmlaq_f32(mula, data2, v_filter2); + mula = vmlaq_f32(mula, data3, v_filter3); + float32x2_t res = vpadd_f32( + vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula)); + res = vpadd_f32(res, res); + output_data[ph * output_width + pw] = + vget_lane_f32(res, 0) * newscale_data[c] + newbias_data[c]; + + if (if_relu) { + output_data[ph * output_width + pw] = + output_data[ph * output_width + pw] < 0 + ? 0 + : output_data[ph * output_width + pw]; + } + } + } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + filter_data += filter_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +} } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/depthwise_conv_3x3.h b/src/operators/math/depthwise_conv_3x3.h index a0beb479926902a71b7e06128aa8cecdd5443196..b5103a53ad7d988b8d18a57cb0f0d8d4bb9fee0f 100644 --- a/src/operators/math/depthwise_conv_3x3.h +++ b/src/operators/math/depthwise_conv_3x3.h @@ -33,10 +33,11 @@ void DepthwiseConv3x3(const Tensor *input, vector strides, void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, Tensor *output, Tensor *bias, bool if_bias); void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, - Tensor *output, Tensor *bias, bool if_bias, - const Tensor *new_scale, - const Tensor *new_bias, bool if_bn, - bool if_relu); + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu); +void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu); } // namespace math } // namespace operators } // namespace paddle_mobile