diff --git a/src/operators/fusion_conv_add_bn_relu_op.h b/src/operators/fusion_conv_add_bn_relu_op.h index 753ce395980c9186c948bc1ae4c89c3d2c417fdc..389c76cc83a532fe706d911903a8412bb8bfb4ca 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.h +++ b/src/operators/fusion_conv_add_bn_relu_op.h @@ -79,11 +79,11 @@ class FusionConvAddBNReluOp #ifdef PADDLE_MOBILE_CPU -//#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER -// static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( -// new FusionConvAddBNReluMatcher()); -//#define FUSION_CONV_ADD_BN_RELU_REGISTER -//#endif +#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( + new FusionConvAddBNReluMatcher()); +#define FUSION_CONV_ADD_BN_RELU_REGISTER +#endif #endif diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h index 71ce3b61a928e252923822e934c65abdc9962a43..9dfe7a6fa8b3979b82e013cd2e7a7e00b21e6a26 100644 --- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h @@ -152,9 +152,15 @@ void ConvAddCompute(const FusionConvAddParam ¶m) { } else if (param.Groups() == param.Input()->dims()[1] && param.Input()->dims()[1] == param.Output()->dims()[1] && param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3) { - math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), - param.Filter(), param.Bias(), param.Output(), true); + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConv3x3(param.Input(), param.Strides(), + // param.Paddings(), + // param.Filter(), param.Bias(), + // param.Output(), false); + + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(), + *param.Bias(), true); + } else { ConvAddBasic(param); } diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h index f184a59a3abbe463f75778c4db216a306c315e8d..fb49a33c67face81a2615516bffd6aa151868fe3 100644 --- a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h +++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h @@ -26,8 +26,6 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam ¶m) { Tensor bias = *param.Bias(); Tensor new_bias = *param.NewBias(); Tensor new_scale = *param.NewScale(); - auto new_bias_ptr = new_bias.data(); - auto new_scale_ptr = new_scale.data(); int axis = param.Axis(); Tensor *output = param.Output(); math::expand_bias(bias, axis, output->dims()); @@ -106,20 +104,10 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam ¶m) { // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0)); - } - } - /// todo : use neon in special case instead of 2for(300ms) - auto output_ptr = output->data(); - for (int c = 0; c < output_matrix_shape[0]; c++) { - int start = c * output_matrix_shape[1]; - for (int j = 0; j < output_matrix_shape[1]; j++) { - output_ptr[start + j] = - output_ptr[start + j] * new_scale_ptr[c] + new_bias_ptr[c]; - output_ptr[start + j] = - output_ptr[start + j] < 0 ? 0 : output_ptr[start + j]; + + math::matmulWithBn( + filter_slice, false, col_matrix, false, static_cast(1), + &out_slice, static_cast(0), true, &new_scale, &new_bias); } } } @@ -138,9 +126,12 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam ¶m) { param.Input()->dims()[1] == param.Output()->dims()[1] && param.Filter()->dims()[2] == param.Filter()->dims()[3] && param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { - math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), - param.Output(), param.NewScale(), - param.NewBias(), 1); + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); } else { ConvAddBNReluBasic(param); } diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h index 885f2051f645546c2585caa72aa9c80f8d352e6c..60b09df597f218eefc7f95ba3f342ae0c51c7000 100644 --- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h @@ -37,8 +37,12 @@ void DepthwiseConvCompute(const ConvParam ¶m) { param.Input()->dims()[1] == param.Output()->dims()[1] && param.Filter()->dims()[2] == param.Filter()->dims()[3] && param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { - math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), - param.Filter(), &Bias, param.Output(), false); + // math::DepthwiseConv3x3(param.Input(), param.Strides(), + // param.Paddings(), + // param.Filter(), &Bias, param.Output(), false); + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(), + Bias, false); + } else { ConvBasic(param); } diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index d1faa1fd641ffd771b5970165d51794a42655670..c8a6473567e6572d506aa5339f8e647c5c25fd5d 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -1010,6 +1010,442 @@ void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, output_data += output_batch_stride; } } + +void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor bias, bool if_bias) { + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *bias_data = bias.data(); + + const int in_h = static_cast(input->dims()[2]); + const int in_w = static_cast(input->dims()[3]); + const int out_h = static_cast(output->dims()[2]); + const int out_w = static_cast(output->dims()[3]); + const int out_l = out_h; + const int in_l = in_h; + const int inhxw = in_h * in_w; + const int outhxw = out_h * out_w; + const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0; + const int batch_size = static_cast(input->dims()[0]); + const int c = static_cast(input->dims()[1]); + const float *input_row_ptr; + float *output_row_ptr; + + const int w_times = (out_w - 2) / 3; + + float32x4_t vbias = vdupq_n_f32(0.0); + + float32x4x2_t input_buff_mid{}, input_buff_bottom[w_times + 1]; + float32x4_t elewise_res0, elewise_res1, elewise_res2, res3; + int out2in_mid; + float32x4_t zero = vdupq_n_f32(0.0); + for (int b = batch_size; b > 0; --b) { + const float *filter_data_tmp = filter_data; + for (int j = 0; j < c; ++j) { + auto output_data_tmp = output_data + j * out_h * out_w; + auto input_data_tmp = input_data + j * in_h * in_w; + auto input_const = input_data_tmp; + + if (if_bias) { + vbias = vdupq_n_f32(bias_data[j]); + } + + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + int h_mid = 0; + + for (; h_mid < out_h - 1; h_mid++) { + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + if (h_mid == 0) { + elewise_res1 = zero; + elewise_res0 = zero; + elewise_res2 = zero; + } else { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + } + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vaddq_f32(res3, vbias); + vst1q_f32(output_row_ptr, res3); + + input_row_ptr += 6; + output_row_ptr += 3; + } + } + clock(); + + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + if (!if_pad) { + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + } + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vaddq_f32(res3, vbias); + + if ((w4 != w_times)) { + vst1q_f32(output_row_ptr, res3); + } else { + if (out_l - 2 - w_times * 3 == 1) { + vst1q_lane_f32(output_row_ptr, res3, 0); + } else if (out_l - 2 - w_times * 3 == 2) { + vst1q_lane_f32(output_row_ptr, res3, 0); + vst1q_lane_f32(output_row_ptr + 1, res3, 1); + } + } + input_row_ptr += 6; + output_row_ptr += 3; + } + + output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + + input_const[in_l] * w21 + + input_const[in_l + 1] * w22; + + out2in_mid = (out_l - 1) * 2; + output_data_tmp[out_l - 1] = + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + + out2in_mid = (out_l - 1) * 2 * in_w; + + output_data_tmp[out_l * (out_l - 1)] = + w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + + (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + + output_data_tmp[out_l * out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + if (if_bias) { + output_data_tmp[0] += bias_data[j]; + output_data_tmp[out_l - 1] += bias_data[j]; + output_data_tmp[out_l * (out_l - 1)] += bias_data[j]; + output_data_tmp[out_l * out_l - 1] += bias_data[j]; + } + for (int i = 1; i < out_h - 1; i++) { + out2in_mid = i * 2 * in_w; + output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + + w12 * input_const[out2in_mid + 1] + + w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]; + + out2in_mid = i * 2 * in_w + (out_l - 1) * 2; + output_data_tmp[i * out_l + out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + if (if_bias) { + output_data_tmp[i * out_l] += bias_data[j]; + output_data_tmp[i * out_l + out_l - 1] += bias_data[j]; + } + } + filter_data_tmp += 9; + } + input_data += inhxw * c; + output_data += outhxw * c; + } +} + +void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu) { + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *newscale_data = new_scale->data(); + const float *newbias_data = new_bias->data(); + + float32x4_t vnewbias = vdupq_n_f32(0.0); + float32x4_t vnewscale = vdupq_n_f32(1.0); + + const int in_h = static_cast(input->dims()[2]); + const int in_w = static_cast(input->dims()[3]); + const int out_h = static_cast(output->dims()[2]); + const int out_w = static_cast(output->dims()[3]); + const int out_l = out_h; + const int in_l = in_h; + const int inhxw = in_h * in_w; + const int outhxw = out_h * out_w; + const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0; + const int batch_size = static_cast(input->dims()[0]); + const int c = static_cast(input->dims()[1]); + const float *input_row_ptr; + float *output_row_ptr; + + const int w_times = (out_w - 2) / 3; + + float32x4x2_t input_buff_mid{}, input_buff_bottom[w_times + 1]; + float32x4_t elewise_res0, elewise_res1, elewise_res2, res3; + int out2in_mid; + float32x4_t zero = vdupq_n_f32(0.0); + for (int b = batch_size; b > 0; --b) { + const float *filter_data_tmp = filter_data; + for (int j = 0; j < c; ++j) { + auto output_data_tmp = output_data + j * out_h * out_w; + auto input_data_tmp = input_data + j * in_h * in_w; + auto input_const = input_data_tmp; + + vnewbias = vdupq_n_f32(newbias_data[j]); + vnewscale = vdupq_n_f32(newscale_data[j]); + + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + int h_mid = 0; + + for (; h_mid < out_h - 1; h_mid++) { + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + if (h_mid == 0) { + elewise_res1 = zero; + elewise_res0 = zero; + elewise_res2 = zero; + } else { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + } + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vmlaq_f32(vnewbias, vnewscale, res3); + + if (if_relu) { + res3 = vmaxq_f32(res3, zero); + } + vst1q_f32(output_row_ptr, res3); + + input_row_ptr += 6; + output_row_ptr += 3; + } + } + clock(); + + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + if (!if_pad) { + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + } + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vmlaq_f32(vnewbias, vnewscale, res3); + + if (if_relu) { + res3 = vmaxq_f32(res3, zero); + } + if ((w4 != w_times)) { + vst1q_f32(output_row_ptr, res3); + } else { + if (out_l - 2 - w_times * 3 == 1) { + vst1q_lane_f32(output_row_ptr, res3, 0); + } else if (out_l - 2 - w_times * 3 == 2) { + vst1q_lane_f32(output_row_ptr, res3, 0); + vst1q_lane_f32(output_row_ptr + 1, res3, 1); + } + } + input_row_ptr += 6; + output_row_ptr += 3; + } + + output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + + input_const[in_l] * w21 + + input_const[in_l + 1] * w22; + + out2in_mid = (out_l - 1) * 2; + output_data_tmp[out_l - 1] = + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + + out2in_mid = (out_l - 1) * 2 * in_w; + + output_data_tmp[out_l * (out_l - 1)] = + w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + + (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + + output_data_tmp[out_l * out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + output_data_tmp[0] = + output_data_tmp[0] * newscale_data[j] + newbias_data[j]; + output_data_tmp[out_l - 1] = + output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j]; + output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] + + newbias_data[j]; + output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_l * out_l - 1] * newscale_data[j] + + newbias_data[j]; + if (if_relu) { + output_data_tmp[0] = output_data_tmp[0] < 0 ? 0 : output_data_tmp[0]; + output_data_tmp[out_l - 1] = + output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l - 1]; + output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_l * (out_l - 1)] < 0 + ? 0 + : output_data_tmp[out_l * (out_l - 1)]; + output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_l * out_l - 1] < 0 + ? 0 + : output_data_tmp[out_l * out_l - 1]; + } + for (int i = 1; i < out_h - 1; i++) { + out2in_mid = i * 2 * in_w; + output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + + w12 * input_const[out2in_mid + 1] + + w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]; + + out2in_mid = i * 2 * in_w + (out_l - 1) * 2; + output_data_tmp[i * out_l + out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + output_data_tmp[i * out_l] = + output_data_tmp[i * out_l] * newscale_data[j] + newbias_data[j]; + output_data_tmp[i * out_l + out_l - 1] = + output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] + + newbias_data[j]; + if (if_relu) { + output_data_tmp[i * out_l] = + output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i * out_l]; + output_data_tmp[i * out_l + out_l - 1] = + output_data_tmp[i * out_l + out_l - 1] < 0 + ? 0 + : output_data_tmp[i * out_l + out_l - 1]; + } + } + filter_data_tmp += 9; + } + input_data += inhxw * c; + output_data += outhxw * c; + } +} + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/depthwise_conv_3x3.h b/src/operators/math/depthwise_conv_3x3.h index b5103a53ad7d988b8d18a57cb0f0d8d4bb9fee0f..60e979648f871e640924a3373c625c311c3dd067 100644 --- a/src/operators/math/depthwise_conv_3x3.h +++ b/src/operators/math/depthwise_conv_3x3.h @@ -38,6 +38,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu); +void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor bias, bool if_bias); +void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu); } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/test/common/test_gemm.cpp b/test/common/test_gemm.cpp index aaf3c183f3e125f09695fad8a41cfb5360e9da13..8cb778c458034aecf6cea89fcf0d3e2a3d8118ba 100644 --- a/test/common/test_gemm.cpp +++ b/test/common/test_gemm.cpp @@ -52,8 +52,9 @@ int main() { } auto time1 = time(); - paddle_mobile::operators::math::sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, - ldc); + // paddle_mobile::operators::math::Sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, + // c, + // ldc); auto time2 = time(); DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n"; for (int i = 0; i < m * n; ++i) {