diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
index 6e5b467c9285c9a752b201c253080990d413893d..d71bc235977236fbd0dd332df556ea4bd41eacf4 100644
--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -124,8 +124,7 @@ void ConvAddCompute(const FusionConvAddParam &param) {
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     //    math::DepthwiseConv3x3(param.Input(), param.Strides(),
     //    param.Paddings(),
     //                           param.Filter(), param.Bias(),
diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
index 06c63c4a8d62d886f25465048faf6c109df0eafd..a7d14fbad1e4b72a8571d13898e55a6cad8bf9a8 100644
--- a/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
@@ -118,16 +118,14 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-      param.Input()->dims()[2] == param.Input()->dims()[3]) {
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                         param.Output(), param.NewScale(),
                                         param.NewBias(), true);
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     //    math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
     //    param.Output(), param.NewScale(),
     //    param.NewBias(), 1);
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index 39ef81bd15cf6313dfde2ac16e5c5d5303393b7d..e7a8c7f52db327f3ff5871566c3557c484ba4d13 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -130,8 +130,7 @@ void ConvCompute(const ConvParam &param) {
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3) {
     math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
                            param.Filter(), nullptr, param.Output(), false);
   } else {
diff --git a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
index 186f77a4cee42fba9a80d11f20f2f6fa6e2132eb..7c31eed19693d20084e25daa485a0553d5d795f2 100644
--- a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
@@ -122,16 +122,14 @@ void ConvBNAddReluCompute(const FusionConvBNAddReluParam &param) {
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-      param.Input()->dims()[2] == param.Input()->dims()[3]) {
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                         param.Output(), param.NewScale(),
                                         param.NewBias(), true);
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     //    math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
     //    param.Output(), param.NewScale(),
     //    param.NewBias(), 1);
diff --git a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
index 27fe0a8a014ff11f96017cad3acc7557cbde5583..c6300f96e1b999c45538417c7b513068697ad4dd 100644
--- a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
@@ -117,16 +117,14 @@ void ConvBNReluCompute(const FusionConvBNReluParam &param) {
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-      param.Input()->dims()[2] == param.Input()->dims()[3]) {
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                         param.Output(), param.NewScale(),
                                         param.NewBias(), true);
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     //    math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
     //    param.Output(), param.NewScale(),
     //    param.NewBias(), 1);
diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
index d039786d1febe4a8c63df98f1732ae0f9de98474..73170bdab922a46831334307aebc8af210ddfb73 100644
--- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
@@ -36,8 +36,7 @@ void DepthwiseConvCompute(const ConvParam &param) {
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     //    math::DepthwiseConv3x3(param.Input(), param.Strides(),
     //    param.Paddings(),
     //                           param.Filter(), &Bias, param.Output(), false);
diff --git a/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
index a9b2668b7be4ceb8717621b14aff4e58c81053de..b60bf9b4d6df9d85cc2fbe378a3904c2d13e5e60 100644
--- a/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
@@ -115,16 +115,14 @@ void DWConvBNReluCompute(const FusionDWConvBNReluParam &param) {
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-      param.Input()->dims()[2] == param.Input()->dims()[3]) {
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                         param.Output(), param.NewScale(),
                                         param.NewBias(), true);
   } else if (param.Groups() == param.Input()->dims()[1] &&
             param.Input()->dims()[1] == param.Output()->dims()[1] &&
             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-            param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-            param.Input()->dims()[2] == param.Input()->dims()[3]) {
+            param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     //    math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
     //    param.Output(), param.NewScale(),
     //    param.NewBias(), 1);
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
index adaa6d2d9002892c7f563c1bee257a62a68592fb..b222e5a08a69701ab9ab2c6d91e89a2d1fa63454 100644
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -302,7 +302,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
     for (int i = 1; i < h - 1; ++i) {
       output_data[i * w] =
           w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
-          w11 * input_data[i * w] + w12 * input_data[i * w + w] +
+          w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
           w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
       output_data[i * w + w - 1] =
           w00 * input_data[i * w + w - 1 - w - 1] +
@@ -537,8 +537,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
 
   const int hxw = input_height * input_width;
 
-  const int l = input_height;
-
+  // const int l = input_height;
+  const int h = input_height;
+  const int w = input_width;
   float32x4_t vzero = vdupq_n_f32(0);
 
   for (int b = 0; b < batch_size; b++) {
@@ -624,54 +625,53 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       }
 
       output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
-                       w21 * input_data[l] + w22 * input_data[l + 1];
-      output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
-                           w20 * input_data[2 * l - 2] +
-                           w21 * input_data[2 * l - 1];
-      output_data[(l - 1) * l] =
-          w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
-          w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
-      output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
-                               w01 * input_data[(l - 2) * (l + 1) + 1] +
-                               w10 * input_data[l * l - 2] +
-                               w11 * input_data[l * l - 1];
+                       w21 * input_data[w] + w22 * input_data[w + 1];
+      output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
+                           w20 * input_data[2 * w - 2] +
+                           w21 * input_data[2 * w - 1];
+      output_data[(h - 1) * w] =
+          w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
+          w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
+      output_data[h * w - 1] =
+          w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
+          w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
 
       output_data[0] = output_data[0] * newscale_data[c] + newbias_data[c];
-      output_data[l - 1] =
-          output_data[l - 1] * newscale_data[c] + newbias_data[c];
-      output_data[(l - 1) * l] =
-          output_data[(l - 1) * l] * newscale_data[c] + newbias_data[c];
-      output_data[l * l - 1] =
-          output_data[l * l - 1] * newscale_data[c] + newbias_data[c];
+      output_data[w - 1] =
+          output_data[w - 1] * newscale_data[c] + newbias_data[c];
+      output_data[(h - 1) * w] =
+          output_data[(h - 1) * w] * newscale_data[c] + newbias_data[c];
+      output_data[h * w - 1] =
+          output_data[h * w - 1] * newscale_data[c] + newbias_data[c];
 
       if (if_relu) {
         output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
-        output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1];
-        output_data[(l - 1) * l] =
-            output_data[(l - 1) * l] < 0 ? 0 : output_data[(l - 1) * l];
-        output_data[l * l - 1] =
-            output_data[l * l - 1] < 0 ? 0 : output_data[l * l - 1];
+        output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - 1];
+        output_data[(h - 1) * w] =
+            output_data[(h - 1) * w] < 0 ? 0 : output_data[(h - 1) * w];
+        output_data[h * w - 1] =
+            output_data[h * w - 1] < 0 ? 0 : output_data[h * w - 1];
       }
-      for (int i = 1; i < l - 1; ++i) {
-        output_data[i * l] =
-            w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
-            w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
-            w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1];
-
-        output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
-                                     w01 * input_data[i * l + l - 1 - l] +
-                                     w10 * input_data[i * l + l - 1 - 1] +
-                                     w11 * input_data[i * l + l - 1] +
-                                     w20 * input_data[i * l + l - 1 + l - 1] +
-                                     w21 * input_data[i * l + l - 1 + l];
-        output_data[i * l] =
-            output_data[i * l] * newscale_data[c] + newbias_data[c];
-        output_data[i * l + l - 1] =
-            output_data[i * l + l - 1] * newscale_data[c] + newbias_data[c];
+      for (int i = 1; i < h - 1; ++i) {
+        output_data[i * w] =
+            w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
+            w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
+            w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
+
+        output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] +
+                                     w01 * input_data[i * w + w - 1 - w] +
+                                     w10 * input_data[i * w + w - 1 - 1] +
+                                     w11 * input_data[i * w + w - 1] +
+                                     w20 * input_data[i * w + w - 1 + w - 1] +
+                                     w21 * input_data[i * w + w - 1 + w];
+        output_data[i * w] =
+            output_data[i * w] * newscale_data[c] + newbias_data[c];
+        output_data[i * w + w - 1] =
+            output_data[i * w + w - 1] * newscale_data[c] + newbias_data[c];
         if (if_relu) {
-          output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l];
-          output_data[i * l + l - 1] =
-              output_data[i * l + l - 1] < 0 ? 0 : output_data[i * l + l - 1];
+          output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i * w];
+          output_data[i * w + w - 1] =
+              output_data[i * w + w - 1] < 0 ? 0 : output_data[i * w + w - 1];
         }
       }
 
@@ -774,7 +774,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
 
   const int h = static_cast<int>(input->dims()[2]);
   const int w = static_cast<int>(input->dims()[3]);
-  const int l = h;
+//  const int l = h;
   const int batch_size = static_cast<int>(input->dims()[0]);
   const int c = static_cast<int>(input->dims()[1]);
 
@@ -790,7 +790,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       vnewbias = vdupq_n_f32(newbias_data[j]);
       vnewscale = vdupq_n_f32(newscale_data[j]);
 
-      int l_mid = l - 2;  // l=1->l_mid=-1,l=2->l_mid=0
+      int w_mid = w - 2;  // l=1->l_mid=-1,l=2->l_mid=0
       float w00 = filter_data_tmp[0];
       float w01 = filter_data_tmp[1];
       float w02 = filter_data_tmp[2];
@@ -802,49 +802,49 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       float w22 = filter_data_tmp[8];
 
       output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
-                       w21 * input_data[l] + w22 * input_data[l + 1];
-
-      output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l -
-      1] + w20 * input_data[2 * l - 2] + w21 * input_data[2 * l - 1];
-
-      output_data[(l - 1) * l] =
-          w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l +
-      1] + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
-      output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
-                               w01 * input_data[(l - 2) * (l + 1) + 1] +
-                               w10 * input_data[l * l - 2] +
-                               w11 * input_data[l * l - 1];
+                       w21 * input_data[w] + w22 * input_data[w + 1];
+
+      output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w -
+      1] + w20 * input_data[2 * w - 2] + w21 * input_data[2 * w - 1];
+
+      output_data[(h - 1) * w] =
+          w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w +
+      1] + w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
+      output_data[h * w - 1] = w00 * input_data[h*w-w-2] +
+                               w01 * input_data[h*w-w-1] +
+                               w10 * input_data[h * w - 2] +
+                               w11 * input_data[h * w - 1];
 
       output_data[0] = output_data[0] * newscale_data[j] +
-      newbias_data[j]; output_data[l - 1] = output_data[l - 1] *
-      newscale_data[j] + newbias_data[j]; output_data[(l - 1) * l] =
-          output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j];
-      output_data[l * l - 1] =
-          output_data[l * l - 1] * newscale_data[j] + newbias_data[j];
+      newbias_data[j]; output_data[w - 1] = output_data[w - 1] *
+      newscale_data[j] + newbias_data[j]; output_data[(h - 1) * w] =
+          output_data[(h - 1) * w] * newscale_data[j] + newbias_data[j];
+      output_data[h * w - 1] =
+          output_data[h * w - 1] * newscale_data[j] + newbias_data[j];
 
       if (if_relu) {
         output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
-        output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l -
-        1]; output_data[(l - 1) * l] = output_data[(l - 1) * l] < 0 ? 0 :
-            output_data[(l - 1) * l]; output_data[l * l - 1] = output_data[l * l - 1]
-        < 0 ? 0 : output_data[l * l - 1];
+        output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w -
+        1]; output_data[(h - 1) * w] = output_data[(h - 1) * w] < 0 ? 0 :
+            output_data[(h - 1) * w]; output_data[h * w - 1] = output_data[h * w - 1]
+        < 0 ? 0 : output_data[h * w - 1];
       }
 
-      for (int i = 1; i < l - 1; ++i) {
-        output_data[i * l] =
-            w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1]
-        + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + w21 *
-        input_data[i * l + l] + w22 * input_data[i * l + l + 1]; output_data[i *
-        l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + w01 * input_data[i
-        * l + l - 1 - l] + w10 * input_data[i * l + l - 1 - 1] + w11 *
-        input_data[i * l + l - 1] + w20 * input_data[i * l + l - 1 + l - 1] + w21
-        * input_data[i * l + l - 1 + l]; output_data[i * l] = output_data[i * l]
-        * newscale_data[j] + newbias_data[j]; output_data[i * l + l - 1] =
-            output_data[i * l + l - 1] * newscale_data[j] +
+      for (int i = 1; i < h - 1; ++i) {
+        output_data[i * w] =
+            w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1]
+        + w11 * input_data[i * w] + w12 * input_data[i * w + 1] + w21 *
+        input_data[i * w + w] + w22 * input_data[i * w + w + 1]; output_data[i *
+        w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] + w01 * input_data[i
+        * w + w - 1 - w] + w10 * input_data[i * w + w - 1 - 1] + w11 *
+        input_data[i * w + w - 1] + w20 * input_data[i * w + w - 1 + w - 1] + w21
+        * input_data[i * w + w - 1 + w]; output_data[i * w] = output_data[i * w]
+        * newscale_data[j] + newbias_data[j]; output_data[i * w + w - 1] =
+            output_data[i * w + w - 1] * newscale_data[j] +
         newbias_data[j]; if (if_relu) {
-          output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i
-          * l]; output_data[i * l + l - 1] = output_data[i * l + l - 1] < 0 ? 0 :
-              output_data[i * l + l - 1];
+          output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i
+          * w]; output_data[i * w + w - 1] = output_data[i * w + w - 1] < 0 ? 0 :
+              output_data[i * w + w - 1];
         }
       }
 
@@ -853,11 +853,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
 
       float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
       tmp3, tmp4, tmp5, out0; in0 = vld1q_f32(input_tmp); in2 =
-      vld1q_f32(input_tmp + l); const float *input_tmp_end = input_tmp + (l -
-      2) * l; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end +
-      l); int c_mid = l_mid; auto output_ptr = output_data + 1; for (; c_mid >
+      vld1q_f32(input_tmp + w); const float *input_tmp_end = input_tmp + (h -
+      2) * w; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end +
+      w); int c_mid = w_mid; auto output_ptr = output_data + 1; for (; c_mid >
       3; c_mid -= 4) { in1 = vld1q_f32(input_tmp + 4); in3 =
-      vld1q_f32(input_tmp + l + 4);
+      vld1q_f32(input_tmp + w + 4);
 
       tmp0 = vextq_f32(in0, in1, 1);
       tmp1 = vextq_f32(in0, in1, 2);
@@ -878,7 +878,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       vst1q_f32(output_ptr, out0);
 
       in5 = vld1q_f32(input_tmp_end + 4);
-      in7 = vld1q_f32(input_tmp_end + l + 4);
+      in7 = vld1q_f32(input_tmp_end + w + 4);
 
       tmp0 = vextq_f32(in4, in5, 1);
       tmp1 = vextq_f32(in4, in5, 2);
@@ -895,7 +895,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       if (if_relu) {
         out0 = vmaxq_f32(out0, vzero);
       }
-      vst1q_f32(output_ptr + (l - 1) * l, out0);
+      vst1q_f32(output_ptr + (h - 1) * w, out0);
 
       // can optimize to each 8 stride.
       input_tmp += 4;
@@ -908,8 +908,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       }
 
       // top right pad
-      float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]);
-      float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]);
+      float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]);
+      float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]);
 
       tmp0 = vextq_f32(in0, pad0, 1);
       tmp1 = vextq_f32(in0, pad0, 2);
@@ -939,8 +939,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       }
 
       // bottom right pad
-      float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]);
-      float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]);
+      float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]);
+      float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]);
 
       tmp0 = vextq_f32(in4, pad2, 1);
       tmp1 = vextq_f32(in4, pad2, 2);
@@ -959,29 +959,29 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      }
      for (int i = 0; i < c_mid; ++i) {
        if (i == 0) {
-          vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0);
+          vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0);
        }
        if (i == 1) {
-          vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1);
+          vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1);
        }
        if (i == 2) {
-          vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2);
+          vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2);
        }
      }
 
      // mid
-      for (int i = 0; i < l - 2; ++i) {
-        auto output_ptr = output_data + (i + 1) * l + 1;
-        input_tmp = input_data + i * l;
+      for (int i = 0; i < h - 2; ++i) {
+        auto output_ptr = output_data + (i + 1) * w + 1;
+        input_tmp = input_data + i * w;
        auto in0_tmp = vld1q_f32(input_tmp);
-        auto in2_tmp = vld1q_f32(input_tmp + l);
-        auto in4_tmp = vld1q_f32(input_tmp + l + l);
-        c_mid = l_mid;
+        auto in2_tmp = vld1q_f32(input_tmp + w);
+        auto in4_tmp = vld1q_f32(input_tmp + w + w);
+        c_mid = w_mid;
 
        for (; c_mid > 3; c_mid -= 4) {
          auto in1_tmp = vld1q_f32(input_tmp + 4);
-          auto in3_tmp = vld1q_f32(input_tmp + l + 4);
-          auto in5_tmp = vld1q_f32(input_tmp + l + l + 4);
+          auto in3_tmp = vld1q_f32(input_tmp + w + 4);
+          auto in5_tmp = vld1q_f32(input_tmp + w + w + 4);
 
          tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
          tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
@@ -1012,9 +1012,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
          in4_tmp = in5_tmp;
        }
 
-        float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]);
-        float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]);
-        float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]);
+        float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]);
+        float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]);
+        float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]);
 
        tmp0 = vextq_f32(in0_tmp, pad0, 1);
        tmp1 = vextq_f32(in0_tmp, pad0, 2);
@@ -1058,6 +1058,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
 #endif
 }
 
+/// w!=h not fix
 void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, const Tensor *new_scale,
                                    const Tensor *new_bias, bool if_relu) {
@@ -1273,7 +1274,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
   const int in_l = in_h;
   const int inhxw = in_h * in_w;
   const int outhxw = out_h * out_w;
-  const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
+  /// todo : fix if_pad when w != h
+  const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
   const int batch_size = static_cast<int>(input->dims()[0]);
   const int c = static_cast<int>(input->dims()[1]);
   const float *input_row_ptr;
@@ -1379,9 +1381,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
           if ((w4 != w_times)) {
             vst1q_f32(output_row_ptr, res3);
           } else {
-            if (out_l - 2 - w_times * 3 == 1) {
+            if (out_w - 2 - w_times * 3 == 1) {
               vst1q_lane_f32(output_row_ptr, res3, 0);
-            } else if (out_l - 2 - w_times * 3 == 2) {
+            } else if (out_w - 2 - w_times * 3 == 2) {
               vst1q_lane_f32(output_row_ptr, res3, 0);
               vst1q_lane_f32(output_row_ptr + 1, res3, 1);
             }
@@ -1391,28 +1393,28 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
       }
 
       output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
-                           input_const[in_l] * w21 +
-                           input_const[in_l + 1] * w22;
+                           input_const[in_w] * w21 +
+                           input_const[in_w + 1] * w22;
 
-      out2in_mid = (out_l - 1) * 2;
-      output_data_tmp[out_l - 1] =
+      out2in_mid = (out_h - 1) * 2;
+      output_data_tmp[out_w - 1] =
           w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
           w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
-      out2in_mid = (out_l - 1) * 2 * in_w;
+      out2in_mid = (out_h - 1) * 2 * in_w;
 
-      output_data_tmp[out_l * (out_l - 1)] =
+      output_data_tmp[out_w * (out_h - 1)] =
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
                          w22 * input_const[out2in_mid + in_w + 1]);
 
-      out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2;
+      out2in_mid = (out_h - 1) * 2 * in_w + (out_h - 1) * 2;
 
-      output_data_tmp[out_l * out_l - 1] =
+      output_data_tmp[out_h * out_w - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1423,21 +1425,21 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
                          w22 * input_const[out2in_mid + in_w + 1]);
       if (if_bias) {
         output_data_tmp[0] += bias_data[j];
-        output_data_tmp[out_l - 1] += bias_data[j];
-        output_data_tmp[out_l * (out_l - 1)] += bias_data[j];
-        output_data_tmp[out_l * out_l - 1] += bias_data[j];
+        output_data_tmp[out_w - 1] += bias_data[j];
+        output_data_tmp[out_w * (out_h - 1)] += bias_data[j];
+        output_data_tmp[out_h * out_w - 1] += bias_data[j];
       }
 
      for (int i = 1; i < out_h - 1; i++) {
        out2in_mid = i * 2 * in_w;
-        output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] +
+        output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] +
                                      w02 * input_const[out2in_mid - in_w + 1] +
                                      w11 * input_const[out2in_mid] +
                                      w12 * input_const[out2in_mid + 1] +
                                      w21 * input_const[out2in_mid + in_w] +
                                      w22 * input_const[out2in_mid + in_w + 1];
-        out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
-        output_data_tmp[i * out_l + out_l - 1] =
+        out2in_mid = i * 2 * in_w + (out_h - 1) * 2;
+        output_data_tmp[i * out_w + out_w - 1] =
            w00 * input_const[out2in_mid - in_w - 1] +
            w01 * input_const[out2in_mid - in_w] +
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1447,8 +1449,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
            w12 * input_const[out2in_mid + 1] +
            w22 * input_const[out2in_mid + in_w + 1]);
        if (if_bias) {
-          output_data_tmp[i * out_l] += bias_data[j];
-          output_data_tmp[i * out_l + out_l - 1] += bias_data[j];
+          output_data_tmp[i * out_w] += bias_data[j];
+          output_data_tmp[i * out_w + out_w - 1] += bias_data[j];
        }
      }
      filter_data_tmp += 9;
@@ -1655,11 +1657,12 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
   const int in_w = static_cast<int>(input->dims()[3]);
   const int out_h = static_cast<int>(output->dims()[2]);
   const int out_w = static_cast<int>(output->dims()[3]);
-  const int out_l = out_h;
-  const int in_l = in_h;
+  // const int out_l = out_h;
+  // const int in_l = in_h;
   const int inhxw = in_h * in_w;
   const int outhxw = out_h * out_w;
-  const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
+  /// todo : fix if_pad when w != h
+  const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
   const int batch_size = static_cast<int>(input->dims()[0]);
   const int c = static_cast<int>(input->dims()[1]);
   const int w_times = (out_w - 2) / 3;
@@ -1773,9 +1776,9 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
             vst1q_lane_f32(output_row_ptr + 1, res3, 1);
             vst1q_lane_f32(output_row_ptr + 2, res3, 2);
           } else {
-            if (out_l - 2 - w_times * 3 == 1) {
+            if (out_w - 2 - w_times * 3 == 1) {
               vst1q_lane_f32(output_row_ptr, res3, 0);
-            } else if (out_l - 2 - w_times * 3 == 2) {
+            } else if (out_w - 2 - w_times * 3 == 2) {
               vst1q_lane_f32(output_row_ptr, res3, 0);
               vst1q_lane_f32(output_row_ptr + 1, res3, 1);
             }
@@ -1785,28 +1788,28 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
       }
 
       output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
-                           input_const[in_l] * w21 +
-                           input_const[in_l + 1] * w22;
+                           input_const[in_w] * w21 +
+                           input_const[in_w + 1] * w22;
 
-      out2in_mid = (out_l - 1) * 2;
-      output_data_tmp[out_l - 1] =
+      out2in_mid = (out_h - 1) * 2;
+      output_data_tmp[out_w - 1] =
           w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
           w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
-      out2in_mid = (out_l - 1) * 2 * in_w;
+      out2in_mid = (out_h - 1) * 2 * in_w;
 
-      output_data_tmp[out_l * (out_l - 1)] =
+      output_data_tmp[out_w * (out_h - 1)] =
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
                          w22 * input_const[out2in_mid + in_w + 1]);
 
-      out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2;
+      out2in_mid = (out_h - 1) * 2 * in_w + (out_h - 1) * 2;
 
-      output_data_tmp[out_l * out_l - 1] =
+      output_data_tmp[out_h * out_w - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1817,38 +1820,38 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                          w22 * input_const[out2in_mid + in_w + 1]);
       output_data_tmp[0] =
          output_data_tmp[0] * newscale_data[j] + newbias_data[j];
-      output_data_tmp[out_l - 1] =
-          output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j];
-      output_data_tmp[out_l * (out_l - 1)] =
-          output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] +
+      output_data_tmp[out_w - 1] =
+          output_data_tmp[out_w - 1] * newscale_data[j] + newbias_data[j];
+      output_data_tmp[out_w * (out_h - 1)] =
+          output_data_tmp[out_w * (out_h - 1)] * newscale_data[j] +
           newbias_data[j];
-      output_data_tmp[out_l * out_l - 1] =
-          output_data_tmp[out_l * out_l - 1] * newscale_data[j] +
+      output_data_tmp[out_h * out_w - 1] =
+          output_data_tmp[out_h * out_w - 1] * newscale_data[j] +
           newbias_data[j];
       if (if_relu) {
         output_data_tmp[0] = output_data_tmp[0] < 0 ? 0 : output_data_tmp[0];
-        output_data_tmp[out_l - 1] =
-            output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l - 1];
-        output_data_tmp[out_l * (out_l - 1)] =
-            output_data_tmp[out_l * (out_l - 1)] < 0
+        output_data_tmp[out_w - 1] =
+            output_data_tmp[out_w - 1] < 0 ? 0 : output_data_tmp[out_w - 1];
+        output_data_tmp[out_w * (out_h - 1)] =
+            output_data_tmp[out_w * (out_h - 1)] < 0
                 ? 0
-                : output_data_tmp[out_l * (out_l - 1)];
-        output_data_tmp[out_l * out_l - 1] =
-            output_data_tmp[out_l * out_l - 1] < 0
+                : output_data_tmp[out_w * (out_h - 1)];
+        output_data_tmp[out_h * out_w - 1] =
+            output_data_tmp[out_h * out_w - 1] < 0
                 ? 0
-                : output_data_tmp[out_l * out_l - 1];
+                : output_data_tmp[out_h * out_w - 1];
       }
 
      for (int i = 1; i < out_h - 1; i++) {
        out2in_mid = i * 2 * in_w;
-        output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] +
+        output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] +
                                      w02 * input_const[out2in_mid - in_w + 1] +
                                      w11 * input_const[out2in_mid] +
                                      w12 * input_const[out2in_mid + 1] +
                                      w21 * input_const[out2in_mid + in_w] +
                                      w22 * input_const[out2in_mid + in_w + 1];
-        out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
-        output_data_tmp[i * out_l + out_l - 1] =
+        out2in_mid = i * 2 * in_w + (out_h - 1) * 2;
+        output_data_tmp[i * out_w + out_w - 1] =
            w00 * input_const[out2in_mid - in_w - 1] +
            w01 * input_const[out2in_mid - in_w] +
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1857,18 +1860,18 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
            (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] +
                            w12 * input_const[out2in_mid + 1] +
                            w22 * input_const[out2in_mid + in_w + 1]);
-        output_data_tmp[i * out_l] =
-            output_data_tmp[i * out_l] * newscale_data[j] + newbias_data[j];
-        output_data_tmp[i * out_l + out_l - 1] =
-            output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] +
+        output_data_tmp[i * out_w] =
+            output_data_tmp[i * out_w] * newscale_data[j] + newbias_data[j];
+        output_data_tmp[i * out_w + out_w - 1] =
+            output_data_tmp[i * out_w + out_w - 1] * newscale_data[j] +
             newbias_data[j];
        if (if_relu) {
-          output_data_tmp[i * out_l] =
-              output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i * out_l];
-          output_data_tmp[i * out_l + out_l - 1] =
-              output_data_tmp[i * out_l + out_l - 1] < 0
+          output_data_tmp[i * out_w] =
+              output_data_tmp[i * out_w] < 0 ? 0 : output_data_tmp[i * out_w];
+          output_data_tmp[i * out_w + out_w - 1] =
+              output_data_tmp[i * out_w + out_w - 1] < 0
                   ? 0
-                  : output_data_tmp[i * out_l + out_l - 1];
+                  : output_data_tmp[i * out_w + out_w - 1];
        }
      }
    }
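Reviewer note (illustrative sketch only, not part of the patch; the helper name and signature below are hypothetical): a naive scalar 3x3 depthwise convolution with stride 1 and zero padding 1 over a single h x w channel plane, which can be used to cross-check the non-square border indexing introduced above. For example, the bottom-right corner output_data[h * w - 1] should combine input_data[h * w - w - 2], input_data[h * w - w - 1], input_data[h * w - 2] and input_data[h * w - 1], exactly as the patched lines compute.

#include <vector>

// k points to the 9 filter weights in row-major order (w00 .. w22).
std::vector<float> NaiveDepthwise3x3S1P1(const std::vector<float> &in,
                                         const float *k, int h, int w) {
  std::vector<float> out(h * w, 0.f);
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      float acc = 0.f;
      for (int di = -1; di <= 1; ++di) {
        for (int dj = -1; dj <= 1; ++dj) {
          const int y = i + di;
          const int x = j + dj;
          if (y < 0 || y >= h || x < 0 || x >= w) continue;  // zero padding
          acc += k[(di + 1) * 3 + (dj + 1)] * in[y * w + x];
        }
      }
      // For i = h - 1, j = w - 1 this reads in[h * w - w - 2] (w00),
      // in[h * w - w - 1] (w01), in[h * w - 2] (w10) and in[h * w - 1] (w11).
      out[i * w + j] = acc;
    }
  }
  return out;
}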