diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index bf88bdd3b0cbace1161477aec29a3244c01dd673..2bca4b25a12534e1323bd48f8bbfdf5d41a2c2b5 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -166,7 +166,8 @@ void ConvCompute(const ConvParam &param) {
              param.Strides()[0] == param.Strides()[1] &&
              param.Dilations()[0] == param.Dilations()[1] &&
              param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-             param.Dilations()[0] == 1 && param.Input()->dims()[1] >= 16) {
+             param.Dilations()[0] == 1 && param.Output()->dims()[1] >= 16 &&
+             param.Output()->dims()[2] >= 16) {
     BatchConv3x3Winograd(param);
   } else {
     ConvBasic(param);
diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp
index ecbf9de206243b1e273617addfc5ab5acb40c4a6..8d460688bbedf3d2a4e5dadaa5eebb1ca709cf05 100644
--- a/src/operators/math/im2col.cpp
+++ b/src/operators/math/im2col.cpp
@@ -41,48 +41,48 @@ void ExtractToImg(const float *im_data, float *col_data, const int im_height,
   im_data += start_height * im_width + start_width;
   col_data += col_start_height * col_width + col_start_width;
-  #pragma omp parallel for
+
   for (int i = start_height; i < end_height; i += stride_h) {
-    const float *local_im_data = im_data + i * im_width * stride_h;
-    float *local_col_data = col_data + col_width;
     if (stride_w == 1) {
-      memcpy(local_col_data, local_im_data, extract * sizeof(float));
+      memcpy(col_data, im_data, extract * sizeof(float));
     } else if (stride_w == 2) {
       int s = 0;
 #if __ARM_NEON
-      for (; s < extract - 15; s += 16) {
-        float32x4x2_t img = vld2q_f32(local_im_data + s * 2);
-        vst1q_f32(local_col_data + s, img.val[0]);
+      for (; s < extract - 3; s += 4) {
+        float32x4x2_t img = vld2q_f32(im_data + s * 2);
+        vst1q_f32(col_data + s, img.val[0]);
       }
 #endif
       for (; s < extract; ++s) {
-        local_col_data[s] = local_im_data[s * 2];
+        col_data[s] = im_data[s * 2];
       }
     } else if (stride_w == 3) {
       int s = 0;
 #if __ARM_NEON
-      for (; s < extract - 15; s += 16) {
-        float32x4x3_t img = vld3q_f32(local_im_data + s * 3);
-        vst1q_f32(local_col_data + s, img.val[0]);
+      for (; s < extract - 3; s += 4) {
+        float32x4x3_t img = vld3q_f32(im_data + s * 3);
+        vst1q_f32(col_data + s, img.val[0]);
       }
 #endif
       for (; s < extract; ++s) {
-        local_col_data[s] = local_im_data[s * 3];
+        col_data[s] = im_data[s * 3];
       }
     } else if (stride_w == 4) {
       int s = 0;
 #if __ARM_NEON
-      for (; s < extract - 15; s += 16) {
-        float32x4x4_t img = vld4q_f32(local_im_data + s * 4);
-        vst1q_f32(local_col_data + s, img.val[0]);
+      for (; s < extract - 3; s += 4) {
+        float32x4x4_t img = vld4q_f32(im_data + s * 4);
+        vst1q_f32(col_data + s, img.val[0]);
       }
 #endif
       for (; s < extract; ++s) {
-        local_col_data[s] = local_im_data[s * 4];
+        col_data[s] = im_data[s * 4];
      }
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1, 2, 3 and 4.");
    }
+    im_data += im_width * stride_h;
+    col_data += col_width;
  }
}
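The ExtractToImg hunk fixes two problems at once: the per-row #pragma omp parallel for is dropped (parallelism moves up to the per-channel loops in the hunks below) along with the mis-indexed local_im_data/local_col_data arithmetic, and the NEON loops now step by 4 instead of 16. The old loops advanced s by 16 while each vld2q/vld3q/vld4q iteration de-interleaved and stored only 4 floats, so 12 of every 16 outputs were left unwritten. A minimal standalone sketch of the stride-2 case (hypothetical helper name; same load/store pattern as the patched code):

#if __ARM_NEON
#include <arm_neon.h>
#endif

// vld2q_f32 loads 8 consecutive floats and de-interleaves them into two
// 4-lane vectors, so img.val[0] holds elements 0, 2, 4, 6 of the window --
// exactly the stride-2 samples. Four outputs per iteration, hence s += 4;
// the scalar tail covers the last extract % 4 elements.
void extract_row_stride2(const float *im_row, float *col_row, int extract) {
  int s = 0;
#if __ARM_NEON
  for (; s < extract - 3; s += 4) {
    float32x4x2_t img = vld2q_f32(im_row + s * 2);
    vst1q_f32(col_row + s, img.val[0]);
  }
#endif
  for (; s < extract; ++s) {
    col_row[s] = im_row[s * 2];
  }
}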
@@ -428,18 +428,23 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, float>::operator()(
         im_data += isize * isize;
       }
     } else if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) {
+      int im_spatial_size = im_height * im_width;
+      int col_spatial_size = col_height * col_width;
       // pad 0
       memset(col_data, 0, col->numel() * sizeof(float));
+      #pragma omp parallel for
       for (int ic = 0; ic < im_channels; ++ic) {
+        const float *local_im_data = im_data + ic * im_spatial_size;
+        float *local_col_data =
+            col_data + ic * filter_height * filter_width * col_spatial_size;
         for (int kh = 0; kh < filter_height; ++kh) {
           for (int kw = 0; kw < filter_width; ++kw) {
-            ExtractToImg(im_data, col_data, im_height, im_width, col_height,
-                         col_width, padding[0], padding[1], stride[0], stride[1],
-                         kh, kw);
-            col_data += col_height * col_width;
+            ExtractToImg(local_im_data, local_col_data, im_height, im_width,
+                         col_height, col_width, padding[0], padding[1], stride[0],
+                         stride[1], kh, kw);
+            local_col_data += col_spatial_size;
           }
         }
-        im_data += im_height * im_width;
       }
     } else {
 #endif
@@ -553,18 +558,23 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, int8_t>::operator()(
   int8_t *col_data = col->mutable_data<int8_t>();
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) {
+    int im_spatial_size = im_height * im_width;
+    int col_spatial_size = col_height * col_width;
     // pad 0
     memset(col_data, 0, col->numel() * sizeof(int8_t));
+    #pragma omp parallel for
     for (int ic = 0; ic < im_channels; ++ic) {
+      const int8_t *local_im_data = im_data + ic * im_spatial_size;
+      int8_t *local_col_data =
+          col_data + ic * filter_height * filter_width * col_spatial_size;
       for (int kh = 0; kh < filter_height; ++kh) {
         for (int kw = 0; kw < filter_width; ++kw) {
-          ExtractToImg(im_data, col_data, im_height, im_width, col_height,
-                       col_width, padding[0], padding[1], stride[0], stride[1],
-                       kh, kw);
-          col_data += col_height * col_width;
+          ExtractToImg(local_im_data, local_col_data, im_height, im_width,
+                       col_height, col_width, padding[0], padding[1], stride[0],
+                       stride[1], kh, kw);
+          local_col_data += col_spatial_size;
        }
      }
-      im_data += im_height * im_width;
    }
  } else {
 #endif
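Both Im2ColFunctor hunks apply the same restructuring: pad once with memset, then parallelize over input channels, deriving local_im_data and local_col_data from the loop index ic instead of bumping shared pointers. Each channel writes a disjoint slice of col (filter_height * filter_width * col_height * col_width elements), so the iterations are independent and the pragma is race-free. A reduced sketch of the pattern (hypothetical names; the per-channel work is a placeholder that assumes col_spatial <= im_spatial):

#include <cstring>

// The pragma sits on the channel loop and all state is derived from ic, so
// OpenMP may run channels in any order on any thread. fill_block stands in
// for the kh/kw ExtractToImg loops of the real code.
static void fill_block(const float *im, float *col, int col_spatial) {
  std::memcpy(col, im, col_spatial * sizeof(float));  // placeholder work
}

void im2col_by_channel(const float *im, float *col, int channels,
                       int im_spatial, int col_spatial, int kernel_size) {
  #pragma omp parallel for
  for (int ic = 0; ic < channels; ++ic) {
    const float *local_im = im + ic * im_spatial;  // read-only input slice
    float *local_col = col + ic * kernel_size * col_spatial;  // owned slice
    for (int k = 0; k < kernel_size; ++k) {
      fill_block(local_im, local_col + k * col_spatial, col_spatial);
    }
  }
}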
diff --git a/src/operators/math/winograd/winograd.cpp b/src/operators/math/winograd/winograd.cpp
index 06f241dd57e6c1393e77b9328e47d47e69da862f..c8640e3b3c1c52092c0cab703177ceb8519292ce 100644
--- a/src/operators/math/winograd/winograd.cpp
+++ b/src/operators/math/winograd/winograd.cpp
@@ -30,6 +30,9 @@ void winograd_f6k3(const framework::Tensor &input,
                    const framework::Tensor &weight, framework::Tensor *output) {
   framework::Tensor transformed_input;
   framework::Tensor transformed_weight;
+#if __aarch64__
+  // TODO(hjchen2)
+#else
   // transform weight
   winograd_transform_weight<8, 3>(weight, &transformed_weight);
   // tile input and transform
@@ -37,6 +40,7 @@ void winograd_f6k3(const framework::Tensor &input,
   // caculate output
   winograd_transform_output<8, 3>(transformed_input, transformed_weight,
                                   output);
+#endif
 }
 
 // F(4X4, 5X5)
diff --git a/src/operators/math/winograd/winograd_transform_f6k3.cpp b/src/operators/math/winograd/winograd_transform_f6k3.cpp
index e2ea8b03d51e76a740775f5885ef1a299159c724..8879a0e4e934e17391601631349542d3710edea1 100644
--- a/src/operators/math/winograd/winograd_transform_f6k3.cpp
+++ b/src/operators/math/winograd/winograd_transform_f6k3.cpp
@@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-// Inspired by https://arxiv.org/abs/1509.09308 and
-// https://github.com/andravin/wincnn and refered from nnpack and ncnn project
+// Inspired by https://arxiv.org/abs/1509.09308 and referred from nnpack and
+// ncnn project.
+
+#ifdef CONV_OP
+
+#ifndef __aarch64__
 
 #include "operators/math/pad.h"
 #include "operators/math/winograd/winograd_transform.h"
@@ -47,12 +51,13 @@ void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
   const float transform_matrix[8] = {2.f, -2.f / 9, 1.f / 90, 1.f / 180};
   const float *inptr = weight.data<float>();
   int remain_start = out_channel & 0xFFFC;
-#ifdef __aarch64__
+#if 0
   remain_start = 0;
 #else
+  #pragma omp parallel for
   for (int oc = 0; oc < out_channel - 3; oc += 4) {
-    float gw[96];                                        // gw[3][8][4]
-    const float *inptr0 = inptr + oc * in_channel * 9;   //
+    float gw[96];  // gw[3][8][4]
+    const float *inptr0 = inptr + oc * in_channel * 9;
     const float *inptr1 = inptr + (oc + 1) * in_channel * 9;
     const float *inptr2 = inptr + (oc + 2) * in_channel * 9;
     const float *inptr3 = inptr + (oc + 3) * in_channel * 9;
@@ -252,9 +257,10 @@ void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
         "q13", "r0");
     }
   }
-#endif  // __aarch64__
+#endif
 
   // remain output channel
+  #pragma omp parallel for
   for (int oc = remain_start; oc < out_channel; ++oc) {
     float gw[3][8];  // gw[3][8]
     const float *inptr0 = inptr + oc * in_channel * 9;  //
@@ -301,10 +307,6 @@ void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
       outptr += 4;
     }
   }
-
-  // for (int i = 0; i < output->numel(); ++i) {
-  //   DLOG << "TransK[" << i << "] = " << trans_outptr[i];
-  // }
 }
 
 template <>
@@ -657,6 +659,7 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
 #endif
 
   // remainer channels
+  #pragma omp parallel for
   for (int c = remain_c_start; c < channel; ++c) {
     const float *in = inptr + c * image_size;
     float d_bt[64];  // d * B_t
@@ -867,15 +870,6 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
       }
     }
   }
-
-  // for (int c = 0; c < channel; ++c) {
-  //   for (int tile = 0; tile < output->numel()/channel/64; ++tile) {
-  //     for (int i = 0; i < 64; ++i) {
-  //       int offset = (((tile / 8) * 64 + i) * channel + c) * 8 + (tile % 8);
-  //       DLOG << "TransInput[" << i << "] = " << outptr[offset];
-  //     }
-  //   }
-  // }
 }
 
 template <>
@@ -897,6 +891,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
   const float *input_ptr = input.data<float>();
   const float *weight_ptr = weight.data<float>();
 
+  #pragma omp parallel for
   for (int i = 0; i < out_channel; ++i) {
     float *uv_ptr = uv_trans_ptr + (i * tiles * 64 * 32);
     for (int k = 0; k < 64; ++k) {
@@ -1017,15 +1012,6 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
     }
   }
 
-  // for (int c = 0; c < 4 * out_channel; ++c) {
-  //   for (int tile = 0; tile < 8 * tiles; ++tile) {
-  //     for (int i = 0; i < 64; ++i) {
-  //       int offset = (c * 8 * tiles + tile) * 64 + i;
-  //       DLOG << "uv_trans[" << i << "] = " << uv_trans_ptr[offset];
-  //     }
-  //   }
-  // }
-
   /*
    * s0 = m0 + (m1 + m2) + (m3 + m4) + 32 * (m5 + m6)
    * s1 = (m1 - m2) + 2 * (m3 - m4) + 16 * (m5 - m6)
@@ -1045,12 +1031,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
   int uv_image_size = uv_trans.dims()[1] * 64;
   float transform_matrix[8] = {2.f, 4.f, 8.f, 16.f};
 
-  // DLOG << "out_channel: " << out_channel;
-  // DLOG << "h_tiles: " << h_tiles;
-  // DLOG << "w_tiles: " << w_tiles;
-  // DLOG << "remain_h: " << remain_h;
-  // DLOG << "remain_w: " << remain_w;
-
+  #pragma omp parallel for
   for (int oc = 0; oc < out_channel; ++oc) {
     float at_m[48];        // [6][8]
     float output_tmp[36];  // [6][6], temporarily restore results
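The pragma added here is race-free for the same reason as the ones above: at_m and output_tmp are declared inside the loop body, so every thread gets private copies, and each oc writes a distinct slice of the output tensor. A toy version of that shape (hypothetical function; the arithmetic is a placeholder, not the real A_t * M * A transform):

// Scratch arrays declared inside an omp-parallel loop body are automatically
// private to each thread, so no private() clause or locking is needed;
// out[oc] is the only shared write and each oc belongs to one iteration.
void output_transform_sketch(float *out, const float *uv, int out_channel) {
  #pragma omp parallel for
  for (int oc = 0; oc < out_channel; ++oc) {
    float at_m[48];  // thread-private, like the diff's at_m
    for (int i = 0; i < 48; ++i) {
      at_m[i] = 2.f * uv[oc * 48 + i];  // placeholder for A_t * M
    }
    float acc = 0.f;
    for (int i = 0; i < 48; ++i) {
      acc += at_m[i];
    }
    out[oc] = acc;  // disjoint per-channel write
  }
}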
"cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); - // for (int i = 0; i < 48; ++i) { - // DLOG << "at_m[" << i << "] = " << at_m[i]; - // } float *at_m_ptr0 = at_m; float *at_m_ptr1 = at_m + 24; @@ -1252,9 +1230,6 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, float *out_ptr = output_ptr + offset; int remain_row = (tile_h < h_tiles - 1) ? 6 : remain_h; int remain_col = (tile_w < w_tiles - 1) ? 6 : remain_w; - // for (int i = 0; i < 36; ++i) { - // DLOG << "output_tmp[" << i << "] = " << output_tmp[i]; - // } for (int i = 0; i < remain_row; ++i, out_ptr += out_w) { memcpy(out_ptr, output_tmp + i * 6, remain_col * sizeof(float)); } @@ -1391,3 +1366,6 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, } // namespace math } // namespace operators } // namespace paddle_mobile + +#endif // __aarch64__ +#endif // CONV_OP