diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index 8e108c150fc1b403c09140fdbc8ead5b30cd7981..42a98bc9442b2a619cf5882783bb63f5c4ea7db4 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -142,7 +142,7 @@ static bool conv_trans_weights_numc(const dtype* din, template void transpose(const Dtype* din, Dtype* dout, int m, int n) { // nxm == mxn - // 4x4 分块处理 + // 4x4 int cnt_n = n >> 2; int remain_n = n & 3; int cnt_m = m >> 2; @@ -160,7 +160,6 @@ void transpose(const Dtype* din, Dtype* dout, int m, int n) { float32x4_t din1 = vld1q_f32(din_ptr1); float32x4_t din2 = vld1q_f32(din_ptr2); float32x4_t din3 = vld1q_f32(din_ptr3); - dout_ptr0 += nn_num; Dtype* dout_ptr1 = dout_ptr0 + n; Dtype* dout_ptr2 = dout_ptr1 + n; Dtype* dout_ptr3 = dout_ptr2 + n; @@ -171,17 +170,27 @@ void transpose(const Dtype* din, Dtype* dout, int m, int n) { din_ptr0 += 4; din_ptr1 += 4; // a00 b00 c00 d00 a02 b02 c02 d02 - float32x4x2_t tmp00 = vtrnq_f32(tmp0.val[0], tmp2.val[0]); // a01 b01 c01 d01 a03 b03 c03 d03 - float32x4x2_t tmp02 = vtrnq_f32(tmp0.val[1], tmp2.val[1]); + float tmp_val1 = tmp0.val[0][2]; + float tmp_val2 = tmp0.val[0][3]; + tmp0.val[0][2] = tmp2.val[0][0]; + tmp0.val[0][3] = tmp2.val[0][1]; + float tmp_val3 = tmp0.val[1][2]; + float tmp_val4 = tmp0.val[1][3]; + tmp2.val[0][0] = tmp_val1; + tmp2.val[0][1] = tmp_val2; + tmp0.val[1][2] = tmp2.val[1][0]; + tmp0.val[1][3] = tmp2.val[1][1]; + tmp2.val[1][0] = tmp_val3; + tmp2.val[1][1] = tmp_val4; din_ptr2 += 4; din_ptr3 += 4; - vst1q_f32(dout_ptr0, tmp00.val[0]); - vst1q_f32(dout_ptr1, tmp02.val[0]); - vst1q_f32(dout_ptr2, tmp00.val[1]); - vst1q_f32(dout_ptr3, tmp02.val[1]); + vst1q_f32(dout_ptr0, tmp0.val[0]); + vst1q_f32(dout_ptr1, tmp0.val[1]); + dout_ptr0 += nn_num; + vst1q_f32(dout_ptr2, tmp2.val[0]); + vst1q_f32(dout_ptr3, tmp2.val[1]); } - dout_ptr0 += nn_num; for (int y = 0; y < remain_m; y++) { *dout_ptr0++ = *din_ptr0++; *dout_ptr0++ = *din_ptr1++; @@ -190,21 +199,21 @@ void transpose(const Dtype* din, Dtype* dout, int m, int n) { } } const Dtype* din_ptr0 = din + cnt_n * mm_num; + dout = dout + cnt_n * 4; for (int x = 0; x < remain_n; x++) { Dtype* dout_ptr0 = dout + x * 4; for (int y = 0; y < cnt_m; y++) { float32x4_t din0 = vld1q_f32(din_ptr0); - dout_ptr0 += nn_num; Dtype* dout_ptr1 = dout_ptr0 + n; Dtype* dout_ptr2 = dout_ptr1 + n; Dtype* dout_ptr3 = dout_ptr2 + n; din_ptr0 += 4; *dout_ptr0 = din0[0]; *dout_ptr1 = din0[1]; + dout_ptr0 += nn_num; *dout_ptr2 = din0[2]; *dout_ptr3 = din0[3]; } - dout_ptr0 += nn_num; for (int y = 0; y < remain_m; y++) { *dout_ptr0++ = *din_ptr0++; } diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 3e6cbff0660be8f2542d059a39115bed52122ff1..8303851ece9dd2f1d053f9f4b888e42f2fdc0aad 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din, } else { if (pad_bottom > 1) { coef_h = 1.f / 3; - } else if (pad_bottom = 1) { + } else if (pad_bottom == 1) { coef_h = 0.5f; } else { coef_h = 1.f; diff --git a/lite/backends/arm/math/sequence_pool.cc b/lite/backends/arm/math/sequence_pool.cc index 9dd847b95dbc4ccd75c4b88f2233bc535b6e5b5c..ded76c1bdae354ca46a254309dcc6b3e216c92f4 100644 --- a/lite/backends/arm/math/sequence_pool.cc +++ b/lite/backends/arm/math/sequence_pool.cc @@ -46,15 +46,6 @@ void seq_pool_sum(const float* din, memcpy(dout_ptr, din_ptr, width * sizeof(float)); din_ptr += width; height = height - 1; -#if 0 ->>>>>>> 0e9dfda066fee168c3b72065c47e79fe705dd720 - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; - } - din_ptr += width; - } -#else int cnt_w = width >> 2; int remain_w = width & 3; int cnt_h = height >> 2; @@ -110,7 +101,6 @@ void seq_pool_sum(const float* din, } dout_ptr++; } -#endif } } } @@ -204,14 +194,6 @@ void seq_pool_max(const float* din, memcpy(dout_ptr, din_ptr, width * sizeof(float)); din_ptr += width; height = height - 1; -#if 0 - for (int h = 0; h < rheight; h++) { - for (int w = 0; w < width; w++) { - dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]); - } - din_ptr += width; - } -#else int cnt_w = width >> 2; int remain_w = width & 3; int cnt_h = height >> 2; @@ -268,7 +250,6 @@ void seq_pool_max(const float* din, } dout_ptr++; } -#endif } } } diff --git a/lite/kernels/arm/sequence_conv_compute.cc b/lite/kernels/arm/sequence_conv_compute.cc index 04ed25d88090cdbbc168ba9b2e6a05b278e4af0f..d4685b2d3f0afb6980b46cf5b6fa8ad64c8df324 100644 --- a/lite/kernels/arm/sequence_conv_compute.cc +++ b/lite/kernels/arm/sequence_conv_compute.cc @@ -103,17 +103,10 @@ void SequenceConvCompute::Run() { 1, 1, // stride_h, stride_w, dilation_h, dilation_w tmp_data); -#if 0 local_naive_transpose(tmp_data, sub_col_data, kernel_size * hidden_dim, input_row_end - input_row_begin); -#else - paddle::lite::arm::math::transpose(tmp_data, - sub_col_data, - kernel_size * hidden_dim, - input_row_end - input_row_begin); -#endif } }