From 080def5a22bf850b1fd368e9d03f547d2b87b7a4 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Tue, 13 Nov 2018 17:10:12 +0800 Subject: [PATCH] Refine compiler and winograd conv implementation --- src/operators/kernel/arm/conv_kernel.cpp | 2 - .../math/winograd/winograd_transform_f6k3.cpp | 208 +++++++++--------- test/operators/test_conv_op.cpp | 24 +- tools/build.sh | 4 +- 4 files changed, 110 insertions(+), 128 deletions(-) diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index 88e2602661..be518d3a2c 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -17,8 +17,6 @@ limitations under the License. */ #include "operators/kernel/conv_kernel.h" #include "operators/kernel/central-arm-func/conv_arm_func.h" -#include - namespace paddle_mobile { namespace operators { diff --git a/src/operators/math/winograd/winograd_transform_f6k3.cpp b/src/operators/math/winograd/winograd_transform_f6k3.cpp index e2a6d4558b..d2fe88a9f2 100644 --- a/src/operators/math/winograd/winograd_transform_f6k3.cpp +++ b/src/operators/math/winograd/winograd_transform_f6k3.cpp @@ -885,7 +885,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, // compute U*V first framework::Tensor uv_trans; framework::DDim shape = - framework::make_ddim(std::vector{4 * out_channel, 8 * tiles, 64}); + framework::make_ddim(std::vector{out_channel, tiles, 64, 32}); float *uv_trans_ptr = uv_trans.mutable_data(shape); memset(uv_trans_ptr, 0, uv_trans.numel() * sizeof(float)); const float *input_ptr = input.data(); @@ -894,17 +894,12 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, #pragma omp parallel for for (int i = 0; i < out_channel; ++i) { float *uv_ptr = uv_trans_ptr + (i * tiles * 64 * 32); - for (int k = 0; k < 64; ++k) { - for (int j = 0; j < tiles; ++j) { + for (int j = 0; j < tiles; ++j) { + for (int k = 0; k < 64; ++k) { const float *w_ptr = weight_ptr + (i * 64 + k) * in_channel * 4; const float *in_ptr = input_ptr + (j * 64 + k) * in_channel * 8; - float *out0 = uv_ptr + (8 * j) * 64 + k; // out channel 0 - float *out1 = out0 + 8 * tiles * 64; // out channel 1 - float *out2 = out1 + 8 * tiles * 64; // out channel 2 - float *out3 = out2 + 8 * tiles * 64; // out channel 3 int inter_channel = in_channel >> 1; int remain_channel = in_channel & 0x1; - int steps = 64 * sizeof(float); asm volatile( "veor q8, q8, q8 \n" "veor q9, q9, q9 \n" @@ -921,6 +916,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, "loop_2c_%=: \n" "vld1.32 {d0-d3}, [%[w_ptr]]! \n" "vld1.32 {d4-d7}, [%[in_ptr]]! \n" + "vld1.32 {d8-d11}, [%[in_ptr]]! \n" "vmla.f32 q8, q2, d0[0] \n" "vmla.f32 q9, q3, d0[0] \n" "vmla.f32 q10, q2, d0[1] \n" @@ -930,7 +926,6 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, "vmla.f32 q14, q2, d1[1] \n" "vmla.f32 q15, q3, d1[1] \n" - "vld1.32 {d8-d11}, [%[in_ptr]]! \n" "vmla.f32 q8, q4, d2[0] \n" "vmla.f32 q9, q5, d2[0] \n" "vmla.f32 q10, q4, d2[1] \n" @@ -966,46 +961,14 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, "bne loop_c_%= \n" "store_res_%=: \n" - "vst1.32 {d16[0]}, [%[out0]], %[steps] \n" - "vst1.32 {d16[1]}, [%[out0]], %[steps] \n" - "vst1.32 {d17[0]}, [%[out0]], %[steps] \n" - "vst1.32 {d17[1]}, [%[out0]], %[steps] \n" - "vst1.32 {d18[0]}, [%[out0]], %[steps] \n" - "vst1.32 {d18[1]}, [%[out0]], %[steps] \n" - "vst1.32 {d19[0]}, [%[out0]], %[steps] \n" - "vst1.32 {d19[1]}, [%[out0]], %[steps] \n" - - "vst1.32 {d20[0]}, [%[out1]], %[steps] \n" - "vst1.32 {d20[1]}, [%[out1]], %[steps] \n" - "vst1.32 {d21[0]}, [%[out1]], %[steps] \n" - "vst1.32 {d21[1]}, [%[out1]], %[steps] \n" - "vst1.32 {d22[0]}, [%[out1]], %[steps] \n" - "vst1.32 {d22[1]}, [%[out1]], %[steps] \n" - "vst1.32 {d23[0]}, [%[out1]], %[steps] \n" - "vst1.32 {d23[1]}, [%[out1]], %[steps] \n" - - "vst1.32 {d24[0]}, [%[out2]], %[steps] \n" - "vst1.32 {d24[1]}, [%[out2]], %[steps] \n" - "vst1.32 {d25[0]}, [%[out2]], %[steps] \n" - "vst1.32 {d25[1]}, [%[out2]], %[steps] \n" - "vst1.32 {d26[0]}, [%[out2]], %[steps] \n" - "vst1.32 {d26[1]}, [%[out2]], %[steps] \n" - "vst1.32 {d27[0]}, [%[out2]], %[steps] \n" - "vst1.32 {d27[1]}, [%[out2]], %[steps] \n" - - "vst1.32 {d28[0]}, [%[out3]], %[steps] \n" - "vst1.32 {d28[1]}, [%[out3]], %[steps] \n" - "vst1.32 {d29[0]}, [%[out3]], %[steps] \n" - "vst1.32 {d29[1]}, [%[out3]], %[steps] \n" - "vst1.32 {d30[0]}, [%[out3]], %[steps] \n" - "vst1.32 {d30[1]}, [%[out3]], %[steps] \n" - "vst1.32 {d31[0]}, [%[out3]], %[steps] \n" - "vst1.32 {d31[1]}, [%[out3]], %[steps] \n" - : [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [out0] "+r"(out0), - [out1] "+r"(out1), [out2] "+r"(out2), [out3] "+r"(out3), + "vst1.32 {d16-d19}, [%[uv_ptr]]! \n" + "vst1.32 {d20-d23}, [%[uv_ptr]]! \n" + "vst1.32 {d24-d27}, [%[uv_ptr]]! \n" + "vst1.32 {d28-d31}, [%[uv_ptr]]! \n" + : [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr), [remain_channel] "+r"(remain_channel), [inter_channel] "+r"(inter_channel) - : [steps] "r"(steps) + : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } @@ -1027,76 +990,105 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, int remain_h = out_h - out_h / 6 * 6; int remain_w = out_w - out_w / 6 * 6; float *output_ptr = output->mutable_data(); - out_channel = output->dims()[1]; - int uv_image_size = uv_trans.dims()[1] * 64; float transform_matrix[8] = {2.f, 4.f, 8.f, 16.f}; #pragma omp parallel for - for (int oc = 0; oc < out_channel; ++oc) { + for (int oc = 0; oc < output->dims()[1]; ++oc) { float at_m[48]; // [6][8] float output_tmp[36]; // [6][6], temporarily restore results - const float *uv_ptr = uv_trans_ptr + oc * uv_image_size; + // (oc / 4) * tiles * 64 * 32 + (oc & 0x3) * 8 + const float *uv_ptr = + uv_trans_ptr + (oc >> 2) * tiles * 64 * 32 + (oc & 0x3) * 8; for (int tile_h = 0; tile_h < h_tiles; ++tile_h) { for (int tile_w = 0; tile_w < w_tiles; ++tile_w) { float *at_m_ptr = at_m; + int tile_indics = tile_h * w_tiles + tile_w; + int tile_block = tile_indics >> 3; + int block_indics = tile_indics & 0x7; + const float *uv_ptr0 = uv_ptr + tile_block * 64 * 32 + block_indics; + int steps = 32 * sizeof(float); asm volatile( - "vld1.32 {d0-d1}, [%[tm_ptr]] \n" - "mov r0, #2 \n" - "loop_%=: \n" - "vld1.32 {d2-d5}, [%[uv_ptr]]! \n" - "vld1.32 {d6-d9}, [%[uv_ptr]]! \n" - "vld1.32 {d10-d13}, [%[uv_ptr]]! \n" - "vld1.32 {d14-d17}, [%[uv_ptr]]! \n" - "vtrn.32 q1, q3 \n" - "vtrn.32 q2, q4 \n" - "vtrn.32 q5, q7 \n" - "vtrn.32 q6, q8 \n" - "vswp.32 d3, d10 \n" // q1: m0, q5: m2 - "vswp.32 d7, d14 \n" // q3: m1, q7: m3 - "vswp.32 d5, d12 \n" // q2: m4, q6: m6 - "vswp.32 d9, d16 \n" // q4: m5, q8: m7 - - "vadd.f32 q9, q3, q5 \n" // m1 + m2 - "vadd.f32 q10, q7, q2 \n" // m3 + m4 - "vadd.f32 q11, q4, q6 \n" // m5 + m6 - "vsub.f32 q12, q3, q5 \n" // m1 - m2 - "vsub.f32 q13, q7, q2 \n" // m3 - m4 - "vsub.f32 q14, q4, q6 \n" // m5 - m6 - "vmul.f32 q2, q13, d0[0] \n" // 2 * (m3 - m4) - "vmul.f32 q3, q11, d0[0] \n" // 2 * (m5 + m6) - - "vadd.f32 q15, q1, q9 \n" - "vadd.f32 q15, q15, q10 \n" - "vmla.f32 q15, q3, d1[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vadd.f32 q15, q12, q2 \n" - "vmla.f32 q15, q14, d1[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vmov.32 q15, q9 \n" - "vmla.f32 q15, q10, d0[1] \n" - "vmla.f32 q15, q11, d1[0] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vmov.32 q15, q12 \n" - "vmla.f32 q15, q13, d1[0] \n" - "vmla.f32 q15, q14, d0[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vadd.f32 q15, q9, q3 \n" - "vmla.f32 q15, q10, d1[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "vadd.f32 q15, q12, q8 \n" - "vadd.f32 q15, q15, q14 \n" - "vmla.f32 q15, q2, d1[1] \n" - "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" - - "subs r0, #1 \n" - "bne loop_%= \n" - : [uv_ptr] "+r"(uv_ptr), [at_m_ptr] "+r"(at_m_ptr) - : [tm_ptr] "r"((float *)transform_matrix) + "vld1.32 {d0-d1}, [%[tm_ptr]] \n" + "mov r0, #2 \n" + + "loop_%=: \n" + "vld1.32 {d2[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d6[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d10[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d14[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d4[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d8[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d12[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d16[0]}, [%[uv_ptr0]], %[steps] \n" + + "vld1.32 {d2[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d6[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d10[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d14[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d4[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d8[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d12[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d16[1]}, [%[uv_ptr0]], %[steps] \n" + + "vld1.32 {d3[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d7[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d11[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d15[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d5[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d9[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d13[0]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d17[0]}, [%[uv_ptr0]], %[steps] \n" + + "vld1.32 {d3[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d7[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d11[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d15[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d5[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d9[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d13[1]}, [%[uv_ptr0]], %[steps] \n" + "vld1.32 {d17[1]}, [%[uv_ptr0]], %[steps] \n" + + "vadd.f32 q9, q3, q5 \n" // m1 + m2 + "vadd.f32 q10, q7, q2 \n" // m3 + m4 + "vadd.f32 q11, q4, q6 \n" // m5 + m6 + "vsub.f32 q12, q3, q5 \n" // m1 - m2 + "vsub.f32 q13, q7, q2 \n" // m3 - m4 + "vsub.f32 q14, q4, q6 \n" // m5 - m6 + "vmul.f32 q2, q13, d0[0] \n" // 2 * (m3 - m4) + "vmul.f32 q3, q11, d0[0] \n" // 2 * (m5 + m6) + + "vadd.f32 q15, q1, q9 \n" + "vadd.f32 q15, q15, q10 \n" + "vmla.f32 q15, q3, d1[1] \n" + "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" + + "vadd.f32 q15, q12, q2 \n" + "vmla.f32 q15, q14, d1[1] \n" + "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" + + "vmov.32 q15, q9 \n" + "vmla.f32 q15, q10, d0[1] \n" + "vmla.f32 q15, q11, d1[0] \n" + "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" + + "vmov.32 q15, q12 \n" + "vmla.f32 q15, q13, d1[0] \n" + "vmla.f32 q15, q14, d0[1] \n" + "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" + + "vadd.f32 q15, q9, q3 \n" + "vmla.f32 q15, q10, d1[1] \n" + "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" + + "vadd.f32 q15, q12, q8 \n" + "vadd.f32 q15, q15, q14 \n" + "vmla.f32 q15, q2, d1[1] \n" + "vst1.32 {d30-d31}, [%[at_m_ptr]]! \n" + + "subs r0, #1 \n" + "bne loop_%= \n" + : [uv_ptr0] "+r"(uv_ptr0), [at_m_ptr] "+r"(at_m_ptr) + : [tm_ptr] "r"((float *)transform_matrix), [steps] "r"(steps) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); diff --git a/test/operators/test_conv_op.cpp b/test/operators/test_conv_op.cpp index fa5afe01bd..bd0fbdad4d 100644 --- a/test/operators/test_conv_op.cpp +++ b/test/operators/test_conv_op.cpp @@ -204,9 +204,15 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) { Otype *output_cmp_data = output_cmp.data(); for (int i = 0; i < output->numel(); ++i) { float gap = output_data[i] - output_cmp_data[i]; - PADDLE_MOBILE_ENFORCE(std::abs(gap / output_data[i]) < 1e-3, + PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3, "output[%d] = %d, output_cmp[%d] = %d", i, output_data[i], i, output_cmp_data[i]); + // if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) { + // LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i] + // << ", output_cmp_data[" << i << "] = " << + // output_cmp_data[i]; + // return 1; + // } } delete op; return 0; @@ -234,82 +240,66 @@ int main(int argc, char *argv[]) { LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 7, pad = 0, stride = 2 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 7, pad = 1, stride = 2 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 7, pad = 3, stride = 2 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 7, pad = 0, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 7, pad = 1, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 7, pad = 3, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 7, pad = 5, stride = 3 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 7, pad = 3, stride = 4 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 3, pad = 0, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 3, pad = 0, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 3, pad = 1, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 3, pad = 1, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 5, pad = 0, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 5, pad = 0, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 5, pad = 2, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, in_width, out_channels); - // kernel = 5, pad = 2, stride = 1 LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1"; paddle_mobile::TestConvOp(in_channels, in_height, diff --git a/tools/build.sh b/tools/build.sh index c655410571..330bc208ef 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -69,6 +69,7 @@ build_for_android() { -DANDROID_ABI="${ABI}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ + -DANDROID_TOOLCHAIN='clang' \ -DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \ -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ -DANDROID_STL=c++_static \ @@ -82,6 +83,7 @@ build_for_android() { -DANDROID_ABI="${ABI}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ + -DANDROID_TOOLCHAIN='clang' \ -DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \ -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ -DANDROID_STL=c++_static \ @@ -210,4 +212,4 @@ else else build_error "$1" fi -fi \ No newline at end of file +fi -- GitLab