diff --git a/src/operators/kernel/arm/batchnorm_kernel.cpp b/src/operators/kernel/arm/batchnorm_kernel.cpp
index ec3e71fff53e86d6f4737e5eb0acd141a8ae310e..30d922a777b67a55a7d0dfa98a55144bcb569d49 100644
--- a/src/operators/kernel/arm/batchnorm_kernel.cpp
+++ b/src/operators/kernel/arm/batchnorm_kernel.cpp
@@ -45,34 +45,35 @@ void BatchNormKernel::Compute(const BatchNormParam &param) const {
   auto scale_ptr = scale->data();
   auto bias_ptr = bias->data();

-
   //  Tensor inv_std;
   //  auto inv_std_ptr = inv_std.mutable_data(make_ddim({C}));

-  PADDLE_MOBILE_ENFORCE(C == variance->numel(), "C must equal to variance.numel()");
+  PADDLE_MOBILE_ENFORCE(C == variance->numel(),
+                        "C must equal to variance.numel()");

   int HXW = H * W;
   if (HXW > 32) {
     int NXC = N * C;
     float *inv_std_ptr = new float[NXC * 4];
-    float * volatile new_scale_ptr = new float[NXC * 4];
-    float * volatile new_bias_ptr = new float[NXC * 4];
+    float *volatile new_scale_ptr = new float[NXC * 4];
+    float *volatile new_bias_ptr = new float[NXC * 4];

     /// std = (var + epsilon).sqrt();
     /// inv_std = 1 / std;
     for (int i = 0; i < C * 4; i += 4) {
       inv_std_ptr[i] =
-          1 / static_cast<float>(pow((variance_ptr[i/4] + epsilon), 0.5));
+          1 / static_cast<float>(pow((variance_ptr[i / 4] + epsilon), 0.5));
       inv_std_ptr[i + 1] = inv_std_ptr[i];
       inv_std_ptr[i + 2] = inv_std_ptr[i];
       inv_std_ptr[i + 3] = inv_std_ptr[i];

-      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i/4];
+      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i / 4];
       new_scale_ptr[i + 1] = new_scale_ptr[i];
       new_scale_ptr[i + 2] = new_scale_ptr[i];
       new_scale_ptr[i + 3] = new_scale_ptr[i];

-      new_bias_ptr[i] = bias_ptr[i/4] - mean_ptr[i/4] * inv_std_ptr[i] * scale_ptr[i/4];
+      new_bias_ptr[i] =
+          bias_ptr[i / 4] - mean_ptr[i / 4] * inv_std_ptr[i] * scale_ptr[i / 4];

       new_bias_ptr[i + 1] = new_bias_ptr[i];
       new_bias_ptr[i + 2] = new_bias_ptr[i];
@@ -84,116 +85,116 @@ void BatchNormKernel::Compute(const BatchNormParam &param) const {
       new_bias_ptr[j] = new_bias_ptr[j - C * 4];
     }

-
     asm volatile(
-          "subs %[N], %[N], #1                  \n\t"
-          "blt end_n_%=                         \n\t"
-          "loop_n_%=:                           \n\t"
-
-          "subs %[C], %[C], #1                  \n\t"
-          "blt end_c_%=                         \n\t"
-          "loop_c_%=:                           \n\t"
-
-          "vld1.32 {q9}, [%[new_scale_ptr]]!    \n\t"
-          "vld1.32 {q10}, [%[new_bias_ptr]]!    \n\t"
-
-          "mov r6, %[HXW]                       \n\t"
-
-          "subs r6, r6, #32                     \n\t"
-          "blt end_hw_%=                        \n\t"
-          "loop_hw_%=:                          \n\t"
-
-          "vld1.32 {q1, q2}, [%[input_x_ptr]]!  \n\t"
-          "vld1.32 {q3, q4}, [%[input_x_ptr]]!  \n\t"
-          "vld1.32 {q5, q6}, [%[input_x_ptr]]!  \n\t"
-          "vld1.32 {q7, q8}, [%[input_x_ptr]]!  \n\t"
-
-          "vmul.f32 q1, q1, q9                  \n\t"
-          "vmul.f32 q2, q2, q9                  \n\t"
-          "vmul.f32 q3, q3, q9                  \n\t"
-          "vmul.f32 q4, q4, q9                  \n\t"
-
-          "vmul.f32 q5, q5, q9                  \n\t"
-          "vmul.f32 q6, q6, q9                  \n\t"
-          "vmul.f32 q7, q7, q9                  \n\t"
-          "vmul.f32 q8, q8, q9                  \n\t"
-
-          "vadd.f32 q1, q1, q10                 \n\t"
-          "vadd.f32 q2, q2, q10                 \n\t"
-          "vadd.f32 q3, q3, q10                 \n\t"
-          "vadd.f32 q4, q4, q10                 \n\t"
-          "vadd.f32 q5, q5, q10                 \n\t"
-          "vadd.f32 q6, q6, q10                 \n\t"
-          "vadd.f32 q7, q7, q10                 \n\t"
-          "vadd.f32 q8, q8, q10                 \n\t"
-
-          "vst1.32 {q1, q2}, [%[out_ptr]]!      \n\t"
-          "vst1.32 {q3, q4}, [%[out_ptr]]!      \n\t"
-          "vst1.32 {q5, q6}, [%[out_ptr]]!      \n\t"
-          "vst1.32 {q7, q8}, [%[out_ptr]]!      \n\t"
-
-          "subs r6, r6, #32                     \n\t"
-          "bge loop_hw_%=                       \n\t"
-          "end_hw_%=:                           \n\t"
-
-          "cmp r6, #0                           \n\t"
-          "bge end_remainder_%=                 \n\t"
-          "mov r5, #4                           \n\t"
-          "mul r6, r6, r5                       \n\t"
-          "add %[input_x_ptr], %[input_x_ptr], r6 \n\t"
-
-          "vld1.32 {q1, q2}, [%[input_x_ptr]]!  \n\t"
-          "vld1.32 {q3, q4}, [%[input_x_ptr]]!  \n\t"
-          "vld1.32 {q5, q6}, [%[input_x_ptr]]!  \n\t"
-          "vld1.32 {q7, q8}, [%[input_x_ptr]]!  \n\t"
-
-          "vmul.f32 q1, q1, q9                  \n\t"
-          "vmul.f32 q2, q2, q9                  \n\t"
-          "vmul.f32 q3, q3, q9                  \n\t"
-          "vmul.f32 q4, q4, q9                  \n\t"
-          "vmul.f32 q5, q5, q9                  \n\t"
-          "vmul.f32 q6, q6, q9                  \n\t"
-          "vmul.f32 q7, q7, q9                  \n\t"
-          "vmul.f32 q8, q8, q9                  \n\t"
-          "vadd.f32 q1, q1, q10                 \n\t"
-          "vadd.f32 q2, q2, q10                 \n\t"
-          "vadd.f32 q3, q3, q10                 \n\t"
-          "vadd.f32 q4, q4, q10                 \n\t"
-          "vadd.f32 q5, q5, q10                 \n\t"
-          "vadd.f32 q6, q6, q10                 \n\t"
-          "vadd.f32 q7, q7, q10                 \n\t"
-          "vadd.f32 q8, q8, q10                 \n\t"
-
-          "add %[out_ptr], %[out_ptr], r6       \n\t"
-          "vst1.32 {q1, q2}, [%[out_ptr]]!      \n\t"
-          "vst1.32 {q3, q4}, [%[out_ptr]]!      \n\t"
-          "vst1.32 {q5, q6}, [%[out_ptr]]!      \n\t"
-          "vst1.32 {q7, q8}, [%[out_ptr]]!      \n\t"
-
-          "end_remainder_%=:                    \n\t"
-
-          "subs %[C], %[C], #1                  \n\t"
-          "bge loop_c_%=                        \n\t"
-          "end_c_%=:                            \n\t"
-
-          "subs %[N], %[N], #1                  \n\t"
-          "bge loop_n_%=                        \n\t"
-          "end_n_%=:                            \n\t"
-          :
-          :[input_x_ptr]"r"(input_x_ptr), [out_ptr]"r"(out_ptr), [new_scale_ptr]"r"(new_scale_ptr), [new_bias_ptr]"r"(new_bias_ptr),
-          [N]"r"(N), [C]"r"(C), [HXW]"r"(HXW)
-          :"memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "r5", "r6"
-          );
-
-    delete [] inv_std_ptr;
-    delete [] new_scale_ptr;
-    delete [] new_bias_ptr;
+        "subs %[N], %[N], #1                  \n\t"
+        "blt end_n_%=                         \n\t"
+        "loop_n_%=:                           \n\t"
+
+        "subs %[C], %[C], #1                  \n\t"
+        "blt end_c_%=                         \n\t"
+        "loop_c_%=:                           \n\t"
+
+        "vld1.32 {q9}, [%[new_scale_ptr]]!    \n\t"
+        "vld1.32 {q10}, [%[new_bias_ptr]]!    \n\t"
+
+        "mov r6, %[HXW]                       \n\t"
+
+        "subs r6, r6, #32                     \n\t"
+        "blt end_hw_%=                        \n\t"
+        "loop_hw_%=:                          \n\t"
+
+        "vld1.32 {q1, q2}, [%[input_x_ptr]]!  \n\t"
+        "vld1.32 {q3, q4}, [%[input_x_ptr]]!  \n\t"
+        "vld1.32 {q5, q6}, [%[input_x_ptr]]!  \n\t"
+        "vld1.32 {q7, q8}, [%[input_x_ptr]]!  \n\t"
+
+        "vmul.f32 q1, q1, q9                  \n\t"
+        "vmul.f32 q2, q2, q9                  \n\t"
+        "vmul.f32 q3, q3, q9                  \n\t"
+        "vmul.f32 q4, q4, q9                  \n\t"
+
+        "vmul.f32 q5, q5, q9                  \n\t"
+        "vmul.f32 q6, q6, q9                  \n\t"
+        "vmul.f32 q7, q7, q9                  \n\t"
+        "vmul.f32 q8, q8, q9                  \n\t"
+
+        "vadd.f32 q1, q1, q10                 \n\t"
+        "vadd.f32 q2, q2, q10                 \n\t"
+        "vadd.f32 q3, q3, q10                 \n\t"
+        "vadd.f32 q4, q4, q10                 \n\t"
+        "vadd.f32 q5, q5, q10                 \n\t"
+        "vadd.f32 q6, q6, q10                 \n\t"
+        "vadd.f32 q7, q7, q10                 \n\t"
+        "vadd.f32 q8, q8, q10                 \n\t"
+
+        "vst1.32 {q1, q2}, [%[out_ptr]]!      \n\t"
+        "vst1.32 {q3, q4}, [%[out_ptr]]!      \n\t"
+        "vst1.32 {q5, q6}, [%[out_ptr]]!      \n\t"
+        "vst1.32 {q7, q8}, [%[out_ptr]]!      \n\t"
+
+        "subs r6, r6, #32                     \n\t"
+        "bge loop_hw_%=                       \n\t"
+        "end_hw_%=:                           \n\t"
+
+        "cmp r6, #0                           \n\t"
+        "bge end_remainder_%=                 \n\t"
+        "mov r5, #4                           \n\t"
+        "mul r6, r6, r5                       \n\t"
+        "add %[input_x_ptr], %[input_x_ptr], r6 \n\t"
+
+        "vld1.32 {q1, q2}, [%[input_x_ptr]]!  \n\t"
+        "vld1.32 {q3, q4}, [%[input_x_ptr]]!  \n\t"
+        "vld1.32 {q5, q6}, [%[input_x_ptr]]!  \n\t"
+        "vld1.32 {q7, q8}, [%[input_x_ptr]]!  \n\t"
+
+        "vmul.f32 q1, q1, q9                  \n\t"
+        "vmul.f32 q2, q2, q9                  \n\t"
+        "vmul.f32 q3, q3, q9                  \n\t"
+        "vmul.f32 q4, q4, q9                  \n\t"
+        "vmul.f32 q5, q5, q9                  \n\t"
+        "vmul.f32 q6, q6, q9                  \n\t"
+        "vmul.f32 q7, q7, q9                  \n\t"
+        "vmul.f32 q8, q8, q9                  \n\t"
+        "vadd.f32 q1, q1, q10                 \n\t"
+        "vadd.f32 q2, q2, q10                 \n\t"
+        "vadd.f32 q3, q3, q10                 \n\t"
+        "vadd.f32 q4, q4, q10                 \n\t"
+        "vadd.f32 q5, q5, q10                 \n\t"
+        "vadd.f32 q6, q6, q10                 \n\t"
+        "vadd.f32 q7, q7, q10                 \n\t"
+        "vadd.f32 q8, q8, q10                 \n\t"
+
+        "add %[out_ptr], %[out_ptr], r6       \n\t"
+        "vst1.32 {q1, q2}, [%[out_ptr]]!      \n\t"
+        "vst1.32 {q3, q4}, [%[out_ptr]]!      \n\t"
+        "vst1.32 {q5, q6}, [%[out_ptr]]!      \n\t"
+        "vst1.32 {q7, q8}, [%[out_ptr]]!      \n\t"
+
+        "end_remainder_%=:                    \n\t"
+
+        "subs %[C], %[C], #1                  \n\t"
+        "bge loop_c_%=                        \n\t"
+        "end_c_%=:                            \n\t"
+
+        "subs %[N], %[N], #1                  \n\t"
+        "bge loop_n_%=                        \n\t"
+        "end_n_%=:                            \n\t"
+        :
+        : [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
+          [new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
+          [N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
+        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+          "q10", "r5", "r6");
+
+    delete[] inv_std_ptr;
+    delete[] new_scale_ptr;
+    delete[] new_bias_ptr;
   } else {
     float *inv_std_ptr = new float[C];
     for (int i = 0; i < C; i++) {
       inv_std_ptr[i] =
-        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+          1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
     }

     Tensor new_scale;
@@ -205,7 +206,8 @@ void BatchNormKernel::Compute(const BatchNormParam &param) const {
     /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
     for (int i = 0; i < C; i++) {
       new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
-      new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+      new_bias_ptr[i] =
+          bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
       {
         for (int n = 0; n < N; n++) {
           for (int h = 0; h < H; h++) {
@@ -213,23 +215,22 @@ void BatchNormKernel::Compute(const BatchNormParam &param) const {
             for (int w = 0; w < W; w++) {
               int index = tmp_index + w;
               out_ptr[index] =
-                input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
+                  input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
             }
           }
         }
       }
     }

-    delete [] inv_std_ptr;
-//    DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
-//    DLOG << "input_x_ptr : " << input_x_ptr[102];
-//    DLOG << "variance : " << variance_ptr[5];
-//    DLOG << "inv_std_ptr : " << inv_std_ptr[5];
-//    DLOG << "new_scale_ptr : " << new_scale_ptr[5];
-//    DLOG << "new_bias_ptr : " << new_bias_ptr[5];
-//    DLOG << "out_ptr : " << out_ptr[102];
+    delete[] inv_std_ptr;
+    //    DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
+    //    DLOG << "input_x_ptr : " << input_x_ptr[102];
+    //    DLOG << "variance : " << variance_ptr[5];
+    //    DLOG << "inv_std_ptr : " << inv_std_ptr[5];
+    //    DLOG << "new_scale_ptr : " << new_scale_ptr[5];
+    //    DLOG << "new_bias_ptr : " << new_bias_ptr[5];
+    //    DLOG << "out_ptr : " << out_ptr[102];
   }

-}
 } // namespace operators
 } // namespace paddle_mobile
diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp
index e40014a8b87435134cce259f217d058f58b26a06..86bf53e5a1e5ecc285c9e9f20cb412d290d535d1 100644
--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
@@ -38,70 +38,71 @@ void ReluKernel::Compute(const ReluParam &param) const {
   auto *out_ptr = out->mutable_data();
   int numel = input_x->numel();

-//  if (numel > 64) {
-//    asm volatile(
-//        "pld [%[input_x_ptr], #0]             \n\t"
-//        "vmov.f32 q8, #0.0                    \n\t"
-//        "subs %[num], %[num], #32             \n\t"
-//        "blt end_num_%=                       \n\t"
-//        "loop_num_%=:                         \n\t"
-//        "pld [%[input_x_ptr], #1024]          \n\t"
-//
-//        "vld1.32 {q0, q1}, [%[input_x_ptr]]!  \n\t"
-//        "vld1.32 {q2, q3}, [%[input_x_ptr]]!  \n\t"
-//        "vld1.32 {q4, q5}, [%[input_x_ptr]]!  \n\t"
-//        "vld1.32 {q6, q7}, [%[input_x_ptr]]!  \n\t"
-//
-//        "vmax.f32 q0, q0, q8                  \n\t"
-//        "vmax.f32 q1, q1, q8                  \n\t"
-//        "vmax.f32 q2, q2, q8                  \n\t"
-//        "vmax.f32 q3, q3, q8                  \n\t"
-//        "vmax.f32 q4, q4, q8                  \n\t"
-//        "vmax.f32 q5, q5, q8                  \n\t"
-//        "vmax.f32 q6, q6, q8                  \n\t"
-//        "vmax.f32 q7, q7, q8                  \n\t"
-//
-//        "vst1.32 {q0, q1}, [%[out_ptr]]!      \n\t"
-//        "vst1.32 {q2, q3}, [%[out_ptr]]!      \n\t"
-//        "vst1.32 {q4, q5}, [%[out_ptr]]!      \n\t"
-//        "vst1.32 {q6, q7}, [%[out_ptr]]!      \n\t"
-//
-//        "subs %[num], %[num], #32             \n\t"
-//        "bge loop_num_%=                      \n\t"
-//        "end_num_%=:                          \n\t"
-//        "cmp %[num], #0                       \n\t"
-//        "bge end_%=                           \n\t"
-//        "mov r6, #4                           \n\t"
-//        "mul r5, %[num], r6                   \n\t"
-//        "add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
-//        "vld1.32 {q0, q1}, [%[input_x_ptr]]!  \n\t"
-//        "vld1.32 {q2, q3}, [%[input_x_ptr]]!  \n\t"
-//        "vld1.32 {q4, q5}, [%[input_x_ptr]]!  \n\t"
-//        "vld1.32 {q6, q7}, [%[input_x_ptr]]!  \n\t"
-//        "vmax.f32 q0, q0, q8                  \n\t"
-//        "vmax.f32 q1, q1, q8                  \n\t"
-//        "vmax.f32 q2, q2, q8                  \n\t"
-//        "vmax.f32 q3, q3, q8                  \n\t"
-//        "vmax.f32 q4, q4, q8                  \n\t"
-//        "vmax.f32 q5, q5, q8                  \n\t"
-//        "vmax.f32 q6, q6, q8                  \n\t"
-//        "vmax.f32 q7, q7, q8                  \n\t"
-//        "add %[out_ptr], %[out_ptr], r5       \n\t"
-//        "vst1.32 {q0, q1}, [%[out_ptr]]!      \n\t"
-//        "vst1.32 {q2, q3}, [%[out_ptr]]!      \n\t"
-//        "vst1.32 {q4, q5}, [%[out_ptr]]!      \n\t"
-//        "vst1.32 {q6, q7}, [%[out_ptr]]!      \n\t"
-//        "end_%=:                              \n\t"
-//        :
-//        :
-//        [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
-//        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
-//        "r6");
-//  } else {
-    ReluFunctor func_;
-    math::Transform trans;
-    trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
-//  }
+  //  if (numel > 64) {
+  //    asm volatile(
+  //        "pld [%[input_x_ptr], #0]             \n\t"
+  //        "vmov.f32 q8, #0.0                    \n\t"
+  //        "subs %[num], %[num], #32             \n\t"
+  //        "blt end_num_%=                       \n\t"
+  //        "loop_num_%=:                         \n\t"
+  //        "pld [%[input_x_ptr], #1024]          \n\t"
+  //
+  //        "vld1.32 {q0, q1}, [%[input_x_ptr]]!  \n\t"
+  //        "vld1.32 {q2, q3}, [%[input_x_ptr]]!  \n\t"
+  //        "vld1.32 {q4, q5}, [%[input_x_ptr]]!  \n\t"
+  //        "vld1.32 {q6, q7}, [%[input_x_ptr]]!  \n\t"
+  //
+  //        "vmax.f32 q0, q0, q8                  \n\t"
+  //        "vmax.f32 q1, q1, q8                  \n\t"
+  //        "vmax.f32 q2, q2, q8                  \n\t"
+  //        "vmax.f32 q3, q3, q8                  \n\t"
+  //        "vmax.f32 q4, q4, q8                  \n\t"
+  //        "vmax.f32 q5, q5, q8                  \n\t"
+  //        "vmax.f32 q6, q6, q8                  \n\t"
+  //        "vmax.f32 q7, q7, q8                  \n\t"
+  //
+  //        "vst1.32 {q0, q1}, [%[out_ptr]]!      \n\t"
+  //        "vst1.32 {q2, q3}, [%[out_ptr]]!      \n\t"
+  //        "vst1.32 {q4, q5}, [%[out_ptr]]!      \n\t"
+  //        "vst1.32 {q6, q7}, [%[out_ptr]]!      \n\t"
+  //
+  //        "subs %[num], %[num], #32             \n\t"
+  //        "bge loop_num_%=                      \n\t"
+  //        "end_num_%=:                          \n\t"
+  //        "cmp %[num], #0                       \n\t"
+  //        "bge end_%=                           \n\t"
+  //        "mov r6, #4                           \n\t"
+  //        "mul r5, %[num], r6                   \n\t"
+  //        "add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
+  //        "vld1.32 {q0, q1}, [%[input_x_ptr]]!  \n\t"
+  //        "vld1.32 {q2, q3}, [%[input_x_ptr]]!  \n\t"
+  //        "vld1.32 {q4, q5}, [%[input_x_ptr]]!  \n\t"
+  //        "vld1.32 {q6, q7}, [%[input_x_ptr]]!  \n\t"
+  //        "vmax.f32 q0, q0, q8                  \n\t"
+  //        "vmax.f32 q1, q1, q8                  \n\t"
+  //        "vmax.f32 q2, q2, q8                  \n\t"
+  //        "vmax.f32 q3, q3, q8                  \n\t"
+  //        "vmax.f32 q4, q4, q8                  \n\t"
+  //        "vmax.f32 q5, q5, q8                  \n\t"
+  //        "vmax.f32 q6, q6, q8                  \n\t"
+  //        "vmax.f32 q7, q7, q8                  \n\t"
+  //        "add %[out_ptr], %[out_ptr], r5       \n\t"
+  //        "vst1.32 {q0, q1}, [%[out_ptr]]!      \n\t"
+  //        "vst1.32 {q2, q3}, [%[out_ptr]]!      \n\t"
+  //        "vst1.32 {q4, q5}, [%[out_ptr]]!      \n\t"
+  //        "vst1.32 {q6, q7}, [%[out_ptr]]!      \n\t"
+  //        "end_%=:                              \n\t"
+  //        :
+  //        :
+  //        [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num]
+  //        "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
+  //        "q7", "q8", "r5",
+  //        "r6");
+  //  } else {
+  ReluFunctor func_;
+  math::Transform trans;
+  trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
+  //  }
 }
 } // namespace operators
 } // namespace paddle_mobile
diff --git a/test/common/test_lib_size.h b/test/common/test_lib_size.h
index 754c93df18040d7a2cd2b2038f7fdb72c88ff777..a00a5afe12f952a7bc47ab62ba1d07a7879cebec 100644
--- a/test/common/test_lib_size.h
+++ b/test/common/test_lib_size.h
@@ -19,9 +19,9 @@ limitations under the License. */
 #ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H
 #define PADDLE_MOBILE_TEST_LIB_SIZE_H

-#include
 #include
 #include
+#include
 //#include
 //#include
 //#include
@@ -74,7 +74,7 @@ void foo() {
   //    int z = 10;
   //  }

-//  std::shared_ptr s1 = std::make_shared();
+  //  std::shared_ptr s1 = std::make_shared();

   //  std::stringstream ss;
   //  ss << "12345";
diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp
index 58bd9a1a480d795c88c4543a5ee1480e5d768d8f..4ccad8c1512036c2400a09575b3775e75b26acce 100644
--- a/test/operators/test_batchnorm_op.cpp
+++ b/test/operators/test_batchnorm_op.cpp
@@ -137,7 +137,8 @@ int main() {
   auto *inputx1_ptr = inputx1.data();

   paddle_mobile::framework::Tensor mean;
-  SetupTensor(&mean, {256}, static_cast<float>(0), static_cast<float>(1));
+  SetupTensor(&mean, {256}, static_cast<float>(0),
+              static_cast<float>(1));
   auto *mean_ptr = mean.data();

   paddle_mobile::framework::Tensor scale;
@@ -151,7 +152,8 @@ int main() {
   auto *variance_ptr = variance.data();

   paddle_mobile::framework::Tensor bias;
-  SetupTensor(&bias, {256}, static_cast<float>(0), static_cast<float>(1));
+  SetupTensor(&bias, {256}, static_cast<float>(0),
+              static_cast<float>(1));
   auto *bias_ptr = bias.data();

   paddle_mobile::framework::TestBatchNormOp testBatchNormOp(