format files

dfa731e1 · liuruilong · 3b82bfb5 · dfa731e1 · dfa731e1 · dfa731e1
4 changed files
--- a/src/operators/kernel/arm/batchnorm_kernel.cpp
+++ b/src/operators/kernel/arm/batchnorm_kernel.cpp
@@ -45,34 +45,35 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
  auto scale_ptr = scale->data<float>();
  auto bias_ptr = bias->data<float>();

-
  //  Tensor inv_std;
  //  auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));

-  PADDLE_MOBILE_ENFORCE(C == variance->numel(),  "C must equal to variance.numel()");
+  PADDLE_MOBILE_ENFORCE(C == variance->numel(),
+                        "C must equal to variance.numel()");

  int HXW = H * W;
  if (HXW > 32) {
    int NXC = N * C;
    float *inv_std_ptr = new float[NXC * 4];
-    float * volatile new_scale_ptr = new float[NXC *  4];
-    float * volatile new_bias_ptr = new float[NXC * 4];
+    float *volatile new_scale_ptr = new float[NXC * 4];
+    float *volatile new_bias_ptr = new float[NXC * 4];

    /// std = (var + epsilon).sqrt();
    /// inv_std = 1 / std;
    for (int i = 0; i < C * 4; i += 4) {
      inv_std_ptr[i] =
-              1 / static_cast<float>(pow((variance_ptr[i/4] + epsilon), 0.5));
+          1 / static_cast<float>(pow((variance_ptr[i / 4] + epsilon), 0.5));
      inv_std_ptr[i + 1] = inv_std_ptr[i];
      inv_std_ptr[i + 2] = inv_std_ptr[i];
      inv_std_ptr[i + 3] = inv_std_ptr[i];

-      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i/4];
+      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i / 4];
      new_scale_ptr[i + 1] = new_scale_ptr[i];
      new_scale_ptr[i + 2] = new_scale_ptr[i];
      new_scale_ptr[i + 3] = new_scale_ptr[i];

-      new_bias_ptr[i] = bias_ptr[i/4] - mean_ptr[i/4] * inv_std_ptr[i] * scale_ptr[i/4];
+      new_bias_ptr[i] =
+          bias_ptr[i / 4] - mean_ptr[i / 4] * inv_std_ptr[i] * scale_ptr[i / 4];

      new_bias_ptr[i + 1] = new_bias_ptr[i];
      new_bias_ptr[i + 2] = new_bias_ptr[i];
@@ -84,116 +85,116 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
      new_bias_ptr[j] = new_bias_ptr[j - C * 4];
    }

-
    asm volatile(
-    "subs %[N], %[N], #1                  \n\t"
-    "blt        end_n_%=                  \n\t"
-    "loop_n_%=:                           \n\t"
-
-    "subs %[C], %[C], #1                   \n\t"
-    "blt        end_c_%=                  \n\t"
-    "loop_c_%=:                           \n\t"
-
-    "vld1.32 {q9}, [%[new_scale_ptr]]!    \n\t"
-    "vld1.32 {q10}, [%[new_bias_ptr]]!    \n\t"
-
-    "mov r6, %[HXW]       \n\t"
-
-    "subs r6, r6, #32                       \n\t"
-    "blt        end_hw_%=                   \n\t"
-    "loop_hw_%=:                            \n\t"
-
-    "vld1.32 {q1, q2}, [%[input_x_ptr]]!    \n\t"
-    "vld1.32 {q3, q4}, [%[input_x_ptr]]!    \n\t"
-    "vld1.32 {q5, q6}, [%[input_x_ptr]]!    \n\t"
-    "vld1.32 {q7, q8}, [%[input_x_ptr]]!    \n\t"
-
-    "vmul.f32   q1, q1,   q9  \n\t"
-    "vmul.f32   q2, q2,   q9  \n\t"
-    "vmul.f32   q3, q3,   q9  \n\t"
-    "vmul.f32   q4, q4,   q9  \n\t"
-
-    "vmul.f32   q5, q5,   q9  \n\t"
-    "vmul.f32   q6, q6,   q9  \n\t"
-    "vmul.f32   q7, q7,   q9  \n\t"
-    "vmul.f32   q8, q8,   q9  \n\t"
-
-    "vadd.f32   q1,  q1,  q10 \n\t"
-    "vadd.f32   q2, q2,   q10  \n\t"
-    "vadd.f32   q3, q3,   q10  \n\t"
-    "vadd.f32   q4,  q4,  q10 \n\t"
-    "vadd.f32   q5,  q5,  q10 \n\t"
-    "vadd.f32   q6,  q6,  q10 \n\t"
-    "vadd.f32   q7,  q7,  q10 \n\t"
-    "vadd.f32   q8,  q8,  q10 \n\t"
-
-    "vst1.32 {q1, q2}, [%[out_ptr]]!        \n\t"
-    "vst1.32 {q3, q4}, [%[out_ptr]]!       \n\t"
-    "vst1.32 {q5, q6}, [%[out_ptr]]!       \n\t"
-    "vst1.32 {q7, q8}, [%[out_ptr]]!       \n\t"
-
-    "subs r6, r6, #32                    \n\t"
-    "bge        loop_hw_%=                \n\t"
-    "end_hw_%=:                           \n\t"
-
-    "cmp  r6, #0                                \n\t"
-    "bge  end_remainder_%=                      \n\t"
-    "mov r5, #4                             \n\t"
-    "mul  r6, r6, r5                            \n\t"
-    "add %[input_x_ptr], %[input_x_ptr], r6     \n\t"
-
-    "vld1.32 {q1, q2}, [%[input_x_ptr]]!    \n\t"
-    "vld1.32 {q3, q4}, [%[input_x_ptr]]!    \n\t"
-    "vld1.32 {q5, q6}, [%[input_x_ptr]]!    \n\t"
-    "vld1.32 {q7, q8}, [%[input_x_ptr]]!    \n\t"
-
-    "vmul.f32   q1, q1,   q9  \n\t"
-    "vmul.f32   q2, q2,   q9  \n\t"
-    "vmul.f32   q3, q3,   q9  \n\t"
-    "vmul.f32   q4, q4,   q9  \n\t"
-    "vmul.f32   q5, q5,   q9  \n\t"
-    "vmul.f32   q6, q6,   q9  \n\t"
-    "vmul.f32   q7, q7,   q9  \n\t"
-    "vmul.f32   q8, q8,   q9  \n\t"
-    "vadd.f32   q1,  q1,  q10 \n\t"
-    "vadd.f32   q2, q2,   q10  \n\t"
-    "vadd.f32   q3, q3,   q10  \n\t"
-    "vadd.f32   q4,  q4,  q10 \n\t"
-    "vadd.f32   q5,  q5,  q10 \n\t"
-    "vadd.f32   q6,  q6,  q10 \n\t"
-    "vadd.f32   q7,  q7,  q10 \n\t"
-    "vadd.f32   q8,  q8,  q10 \n\t"
-
-    "add %[out_ptr], %[out_ptr], r6       \n\t"
-    "vst1.32 {q1, q2}, [%[out_ptr]]!        \n\t"
-    "vst1.32 {q3, q4}, [%[out_ptr]]!       \n\t"
-    "vst1.32 {q5, q6}, [%[out_ptr]]!       \n\t"
-    "vst1.32 {q7, q8}, [%[out_ptr]]!       \n\t"
-
-    "end_remainder_%=:                     \n\t"
-
-    "subs %[C], %[C], #1                    \n\t"
-    "bge        loop_c_%=                   \n\t"
-    "end_c_%=:                              \n\t"
-
-    "subs %[N], %[N], #1                  \n\t"
-    "bge        loop_n_%=                \n\t"
-    "end_n_%=:                           \n\t"
-    :
-    :[input_x_ptr]"r"(input_x_ptr), [out_ptr]"r"(out_ptr), [new_scale_ptr]"r"(new_scale_ptr), [new_bias_ptr]"r"(new_bias_ptr),
-    [N]"r"(N), [C]"r"(C), [HXW]"r"(HXW)
-    :"memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "r5", "r6"
-    );
-
-    delete [] inv_std_ptr;
-    delete [] new_scale_ptr;
-    delete [] new_bias_ptr;
+        "subs %[N], %[N], #1                  \n\t"
+        "blt        end_n_%=                  \n\t"
+        "loop_n_%=:                           \n\t"
+
+        "subs %[C], %[C], #1                   \n\t"
+        "blt        end_c_%=                  \n\t"
+        "loop_c_%=:                           \n\t"
+
+        "vld1.32 {q9}, [%[new_scale_ptr]]!    \n\t"
+        "vld1.32 {q10}, [%[new_bias_ptr]]!    \n\t"
+
+        "mov r6, %[HXW]       \n\t"
+
+        "subs r6, r6, #32                       \n\t"
+        "blt        end_hw_%=                   \n\t"
+        "loop_hw_%=:                            \n\t"
+
+        "vld1.32 {q1, q2}, [%[input_x_ptr]]!    \n\t"
+        "vld1.32 {q3, q4}, [%[input_x_ptr]]!    \n\t"
+        "vld1.32 {q5, q6}, [%[input_x_ptr]]!    \n\t"
+        "vld1.32 {q7, q8}, [%[input_x_ptr]]!    \n\t"
+
+        "vmul.f32   q1, q1,   q9  \n\t"
+        "vmul.f32   q2, q2,   q9  \n\t"
+        "vmul.f32   q3, q3,   q9  \n\t"
+        "vmul.f32   q4, q4,   q9  \n\t"
+
+        "vmul.f32   q5, q5,   q9  \n\t"
+        "vmul.f32   q6, q6,   q9  \n\t"
+        "vmul.f32   q7, q7,   q9  \n\t"
+        "vmul.f32   q8, q8,   q9  \n\t"
+
+        "vadd.f32   q1,  q1,  q10 \n\t"
+        "vadd.f32   q2, q2,   q10  \n\t"
+        "vadd.f32   q3, q3,   q10  \n\t"
+        "vadd.f32   q4,  q4,  q10 \n\t"
+        "vadd.f32   q5,  q5,  q10 \n\t"
+        "vadd.f32   q6,  q6,  q10 \n\t"
+        "vadd.f32   q7,  q7,  q10 \n\t"
+        "vadd.f32   q8,  q8,  q10 \n\t"
+
+        "vst1.32 {q1, q2}, [%[out_ptr]]!        \n\t"
+        "vst1.32 {q3, q4}, [%[out_ptr]]!       \n\t"
+        "vst1.32 {q5, q6}, [%[out_ptr]]!       \n\t"
+        "vst1.32 {q7, q8}, [%[out_ptr]]!       \n\t"
+
+        "subs r6, r6, #32                    \n\t"
+        "bge        loop_hw_%=                \n\t"
+        "end_hw_%=:                           \n\t"
+
+        "cmp  r6, #0                                \n\t"
+        "bge  end_remainder_%=                      \n\t"
+        "mov r5, #4                             \n\t"
+        "mul  r6, r6, r5                            \n\t"
+        "add %[input_x_ptr], %[input_x_ptr], r6     \n\t"
+
+        "vld1.32 {q1, q2}, [%[input_x_ptr]]!    \n\t"
+        "vld1.32 {q3, q4}, [%[input_x_ptr]]!    \n\t"
+        "vld1.32 {q5, q6}, [%[input_x_ptr]]!    \n\t"
+        "vld1.32 {q7, q8}, [%[input_x_ptr]]!    \n\t"
+
+        "vmul.f32   q1, q1,   q9  \n\t"
+        "vmul.f32   q2, q2,   q9  \n\t"
+        "vmul.f32   q3, q3,   q9  \n\t"
+        "vmul.f32   q4, q4,   q9  \n\t"
+        "vmul.f32   q5, q5,   q9  \n\t"
+        "vmul.f32   q6, q6,   q9  \n\t"
+        "vmul.f32   q7, q7,   q9  \n\t"
+        "vmul.f32   q8, q8,   q9  \n\t"
+        "vadd.f32   q1,  q1,  q10 \n\t"
+        "vadd.f32   q2, q2,   q10  \n\t"
+        "vadd.f32   q3, q3,   q10  \n\t"
+        "vadd.f32   q4,  q4,  q10 \n\t"
+        "vadd.f32   q5,  q5,  q10 \n\t"
+        "vadd.f32   q6,  q6,  q10 \n\t"
+        "vadd.f32   q7,  q7,  q10 \n\t"
+        "vadd.f32   q8,  q8,  q10 \n\t"
+
+        "add %[out_ptr], %[out_ptr], r6       \n\t"
+        "vst1.32 {q1, q2}, [%[out_ptr]]!        \n\t"
+        "vst1.32 {q3, q4}, [%[out_ptr]]!       \n\t"
+        "vst1.32 {q5, q6}, [%[out_ptr]]!       \n\t"
+        "vst1.32 {q7, q8}, [%[out_ptr]]!       \n\t"
+
+        "end_remainder_%=:                     \n\t"
+
+        "subs %[C], %[C], #1                    \n\t"
+        "bge        loop_c_%=                   \n\t"
+        "end_c_%=:                              \n\t"
+
+        "subs %[N], %[N], #1                  \n\t"
+        "bge        loop_n_%=                \n\t"
+        "end_n_%=:                           \n\t"
+        :
+        : [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
+          [new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
+          [N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
+        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+          "q10", "r5", "r6");
+
+    delete[] inv_std_ptr;
+    delete[] new_scale_ptr;
+    delete[] new_bias_ptr;

  } else {
    float *inv_std_ptr = new float[C];
    for (int i = 0; i < C; i++) {
      inv_std_ptr[i] =
-              1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+          1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
    }

    Tensor new_scale;
@@ -205,7 +206,8 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
    /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
    for (int i = 0; i < C; i++) {
      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
-      new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+      new_bias_ptr[i] =
+          bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
      {
        for (int n = 0; n < N; n++) {
          for (int h = 0; h < H; h++) {
@@ -213,23 +215,22 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
            for (int w = 0; w < W; w++) {
              int index = tmp_index + w;
              out_ptr[index] =
-                      input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
+                  input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
            }
          }
        }
      }
    }

-    delete [] inv_std_ptr;
-//    DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
-//    DLOG << "input_x_ptr : " << input_x_ptr[102];
-//    DLOG << "variance : " << variance_ptr[5];
-//    DLOG << "inv_std_ptr : " << inv_std_ptr[5];
-//    DLOG << "new_scale_ptr : " << new_scale_ptr[5];
-//    DLOG << "new_bias_ptr : " << new_bias_ptr[5];
-//    DLOG << "out_ptr : " << out_ptr[102];
+    delete[] inv_std_ptr;
+    //    DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
+    //    DLOG << "input_x_ptr : " << input_x_ptr[102];
+    //    DLOG << "variance : " << variance_ptr[5];
+    //    DLOG << "inv_std_ptr : " << inv_std_ptr[5];
+    //    DLOG << "new_scale_ptr : " << new_scale_ptr[5];
+    //    DLOG << "new_bias_ptr : " << new_bias_ptr[5];
+    //    DLOG << "out_ptr : " << out_ptr[102];
  }
-
 }
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
@@ -38,70 +38,71 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
  auto *out_ptr = out->mutable_data<float>();

  int numel = input_x->numel();
-//  if (numel > 64) {
-//    asm volatile(
-//        "pld        [%[input_x_ptr], #0]        \n\t"
-//        "vmov.f32   q8,    #0.0                 \n\t"
-//        "subs %[num], %[num], #32                \n\t"
-//        "blt        end_num_%=                  \n\t"
-//        "loop_num_%=:                           \n\t"
-//        "pld        [%[input_x_ptr], #1024]      \n\t"
-//
-//        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
-//        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
-//        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
-//        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
-//
-//        "vmax.f32 q0, q0, q8                   \n\t"
-//        "vmax.f32 q1, q1, q8                    \n\t"
-//        "vmax.f32 q2, q2, q8                   \n\t"
-//        "vmax.f32 q3, q3, q8                   \n\t"
-//        "vmax.f32 q4, q4, q8                   \n\t"
-//        "vmax.f32 q5, q5, q8                   \n\t"
-//        "vmax.f32 q6, q6, q8                   \n\t"
-//        "vmax.f32 q7, q7, q8                   \n\t"
-//
-//        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
-//        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
-//        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
-//        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
-//
-//        "subs %[num], %[num], #32              \n\t"
-//        "bge        loop_num_%=                \n\t"
-//        "end_num_%=:                           \n\t"
-//        "cmp %[num], #0                         \n\t"
-//        "bge   end_%=                          \n\t"
-//        "mov r6, #4                             \n\t"
-//        "mul r5, %[num], r6                     \n\t"
-//        "add %[input_x_ptr], %[input_x_ptr], r5     \n\t"
-//        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
-//        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
-//        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
-//        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
-//        "vmax.f32 q0, q0, q8                   \n\t"
-//        "vmax.f32 q1, q1, q8                    \n\t"
-//        "vmax.f32 q2, q2, q8                   \n\t"
-//        "vmax.f32 q3, q3, q8                   \n\t"
-//        "vmax.f32 q4, q4, q8                   \n\t"
-//        "vmax.f32 q5, q5, q8                   \n\t"
-//        "vmax.f32 q6, q6, q8                   \n\t"
-//        "vmax.f32 q7, q7, q8                   \n\t"
-//        "add %[out_ptr], %[out_ptr], r5       \n\t"
-//        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
-//        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
-//        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
-//        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
-//        "end_%=:                                \n\t"
-//        :
-//        :
-//        [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
-//        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
-//          "r6");
-//  } else {
-    ReluFunctor<float> func_;
-    math::Transform trans;
-    trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
-//  }
+  //  if (numel > 64) {
+  //    asm volatile(
+  //        "pld        [%[input_x_ptr], #0]        \n\t"
+  //        "vmov.f32   q8,    #0.0                 \n\t"
+  //        "subs %[num], %[num], #32                \n\t"
+  //        "blt        end_num_%=                  \n\t"
+  //        "loop_num_%=:                           \n\t"
+  //        "pld        [%[input_x_ptr], #1024]      \n\t"
+  //
+  //        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
+  //        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
+  //        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
+  //        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
+  //
+  //        "vmax.f32 q0, q0, q8                   \n\t"
+  //        "vmax.f32 q1, q1, q8                    \n\t"
+  //        "vmax.f32 q2, q2, q8                   \n\t"
+  //        "vmax.f32 q3, q3, q8                   \n\t"
+  //        "vmax.f32 q4, q4, q8                   \n\t"
+  //        "vmax.f32 q5, q5, q8                   \n\t"
+  //        "vmax.f32 q6, q6, q8                   \n\t"
+  //        "vmax.f32 q7, q7, q8                   \n\t"
+  //
+  //        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
+  //        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
+  //
+  //        "subs %[num], %[num], #32              \n\t"
+  //        "bge        loop_num_%=                \n\t"
+  //        "end_num_%=:                           \n\t"
+  //        "cmp %[num], #0                         \n\t"
+  //        "bge   end_%=                          \n\t"
+  //        "mov r6, #4                             \n\t"
+  //        "mul r5, %[num], r6                     \n\t"
+  //        "add %[input_x_ptr], %[input_x_ptr], r5     \n\t"
+  //        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
+  //        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
+  //        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
+  //        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
+  //        "vmax.f32 q0, q0, q8                   \n\t"
+  //        "vmax.f32 q1, q1, q8                    \n\t"
+  //        "vmax.f32 q2, q2, q8                   \n\t"
+  //        "vmax.f32 q3, q3, q8                   \n\t"
+  //        "vmax.f32 q4, q4, q8                   \n\t"
+  //        "vmax.f32 q5, q5, q8                   \n\t"
+  //        "vmax.f32 q6, q6, q8                   \n\t"
+  //        "vmax.f32 q7, q7, q8                   \n\t"
+  //        "add %[out_ptr], %[out_ptr], r5       \n\t"
+  //        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
+  //        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
+  //        "end_%=:                                \n\t"
+  //        :
+  //        :
+  //        [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num]
+  //        "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
+  //        "q7", "q8", "r5",
+  //          "r6");
+  //  } else {
+  ReluFunctor<float> func_;
+  math::Transform trans;
+  trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
+  //  }
 }
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/test/common/test_lib_size.h
+++ b/test/common/test_lib_size.h
@@ -19,9 +19,9 @@ limitations under the License. */
 #ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H
 #define PADDLE_MOBILE_TEST_LIB_SIZE_H

-#include <vector>
 #include <pthread.h>
 #include <thread>
+#include <vector>
 //#include <list>
 //#include <tuple>
 //#include <typeinfo>
@@ -74,7 +74,7 @@ void foo() {
  //    int z = 10;
  //  }

-//  std::shared_ptr<int> s1 = std::make_shared<int>();
+  //  std::shared_ptr<int> s1 = std::make_shared<int>();

  //  std::stringstream ss;
  //  ss << "12345";

--- a/test/operators/test_batchnorm_op.cpp
+++ b/test/operators/test_batchnorm_op.cpp
@@ -137,7 +137,8 @@ int main() {
  auto *inputx1_ptr = inputx1.data<float>();

  paddle_mobile::framework::Tensor mean;
-  SetupTensor<float>(&mean, {256}, static_cast<float>(0), static_cast<float>(1));
+  SetupTensor<float>(&mean, {256}, static_cast<float>(0),
+                     static_cast<float>(1));
  auto *mean_ptr = mean.data<float>();

  paddle_mobile::framework::Tensor scale;
@@ -151,7 +152,8 @@ int main() {
  auto *variance_ptr = variance.data<float>();

  paddle_mobile::framework::Tensor bias;
-  SetupTensor<float>(&bias, {256}, static_cast<float>(0), static_cast<float>(1));
+  SetupTensor<float>(&bias, {256}, static_cast<float>(0),
+                     static_cast<float>(1));
  auto *bias_ptr = bias.data<float>();

  paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp(