Optimize batch norm op kernel

3b82bfb5 · liuruilong · b37c8fef · 3b82bfb5 · 3b82bfb5 · 3b82bfb5
10 changed files
--- a/src/operators/kernel/arm/batchnorm_kernel.cpp
+++ b/src/operators/kernel/arm/batchnorm_kernel.cpp
@@ -23,7 +23,6 @@ namespace operators {

 template <>
 void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
-  /// todo: test.
  const Tensor *input_x = param.InputX();
  auto input_x_ptr = input_x->data<float>();
  const auto &x_dims = input_x->dims();
@@ -46,50 +45,191 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
  auto scale_ptr = scale->data<float>();
  auto bias_ptr = bias->data<float>();

-  Tensor inv_std;
-  auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
-  if (C != variance->numel()) {
-    DLOG << "C must equal to variance.numel()";
-  }
-  assert(C == variance->numel());

-  /// std = (var + epsilon).sqrt();
-  /// inv_std = 1 / std;
-  for (int i = 0; i < C; i++) {
-    inv_std_ptr[i] =
-        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
-  }
+  //  Tensor inv_std;
+  //  auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
+
+  PADDLE_MOBILE_ENFORCE(C == variance->numel(),  "C must equal to variance.numel()");
+
+  int HXW = H * W;
+  if (HXW > 32) {
+    int NXC = N * C;
+    float *inv_std_ptr = new float[NXC * 4];
+    float * volatile new_scale_ptr = new float[NXC *  4];
+    float * volatile new_bias_ptr = new float[NXC * 4];
+
+    /// std = (var + epsilon).sqrt();
+    /// inv_std = 1 / std;
+    for (int i = 0; i < C * 4; i += 4) {
+      inv_std_ptr[i] =
+              1 / static_cast<float>(pow((variance_ptr[i/4] + epsilon), 0.5));
+      inv_std_ptr[i + 1] = inv_std_ptr[i];
+      inv_std_ptr[i + 2] = inv_std_ptr[i];
+      inv_std_ptr[i + 3] = inv_std_ptr[i];
+
+      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i/4];
+      new_scale_ptr[i + 1] = new_scale_ptr[i];
+      new_scale_ptr[i + 2] = new_scale_ptr[i];
+      new_scale_ptr[i + 3] = new_scale_ptr[i];
+
+      new_bias_ptr[i] = bias_ptr[i/4] - mean_ptr[i/4] * inv_std_ptr[i] * scale_ptr[i/4];
+
+      new_bias_ptr[i + 1] = new_bias_ptr[i];
+      new_bias_ptr[i + 2] = new_bias_ptr[i];
+      new_bias_ptr[i + 3] = new_bias_ptr[i];
+    }
+
+    for (int j = C * 4; j < NXC * 4; ++j) {
+      new_scale_ptr[j] = new_scale_ptr[j - C * 4];
+      new_bias_ptr[j] = new_bias_ptr[j - C * 4];
+    }
+
+
+    asm volatile(
+    "subs %[N], %[N], #1                  \n\t"
+    "blt        end_n_%=                  \n\t"
+    "loop_n_%=:                           \n\t"
+
+    "subs %[C], %[C], #1                   \n\t"
+    "blt        end_c_%=                  \n\t"
+    "loop_c_%=:                           \n\t"
+
+    "vld1.32 {q9}, [%[new_scale_ptr]]!    \n\t"
+    "vld1.32 {q10}, [%[new_bias_ptr]]!    \n\t"
+
+    "mov r6, %[HXW]       \n\t"
+
+    "subs r6, r6, #32                       \n\t"
+    "blt        end_hw_%=                   \n\t"
+    "loop_hw_%=:                            \n\t"
+
+    "vld1.32 {q1, q2}, [%[input_x_ptr]]!    \n\t"
+    "vld1.32 {q3, q4}, [%[input_x_ptr]]!    \n\t"
+    "vld1.32 {q5, q6}, [%[input_x_ptr]]!    \n\t"
+    "vld1.32 {q7, q8}, [%[input_x_ptr]]!    \n\t"

-  Tensor new_scale;
-  auto new_scale_ptr = new_scale.mutable_data<float>(make_ddim({C}));
-  Tensor new_bias;
-  auto new_bias_ptr = new_bias.mutable_data<float>(make_ddim({C}));
-
-  /// ((x - est_mean) * (inv_var) * scale + bias equal to
-  /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
-  for (int i = 0; i < C; i++) {
-    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
-    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
-    {
-      for (int n = 0; n < N; n++) {
-        for (int h = 0; h < H; h++) {
-          int tmp_index = n * stride0 + i * stride1 + h * stride2;
-          for (int w = 0; w < W; w++) {
-            int index = tmp_index + w;
-            out_ptr[index] =
-                input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
+    "vmul.f32   q1, q1,   q9  \n\t"
+    "vmul.f32   q2, q2,   q9  \n\t"
+    "vmul.f32   q3, q3,   q9  \n\t"
+    "vmul.f32   q4, q4,   q9  \n\t"
+
+    "vmul.f32   q5, q5,   q9  \n\t"
+    "vmul.f32   q6, q6,   q9  \n\t"
+    "vmul.f32   q7, q7,   q9  \n\t"
+    "vmul.f32   q8, q8,   q9  \n\t"
+
+    "vadd.f32   q1,  q1,  q10 \n\t"
+    "vadd.f32   q2, q2,   q10  \n\t"
+    "vadd.f32   q3, q3,   q10  \n\t"
+    "vadd.f32   q4,  q4,  q10 \n\t"
+    "vadd.f32   q5,  q5,  q10 \n\t"
+    "vadd.f32   q6,  q6,  q10 \n\t"
+    "vadd.f32   q7,  q7,  q10 \n\t"
+    "vadd.f32   q8,  q8,  q10 \n\t"
+
+    "vst1.32 {q1, q2}, [%[out_ptr]]!        \n\t"
+    "vst1.32 {q3, q4}, [%[out_ptr]]!       \n\t"
+    "vst1.32 {q5, q6}, [%[out_ptr]]!       \n\t"
+    "vst1.32 {q7, q8}, [%[out_ptr]]!       \n\t"
+
+    "subs r6, r6, #32                    \n\t"
+    "bge        loop_hw_%=                \n\t"
+    "end_hw_%=:                           \n\t"
+
+    "cmp  r6, #0                                \n\t"
+    "bge  end_remainder_%=                      \n\t"
+    "mov r5, #4                             \n\t"
+    "mul  r6, r6, r5                            \n\t"
+    "add %[input_x_ptr], %[input_x_ptr], r6     \n\t"
+
+    "vld1.32 {q1, q2}, [%[input_x_ptr]]!    \n\t"
+    "vld1.32 {q3, q4}, [%[input_x_ptr]]!    \n\t"
+    "vld1.32 {q5, q6}, [%[input_x_ptr]]!    \n\t"
+    "vld1.32 {q7, q8}, [%[input_x_ptr]]!    \n\t"
+
+    "vmul.f32   q1, q1,   q9  \n\t"
+    "vmul.f32   q2, q2,   q9  \n\t"
+    "vmul.f32   q3, q3,   q9  \n\t"
+    "vmul.f32   q4, q4,   q9  \n\t"
+    "vmul.f32   q5, q5,   q9  \n\t"
+    "vmul.f32   q6, q6,   q9  \n\t"
+    "vmul.f32   q7, q7,   q9  \n\t"
+    "vmul.f32   q8, q8,   q9  \n\t"
+    "vadd.f32   q1,  q1,  q10 \n\t"
+    "vadd.f32   q2, q2,   q10  \n\t"
+    "vadd.f32   q3, q3,   q10  \n\t"
+    "vadd.f32   q4,  q4,  q10 \n\t"
+    "vadd.f32   q5,  q5,  q10 \n\t"
+    "vadd.f32   q6,  q6,  q10 \n\t"
+    "vadd.f32   q7,  q7,  q10 \n\t"
+    "vadd.f32   q8,  q8,  q10 \n\t"
+
+    "add %[out_ptr], %[out_ptr], r6       \n\t"
+    "vst1.32 {q1, q2}, [%[out_ptr]]!        \n\t"
+    "vst1.32 {q3, q4}, [%[out_ptr]]!       \n\t"
+    "vst1.32 {q5, q6}, [%[out_ptr]]!       \n\t"
+    "vst1.32 {q7, q8}, [%[out_ptr]]!       \n\t"
+
+    "end_remainder_%=:                     \n\t"
+
+    "subs %[C], %[C], #1                    \n\t"
+    "bge        loop_c_%=                   \n\t"
+    "end_c_%=:                              \n\t"
+
+    "subs %[N], %[N], #1                  \n\t"
+    "bge        loop_n_%=                \n\t"
+    "end_n_%=:                           \n\t"
+    :
+    :[input_x_ptr]"r"(input_x_ptr), [out_ptr]"r"(out_ptr), [new_scale_ptr]"r"(new_scale_ptr), [new_bias_ptr]"r"(new_bias_ptr),
+    [N]"r"(N), [C]"r"(C), [HXW]"r"(HXW)
+    :"memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "r5", "r6"
+    );
+
+    delete [] inv_std_ptr;
+    delete [] new_scale_ptr;
+    delete [] new_bias_ptr;
+
+  } else {
+    float *inv_std_ptr = new float[C];
+    for (int i = 0; i < C; i++) {
+      inv_std_ptr[i] =
+              1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+    }
+
+    Tensor new_scale;
+    auto new_scale_ptr = new_scale.mutable_data<float>(make_ddim({C}));
+    Tensor new_bias;
+    auto new_bias_ptr = new_bias.mutable_data<float>(make_ddim({C}));
+
+    /// ((x - est_mean) * (inv_var) * scale + bias equal to
+    /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+    for (int i = 0; i < C; i++) {
+      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+      new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+      {
+        for (int n = 0; n < N; n++) {
+          for (int h = 0; h < H; h++) {
+            int tmp_index = n * stride0 + i * stride1 + h * stride2;
+            for (int w = 0; w < W; w++) {
+              int index = tmp_index + w;
+              out_ptr[index] =
+                      input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
+            }
          }
        }
      }
    }
+
+    delete [] inv_std_ptr;
+//    DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
+//    DLOG << "input_x_ptr : " << input_x_ptr[102];
+//    DLOG << "variance : " << variance_ptr[5];
+//    DLOG << "inv_std_ptr : " << inv_std_ptr[5];
+//    DLOG << "new_scale_ptr : " << new_scale_ptr[5];
+//    DLOG << "new_bias_ptr : " << new_bias_ptr[5];
+//    DLOG << "out_ptr : " << out_ptr[102];
  }
-  DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
-  DLOG << "input_x_ptr : " << input_x_ptr[102];
-  DLOG << "variance : " << variance_ptr[5];
-  DLOG << "inv_std_ptr : " << inv_std_ptr[5];
-  DLOG << "new_scale_ptr : " << new_scale_ptr[5];
-  DLOG << "new_bias_ptr : " << new_bias_ptr[5];
-  DLOG << "out_ptr : " << out_ptr[102];
+
 }
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
@@ -38,70 +38,70 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
  auto *out_ptr = out->mutable_data<float>();

  int numel = input_x->numel();
-  if (numel > 32) {
-    asm volatile(
-        "pld        [%[input_x_ptr], #0]        \n\t"
-        "vmov.f32   q8,    #0.0                 \n\t"
-        "subs %[num], %[num], #32                \n\t"
-        "blt        end_num_%=                  \n\t"
-        "loop_num_%=:                           \n\t"
-        "pld        [%[input_x_ptr], #1024]      \n\t"
-
-        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
-        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
-        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
-        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
-
-        "vmax.f32 q0, q0, q8                   \n\t"
-        "vmax.f32 q1, q1, q8                    \n\t"
-        "vmax.f32 q2, q2, q8                   \n\t"
-        "vmax.f32 q3, q3, q8                   \n\t"
-        "vmax.f32 q4, q4, q8                   \n\t"
-        "vmax.f32 q5, q5, q8                   \n\t"
-        "vmax.f32 q6, q6, q8                   \n\t"
-        "vmax.f32 q7, q7, q8                   \n\t"
-
-        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
-        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
-        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
-        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
-
-        "subs %[num], %[num], #32              \n\t"
-        "bge        loop_num_%=                \n\t"
-        "end_num_%=:                           \n\t"
-        "cmp %[num], #0                         \n\t"
-        "bge   end_%=                          \n\t"
-        "mov r6, #4                             \n\t"
-        "mul r5, %[num], r6                     \n\t"
-        "add %[input_x_ptr], %[input_x_ptr], r5     \n\t"
-        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
-        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
-        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
-        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
-        "vmax.f32 q0, q0, q8                   \n\t"
-        "vmax.f32 q1, q1, q8                    \n\t"
-        "vmax.f32 q2, q2, q8                   \n\t"
-        "vmax.f32 q3, q3, q8                   \n\t"
-        "vmax.f32 q4, q4, q8                   \n\t"
-        "vmax.f32 q5, q5, q8                   \n\t"
-        "vmax.f32 q6, q6, q8                   \n\t"
-        "vmax.f32 q7, q7, q8                   \n\t"
-        "add %[out_ptr], %[out_ptr], r5       \n\t"
-        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
-        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
-        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
-        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
-        "end_%=:                                \n\t"
-        :
-        :
-        [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
-        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
-          "r6");
-  } else {
+//  if (numel > 64) {
+//    asm volatile(
+//        "pld        [%[input_x_ptr], #0]        \n\t"
+//        "vmov.f32   q8,    #0.0                 \n\t"
+//        "subs %[num], %[num], #32                \n\t"
+//        "blt        end_num_%=                  \n\t"
+//        "loop_num_%=:                           \n\t"
+//        "pld        [%[input_x_ptr], #1024]      \n\t"
+//
+//        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
+//        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
+//        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
+//        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
+//
+//        "vmax.f32 q0, q0, q8                   \n\t"
+//        "vmax.f32 q1, q1, q8                    \n\t"
+//        "vmax.f32 q2, q2, q8                   \n\t"
+//        "vmax.f32 q3, q3, q8                   \n\t"
+//        "vmax.f32 q4, q4, q8                   \n\t"
+//        "vmax.f32 q5, q5, q8                   \n\t"
+//        "vmax.f32 q6, q6, q8                   \n\t"
+//        "vmax.f32 q7, q7, q8                   \n\t"
+//
+//        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
+//        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
+//        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
+//        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
+//
+//        "subs %[num], %[num], #32              \n\t"
+//        "bge        loop_num_%=                \n\t"
+//        "end_num_%=:                           \n\t"
+//        "cmp %[num], #0                         \n\t"
+//        "bge   end_%=                          \n\t"
+//        "mov r6, #4                             \n\t"
+//        "mul r5, %[num], r6                     \n\t"
+//        "add %[input_x_ptr], %[input_x_ptr], r5     \n\t"
+//        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
+//        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
+//        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
+//        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
+//        "vmax.f32 q0, q0, q8                   \n\t"
+//        "vmax.f32 q1, q1, q8                    \n\t"
+//        "vmax.f32 q2, q2, q8                   \n\t"
+//        "vmax.f32 q3, q3, q8                   \n\t"
+//        "vmax.f32 q4, q4, q8                   \n\t"
+//        "vmax.f32 q5, q5, q8                   \n\t"
+//        "vmax.f32 q6, q6, q8                   \n\t"
+//        "vmax.f32 q7, q7, q8                   \n\t"
+//        "add %[out_ptr], %[out_ptr], r5       \n\t"
+//        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
+//        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
+//        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
+//        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
+//        "end_%=:                                \n\t"
+//        :
+//        :
+//        [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
+//        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
+//          "r6");
+//  } else {
    ReluFunctor<float> func_;
    math::Transform trans;
    trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
-  }
+//  }
 }
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -137,4 +137,6 @@ else ()
    ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-depthwise-conv-op paddle-mobile)

+    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
+
 endif()
--- a/test/common/test_lib_size.h
+++ b/test/common/test_lib_size.h
@@ -20,6 +20,8 @@ limitations under the License. */
 #define PADDLE_MOBILE_TEST_LIB_SIZE_H

 #include <vector>
+#include <pthread.h>
+#include <thread>
 //#include <list>
 //#include <tuple>
 //#include <typeinfo>
@@ -33,7 +35,7 @@ limitations under the License. */

 //#include <iostream>
 //#include <sstream>
-#include <memory>
+//#include <memory>
 //#include <stdio.h>
 //#include <cstring>

@@ -44,8 +46,10 @@ void foo() {

  //  std::cout << "12345" << std::endl;
  std::vector<int> vec = {1, 2, 3, 4, 5};
+  vec.push_back(2);

-  //  std::find(vec.begin(), vec.end(), 1);
+  pthread_mutex_init(NULL, NULL);
+  pthread_attr_destroy(NULL);
  //  std::find(vec.begin(), vec.end(), 1);

  //  std::list<int> l;
@@ -70,7 +74,7 @@ void foo() {
  //    int z = 10;
  //  }

-  std::shared_ptr<int> s1 = std::make_shared<int>();
+//  std::shared_ptr<int> s1 = std::make_shared<int>();

  //  std::stringstream ss;
  //  ss << "12345";

--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -19,11 +19,9 @@ int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
-  auto program = loader.Load(g_googlenet, true, true);
+  auto program = loader.Load(g_mobilenet_ssd, false, false);
  //  loader.Load(g_googlenet_combine + "/model", g_googlenet_combine +
-  //  "/params",
-  //              true);
-
+  //  "/params", true);
  program.originProgram->Description("program desc: ");
  return 0;
 }
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -18,7 +18,7 @@ limitations under the License. */

 int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  bool optimize = false;
+  bool optimize = true;
  auto time1 = time();
  auto program = loader.Load(g_googlenet, optimize);
  //  auto program = loader.Load(g_googlenet_combine + "/model",

--- a/test/operators/test_batchnorm_op.cpp
+++ b/test/operators/test_batchnorm_op.cpp
@@ -41,7 +41,7 @@ class TestBatchNormOp {
      for (int j = 0; j < ops.size(); ++j) {
        std::shared_ptr<OpDesc> op = ops[j];
        if (op->Type() == "batch_norm" &&
-            op->Input("X")[0] == "conv2d_0.tmp_0") {
+            op->Input("X")[0] == "conv2d_5.tmp_0") {
          DLOG << " mul attr size: " << op->GetAttrMap().size();
          DLOG << " inputs size: " << op->GetInputs().size();
          DLOG << " outputs size: " << op->GetOutputs().size();
@@ -67,29 +67,29 @@ class TestBatchNormOp {
                                     const Tensor &t5) {
    // feed
    auto scope = program_.scope;
-    Variable *x1_feed_value = scope->Var("conv2d_0.tmp_0");
+    Variable *x1_feed_value = scope->Var("conv2d_5.tmp_0");
    auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
    tensor_x1->ShareDataWith(t1);

-    Variable *mean_feed_value = scope->Var("batch_norm_0.w_1");
+    Variable *mean_feed_value = scope->Var("batch_norm_10.w_1");
    auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>();
    tensor_mean->ShareDataWith(t2);

-    Variable *scale_feed_value = scope->Var("batch_norm_0.w_0");
+    Variable *scale_feed_value = scope->Var("batch_norm_10.w_0");
    auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>();
    tensor_scale->ShareDataWith(t3);

-    Variable *variance_feed_value = scope->Var("batch_norm_0.w_2");
+    Variable *variance_feed_value = scope->Var("batch_norm_10.w_2");
    auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>();
    tensor_variance->ShareDataWith(t4);

-    Variable *bias_feed_value = scope->Var("batch_norm_0.b_0");
+    Variable *bias_feed_value = scope->Var("batch_norm_10.b_0");
    auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>();
    tensor_bias->ShareDataWith(t5);

-    Variable *output = scope->Var("batch_norm_0.tmp_2");
+    Variable *output = scope->Var("batch_norm_10.tmp_2");
    auto *output_tensor = output->GetMutable<LoDTensor>();
-    output_tensor->mutable_data<float>({4, 10, 2, 2});
+    output_tensor->mutable_data<float>({1, 256, 38, 38});
    //  DLOG << typeid(output_tensor).name();
    //  DLOG << "output_tensor dims: " << output_tensor->dims();

@@ -128,30 +128,30 @@ int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run BatchNormOp Test";
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(std::string(g_resnet));
+  auto program = loader.Load(std::string(g_mobilenet_ssd));

  /// input x (4,10,2,2)
  paddle_mobile::framework::Tensor inputx1;
-  SetupTensor<float>(&inputx1, {4, 10, 2, 2}, static_cast<float>(0),
+  SetupTensor<float>(&inputx1, {1, 256, 38, 38}, static_cast<float>(0),
                     static_cast<float>(1));
  auto *inputx1_ptr = inputx1.data<float>();

  paddle_mobile::framework::Tensor mean;
-  SetupTensor<float>(&mean, {10}, static_cast<float>(0), static_cast<float>(1));
+  SetupTensor<float>(&mean, {256}, static_cast<float>(0), static_cast<float>(1));
  auto *mean_ptr = mean.data<float>();

  paddle_mobile::framework::Tensor scale;
-  SetupTensor<float>(&scale, {10}, static_cast<float>(0),
+  SetupTensor<float>(&scale, {256}, static_cast<float>(0),
                     static_cast<float>(1));
  auto *scale_ptr = scale.data<float>();

  paddle_mobile::framework::Tensor variance;
-  SetupTensor<float>(&variance, {10}, static_cast<float>(0),
+  SetupTensor<float>(&variance, {256}, static_cast<float>(0),
                     static_cast<float>(1));
  auto *variance_ptr = variance.data<float>();

  paddle_mobile::framework::Tensor bias;
-  SetupTensor<float>(&bias, {10}, static_cast<float>(0), static_cast<float>(1));
+  SetupTensor<float>(&bias, {256}, static_cast<float>(0), static_cast<float>(1));
  auto *bias_ptr = bias.data<float>();

  paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp(
@@ -161,11 +161,13 @@ int main() {
      testBatchNormOp.predict_bn(inputx1, mean, scale, variance, bias);
  auto *output_bn_ptr = output_bn->data<float>();

-  /// [2, 5, 1, 0]
-  DLOG << " (" << inputx1_ptr[102] << " - " << mean_ptr[5] << ")/(("
-       << variance_ptr[5] << " + 0.00001"
-       << ")^0.5)* " << scale_ptr[5] << " + " << bias_ptr[5] << " = ";
-  DLOG << output_bn_ptr[102];
+  DLOG << " (" << inputx1_ptr[0] << " - " << mean_ptr[0] << ")/(("
+       << variance_ptr[0] << " + 0.00001"
+       << ")^0.5)* " << scale_ptr[0] << " + " << bias_ptr[0] << " = ";
+  DLOG << output_bn_ptr[0];
+
+  DLOG << "input_ptr 0 : " << inputx1_ptr[0];
+  DLOG << "output_ptr 0 : " << output_bn_ptr[0];

  return 0;
 }
--- a/tools/push2android.sh
+++ b/tools/push2android.sh
+#!/usr/bin/env sh
+#
+# Push the test executables, the built libraries and (optionally) the test
+# images and models to an Android device over adb.
+#
+# Usage: push2android.sh [npm]
+#   npm  - skip pushing images and models; only executables and libraries
+#          are pushed.
+#
+# Only POSIX shell constructs are used so the script behaves the same
+# whichever shell `sh` resolves to (the bash-only `[[ ... ]]` test is not
+# available in every `sh`).
+
+# Creates the target directories on the device and pushes the artifacts.
+#   $1 - pass "npm" to skip the image and model push.
+push_fn () {
+MODELS_PATH="../test/models/*"
+MODELS_SRC="../test/models"
+IMAGE_PATH="../test/images/*"
+EXE_FILE="../test/build/*"
+LIB_PATH="../build/release/arm-v7a/build/*"
+
+# Absolute device paths, matching the /data/local/tmp/bin location the
+# sibling run scripts cd into.
+EXE_DIR="/data/local/tmp/bin"
+MODELS_DIR="/data/local/tmp/models"
+IMAGES_DIR="/data/local/tmp/images"
+
+# -p keeps repeated runs quiet when the directories already exist.
+adb shell mkdir -p "${EXE_DIR}"
+adb shell mkdir -p "${MODELS_DIR}"
+adb shell mkdir -p "${IMAGES_DIR}"
+
+# Mirror every entry of the local models directory as a directory on the
+# device. A glob is used instead of parsing `ls` output so names containing
+# whitespace are handled correctly.
+for model in "${MODELS_SRC}"/*
+do
+    # An unmatched glob stays literal; skip it when the directory is empty.
+    [ -e "${model}" ] || continue
+    adb shell mkdir -p "${MODELS_DIR}/$(basename "${model}")"
+done
+
+# The *_FILE / *_PATH variables hold glob patterns and are deliberately left
+# unquoted so the shell expands them into the individual files to push.
+adb push ${EXE_FILE} "${EXE_DIR}"
+adb push ${LIB_PATH} "${EXE_DIR}"
+if [ "$1" != "npm" ]; then
+adb push ${IMAGE_PATH} "${IMAGES_DIR}"
+adb push ${MODELS_PATH} "${MODELS_DIR}"
+fi
+}
+
+# Forward the optional mode argument; an absent argument is passed as an
+# empty string, which push_fn treats as "push everything".
+push_fn "$1"
--- a/tools/run.sh
+++ b/tools/run.sh
@@ -24,8 +24,15 @@ adb shell mkdir ${IMAGES_DIR}
 LIB_PATH="../build/release/arm-v7a/build/*"
 adb push ${EXE_FILE} ${EXE_DIR}
 adb push ${LIB_PATH} ${EXE_DIR}
+if [[ $1 != "npm" ]]; then
 adb push ${IMAGE_PATH} ${IMAGES_DIR}
 adb push ${MODELS_PATH} ${MODELS_DIR}
+fi
 adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${TESTUNIT}"
 }
+
+if [[ $1 == "npm" ]]; then
+push_fn $1
+else
 push_fn
+fi
\ No newline at end of file
--- a/tools/scripts/run_on_android.sh
+++ b/tools/scripts/run_on_android.sh
@@ -19,12 +19,19 @@ adb shell mkdir ${IMAGES_DIR}
 LIB_PATH="../../build/release/arm-v7a/build/*"
 adb push ${EXE_FILE} ${EXE_DIR}
 adb push ${LIB_PATH} ${EXE_DIR}
+if [[ $1 != "npm" ]]; then
 adb push ${IMAGE_PATH} ${IMAGES_DIR}
 adb push ${MODELS_PATH} ${MODELS_DIR}
+fi
 echo "test-op or test-net below : "
 adb shell ls /data/local/tmp/bin
 echo "**** choose OP or NET to test ****"
 read -p "which to test : " test_name
 adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${test_name}"
 }
+
+if [[ $1 == "npm" ]]; then
+push_fn $1
+else
 push_fn
+fi
\ No newline at end of file