Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into HEAD

ace19269 · superjomn · 0d6c897b · 8749f4fc · ace19269 · ace19269
52 changed file
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+before_script:
+  - env
+
+image: $SERVER_LITE_DOCKER_IMAGE
+
+stages:
+  - ci
+  - build_server
+  - build_mobile
+
+check:prebuilt:
+    tags:
+        - lite
+    stage: ci
+    script:
+        #- pip3 install pre-commit
+        #- alias python=python3
+        - rm -rf ~/.pip
+        - pip install pre-commit
+        - pre-commit install
+        - ./paddle/fluid/lite/tools/build.sh check_style
+        #- ./paddle/fluid/lite/tools/build.sh check_need_ci
+    cache:
+        key: check_style
+        paths:
+            - /root/.cache
+
+build:server:
+    tags:
+        - lite
+    image: $SERVER_LITE_DOCKER_IMAGE
+    stage: build_server
+    cache:
+        key: server_thirdparty
+        paths:
+            - build/third_party
+            - /root/.ccache
+    script:
+        - apt install ccache
+        - export http_proxy=http://172.19.57.45:3128
+        - export https_proxy=http://172.19.57.45:3128
+        #- export http_proxy=http://agent.baidu.com:8118
+        #- export https_proxy=http://agent.baidu.com:8118
+        - mkdir -p build
+        - cd build
+        - ../paddle/fluid/lite/tools/build.sh cmake_x86
+        - make extern_eigen3
+        - make extern_boost
+        - make framework_proto
+        - make extern_warpctc
+        - cd ..
+        - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/build/third_party/install/mklml/lib
+        - ./paddle/fluid/lite/tools/build.sh build_test_server
+    dependencies:
+        - check:prebuilt
+
+build:mobile:
+    tags:
+        - lite
+    stage: build_mobile
+    image: $MOBILE_LITE_DOCKER_IMAGE
+    cache:
+        key: mobile_thirdparty
+        paths:
+            - $MOBILE_LITE_CACHE0
+            - $MOBILE_LITE_CACHE1
+            - /root/.ccache
+    script:
+        - apt install ccache
+        - export http_proxy=http://172.19.57.45:3128
+        - export https_proxy=http://172.19.57.45:3128
+        - ./paddle/fluid/lite/tools/build.sh build_test_arm
+    dependencies:
+        - build:server
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,6 +166,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    #include(external/zlib)     # download, build, install gtest
    include(external/protobuf)  # download, build, install protobuf
    include(external/eigen)     # download eigen3
+    include(ccache)             # set ccache for compilation

    include(generic)            # simplify cmake module
    include(configure)          # add paddle env configuration

--- a/paddle/fluid/lite/CMakeLists.txt
+++ b/paddle/fluid/lite/CMakeLists.txt
@@ -172,3 +172,4 @@ add_subdirectory(model_parser)
 add_subdirectory(utils)
 add_subdirectory(api)
 add_subdirectory(gen_code)
+ 
--- a/paddle/fluid/lite/api/CMakeLists.txt
+++ b/paddle/fluid/lite/api/CMakeLists.txt
@@ -14,7 +14,7 @@ if(LITE_WITH_CUDA)
    set(light_api_deps ${light_api_deps} target_wrapper_cuda)
 endif()

-cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels})
+#cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels})

 message(STATUS "get ops ${ops_lite}")
 message(STATUS "get Host kernels ${host_kernels}")

--- a/paddle/fluid/lite/api/cxx_api_bin.cc
+++ b/paddle/fluid/lite/api/cxx_api_bin.cc
@@ -66,7 +66,7 @@ USE_LITE_OP(fetch);
 USE_LITE_OP(io_copy);

 USE_LITE_OP(con2d);
-USE_LITE_OP(batch_norm);
+// USE_LITE_OP(batch_norm);
 USE_LITE_OP(relu);
 USE_LITE_OP(depthwise_conv2d);
 USE_LITE_OP(pool2d);

--- a/paddle/fluid/lite/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/CMakeLists.txt

 add_subdirectory(math)
+ 
--- a/paddle/fluid/lite/arm/math/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -32,5 +32,7 @@ cc_library(math_arm SRCS
    conv_winograd_3x3.cc
    conv_winograd.cc
    split.cc
-    DEPS ${lite_kernel_deps} eigen3)
+    DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
+  # TODO(TJ): fix me do not deps proto
+ 
 
--- a/paddle/fluid/lite/arm/math/scale.cc
+++ b/paddle/fluid/lite/arm/math/scale.cc
@@ -58,6 +58,111 @@ void scale<float>(const float* din, float* dout, int num, float scale,
  }
 }

+template <>
+void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
+                  int inner_dim, const float* scale_data,
+                  const float* bias_data) {
+  int cnt = inner_dim >> 4;
+  int remain = inner_dim % 16;
+  int size = inner_dim * scale_dim;
+  for (int n = 0; n < outer_dim; n++) {
+    const float* din_ptr_n = din + n * size;
+    float* dout_ptr_n = dout + n * size;
+#pragma omp parallel for
+    for (int i = 0; i < scale_dim; i++) {
+      const float* din_ptr = din_ptr_n + i * inner_dim;
+      float* dout_ptr = dout_ptr_n + i * inner_dim;
+      float scale = scale_data[i];
+      float32x4_t vscale = vdupq_n_f32(scale);
+      float bias = bias_data[i];
+      float32x4_t vbias = vdupq_n_f32(bias);
+      for (int j = 0; j < cnt; j++) {
+        float32x4_t din0 = vld1q_f32(din_ptr);
+        float32x4_t din1 = vld1q_f32(din_ptr + 4);
+        float32x4_t din2 = vld1q_f32(din_ptr + 8);
+        float32x4_t din3 = vld1q_f32(din_ptr + 12);
+
+        float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
+        float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
+        float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
+        float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
+
+        din_ptr += 16;
+        vst1q_f32(dout_ptr, vsum1);
+        vst1q_f32(dout_ptr + 4, vsum2);
+        vst1q_f32(dout_ptr + 8, vsum3);
+        vst1q_f32(dout_ptr + 12, vsum4);
+
+        dout_ptr += 16;
+      }
+      for (int j = 0; j < remain; j++) {
+        *dout_ptr = *din_ptr * scale + bias;
+        dout_ptr++;
+        din_ptr++;
+      }
+    }
+  }
+}
+
+template <>
+void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
+                  const float* scale_data, const float* bias_data) {
+  int cnt = scale_dim >> 4;
+  int remain = scale_dim % 16;
+  for (int n = 0; n < outer_dim; n++) {
+    const float* din_ptr_n = din + n * scale_dim;
+    float* dout_ptr_n = dout + n * scale_dim;
+#pragma omp parallel for
+    for (int i = 0; i < cnt; i++) {
+      int idx = i << 4;
+      const float* din_ptr = din_ptr_n + idx;
+      const float* scale_ptr = scale_data + idx;
+      const float* bias_ptr = bias_data + idx;
+      float* dout_ptr = dout_ptr_n + idx;
+
+      float32x4_t din0 = vld1q_f32(din_ptr);
+      float32x4_t vscale0 = vld1q_f32(scale_ptr);
+      float32x4_t vbias0 = vld1q_f32(bias_ptr);
+
+      float32x4_t din1 = vld1q_f32(din_ptr + 4);
+      float32x4_t vscale1 = vld1q_f32(scale_ptr + 4);
+      float32x4_t vbias1 = vld1q_f32(bias_ptr + 4);
+
+      float32x4_t din2 = vld1q_f32(din_ptr + 8);
+      float32x4_t vscale2 = vld1q_f32(scale_ptr + 8);
+      float32x4_t vbias2 = vld1q_f32(bias_ptr + 8);
+
+      float32x4_t vsum1 = vmlaq_f32(vbias0, din0, vscale0);
+      float32x4_t vsum2 = vmlaq_f32(vbias1, din1, vscale1);
+
+      float32x4_t din3 = vld1q_f32(din_ptr + 12);
+      float32x4_t vscale3 = vld1q_f32(scale_ptr + 12);
+      float32x4_t vbias3 = vld1q_f32(bias_ptr + 12);
+
+      vst1q_f32(dout_ptr, vsum1);
+      vst1q_f32(dout_ptr + 4, vsum2);
+
+      float32x4_t vsum3 = vmlaq_f32(vbias2, din2, vscale2);
+      float32x4_t vsum4 = vmlaq_f32(vbias3, din3, vscale3);
+
+      vst1q_f32(dout_ptr + 8, vsum3);
+      vst1q_f32(dout_ptr + 12, vsum4);
+    }
+    int idx = cnt << 4;
+    const float* din_ptr = din_ptr_n + idx;
+    float* dout_ptr = dout_ptr_n + idx;
+    const float* scale_ptr = scale_data + idx;
+    const float* bias_ptr = bias_data + idx;
+    for (int j = 0; j < remain; j++) {
+      *dout_ptr = *din_ptr * (*scale_ptr) + (*bias_ptr);
+      dout_ptr++;
+      din_ptr++;
+      scale_ptr++;
+      bias_ptr++;
+    }
+  }
+}
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite

--- a/paddle/fluid/lite/arm/math/scale.h
+++ b/paddle/fluid/lite/arm/math/scale.h
@@ -22,6 +22,14 @@ namespace math {
 template <typename T>
 void scale(const T* din, T* dout, int num, float scale, float bias);

+template <typename T>
+void scale(const T* din, T* dout, int outer_dim, int scale_dim, int inner_dim,
+           const float* scale_data, const float* bias_data);
+
+template <typename T>
+void scale(const T* din, T* dout, int outer_dim, int scale_dim,
+           const float* scale_data, const float* bias_data);
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite

--- a/paddle/fluid/lite/arm/math/type_trans.cpp
+++ b/paddle/fluid/lite/arm/math/type_trans.cpp
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/arm/math/saturate.h"
+#include "paddle/fluid/lite/arm/math/type_trans.h"
 #include <arm_neon.h>
 #include <string.h>
+#include "paddle/fluid/lite/arm/math/saturate.h"

 namespace paddle {
 namespace lite {
@@ -23,563 +24,553 @@ namespace math {

 template <typename dtype>
 void int32_to_dtype(const int* din, dtype* dout, const float* scale,
-    int axis_size, long long outer_size, long long inner_size);
+                    int axis_size, int64_t outer_size, int64_t inner_size);

 void fp32_to_int8(const float* din, signed char* dout, const float* scale,
-    int axis_size, long long outer_size, long long inner_size) {
-
-    int cnt = inner_size / 16;
-    int remain = inner_size & 15;
-    long long loop_size = outer_size * axis_size;
+                  int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 16;
+  int remain = inner_size & 15;
+  int64_t loop_size = outer_size * axis_size;

 #pragma omp parallel for
-    for (int j = 0; j < loop_size; ++j) {
-        float inv_scale = 1.f / scale[j % axis_size];
-        float32x4_t vzero = vdupq_n_f32(0.f);
-        float32x4_t vscale = vdupq_n_f32(inv_scale);
-        float32x4_t vpoff = vdupq_n_f32(0.5f);
-        float32x4_t vnoff = vdupq_n_f32(-0.5f);
-        const float* din_c = din + j * inner_size;
-        signed char* dout_c = dout + j * inner_size;
-        if (cnt > 0) {
-            int cnt_loop = cnt;
-            const float* din_ptr = din_c;
-            signed char* dout_ptr = dout_c;
+  for (int j = 0; j < loop_size; ++j) {
+    float inv_scale = 1.f / scale[j % axis_size];
+    float32x4_t vzero = vdupq_n_f32(0.f);
+    float32x4_t vscale = vdupq_n_f32(inv_scale);
+    float32x4_t vpoff = vdupq_n_f32(0.5f);
+    float32x4_t vnoff = vdupq_n_f32(-0.5f);
+    const float* din_c = din + j * inner_size;
+    signed char* dout_c = dout + j * inner_size;
+    if (cnt > 0) {
+      int cnt_loop = cnt;
+      const float* din_ptr = din_c;
+      signed char* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-            "ldp q0, q1, [%[in]], #32                           \n"
-                    "ldp q2, q3, [%[in]], #32                   \n"
-                    "0:                                         \n" /* main loop */
-                    "fmul v4.4s, v0.4s, %[scale].4s             \n"
-                    "fmul v5.4s, v1.4s, %[scale].4s             \n"
-                    "fmul v6.4s, v2.4s, %[scale].4s             \n"
-                    "fmul v7.4s, v3.4s, %[scale].4s             \n"
-                    "ldp q0, q1, [%[in]], #32                   \n"
-                    "subs %[cnt], %[cnt], #1                    \n"
-                    "FCVTAS v8.4s, v4.4s                        \n"
-                    "FCVTAS v9.4s, v5.4s                        \n"
-                    "FCVTAS v10.4s, v6.4s                       \n"
-                    "FCVTAS v11.4s, v7.4s                       \n"
-                    "ldp q2, q3, [%[in]], #32                   \n"
-                    "sqxtn    v4.4h, v8.4s                      \n"
-                    "sqxtn2   v4.8h, v9.4s                      \n"
-                    "sqxtn    v5.4h, v10.4s                     \n"
-                    "sqxtn2   v5.8h, v11.4s                     \n"
-                    "sqxtn    v8.8b, v4.8h                      \n"
-                    "sqxtn2   v8.16b, v5.8h                     \n"
-                    "str q8, [%[out]], #16                      \n"
-                    "bne    0b                                  \n"
-            : [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop)
-            : [scale] "w" (vscale)
-            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
-            );
+      asm volatile(
+          "ldp q0, q1, [%[in]], #32                           \n"
+          "ldp q2, q3, [%[in]], #32                   \n"
+          "0:                                         \n" /* main loop */
+          "fmul v4.4s, v0.4s, %[scale].4s             \n"
+          "fmul v5.4s, v1.4s, %[scale].4s             \n"
+          "fmul v6.4s, v2.4s, %[scale].4s             \n"
+          "fmul v7.4s, v3.4s, %[scale].4s             \n"
+          "ldp q0, q1, [%[in]], #32                   \n"
+          "subs %[cnt], %[cnt], #1                    \n"
+          "FCVTAS v8.4s, v4.4s                        \n"
+          "FCVTAS v9.4s, v5.4s                        \n"
+          "FCVTAS v10.4s, v6.4s                       \n"
+          "FCVTAS v11.4s, v7.4s                       \n"
+          "ldp q2, q3, [%[in]], #32                   \n"
+          "sqxtn    v4.4h, v8.4s                      \n"
+          "sqxtn2   v4.8h, v9.4s                      \n"
+          "sqxtn    v5.4h, v10.4s                     \n"
+          "sqxtn2   v5.8h, v11.4s                     \n"
+          "sqxtn    v8.8b, v4.8h                      \n"
+          "sqxtn2   v8.16b, v5.8h                     \n"
+          "str q8, [%[out]], #16                      \n"
+          "bne    0b                                  \n"
+          : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+            "v11");
 #else
-            asm volatile(
-            "vld1.32 {d0-d3},    [%[din]]!                  @ load in0~in7\n"
-                    "vld1.32    {d4-d7},    [%[din]]!       @ load in8~in16\n"
-                    "0:                                     @ main loop\n"
-                    "vand.i32   q4, %q[vpoff], %q[vpoff]    @ set offset, 0.5\n"
-                    "vand.i32   q5, q4, q4                  @ set offset, 0.5\n"
-                    "vand.i32   q6, q4, q4                  @ set offset, 0.5\n"
-                    "vand.i32   q7, q4, q4                  @ set offset, 0.5\n"
-                    "vcgt.f32   q8, q0, %q[vzero]           @ get mask > 0, in0\n"
-                    "vcgt.f32   q9, q1, %q[vzero]           @ get mask > 0, in1\n"
-                    "vcgt.f32   q10, q2, %q[vzero]          @ get mask > 0, in2\n"
-                    "vcgt.f32   q11, q3, %q[vzero]          @ get mask > 0, in3\n"
-                    "vbif.f32   q4, %q[vnoff], q8           @ get right offset\n"
-                    "vbif.f32   q5, %q[vnoff], q9           @ get right offset\n"
-                    "vbif.f32   q6, %q[vnoff], q10          @ get right offset\n"
-                    "vbif.f32   q7, %q[vnoff], q11          @ get right offset\n"
-                    "vmla.f32   q4, q0, %q[vscale]          @ mul scale\n"
-                    "vmla.f32   q5, q1, %q[vscale]          @ mul scale\n"
-                    "vmla.f32   q6, q2, %q[vscale]          @ mul scale\n"
-                    "vmla.f32   q7, q3, %q[vscale]          @ mul scale\n"
-                    "vcvt.s32.f32  q0, q4                   @ cvt to int32\n"
-                    "vcvt.s32.f32  q1, q5                   @ cvt to int32\n"
-                    "vcvt.s32.f32  q2, q6                   @ cvt to int32\n"
-                    "vcvt.s32.f32  q3, q7                   @ cvt to int32\n"
-                    "vqmovn.s32 d8, q0                      @ cnt to int16\n"
-                    "vqmovn.s32 d9, q1                      @ cnt to int16\n"
-                    "vqmovn.s32 d10, q2                     @ cnt to int16\n"
-                    "vqmovn.s32 d11, q3                     @ cnt to int16\n"
-                    "vld1.32 {d0-d3},    [%[din]]!          @ load in0~in7\n"
-                    "vqmovn.s16 d12, q4                     @ cnt to int8\n"
-                    "vqmovn.s16 d13, q5                     @ cnt to int8\n"
-                    "vld1.32 {d4-d7},    [%[din]]!          @ load in8~in16\n"
-                    "vst1.32    {d12-d13},  [%[dout]]!      @ write to output\n"
-                    "subs   %[cnt], #1                      @ loop count -1\n"
-                    "bne    0b                              @ to main loop\n"
-
-            :[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop)
-            :[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero)
-            :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"
-            );
+      asm volatile(
+          "vld1.32 {d0-d3},    [%[din]]!                  @ load in0~in7\n"
+          "vld1.32    {d4-d7},    [%[din]]!       @ load in8~in16\n"
+          "0:                                     @ main loop\n"
+          "vand.i32   q4, %q[vpoff], %q[vpoff]    @ set offset, 0.5\n"
+          "vand.i32   q5, q4, q4                  @ set offset, 0.5\n"
+          "vand.i32   q6, q4, q4                  @ set offset, 0.5\n"
+          "vand.i32   q7, q4, q4                  @ set offset, 0.5\n"
+          "vcgt.f32   q8, q0, %q[vzero]           @ get mask > 0, in0\n"
+          "vcgt.f32   q9, q1, %q[vzero]           @ get mask > 0, in1\n"
+          "vcgt.f32   q10, q2, %q[vzero]          @ get mask > 0, in2\n"
+          "vcgt.f32   q11, q3, %q[vzero]          @ get mask > 0, in3\n"
+          "vbif.f32   q4, %q[vnoff], q8           @ get right offset\n"
+          "vbif.f32   q5, %q[vnoff], q9           @ get right offset\n"
+          "vbif.f32   q6, %q[vnoff], q10          @ get right offset\n"
+          "vbif.f32   q7, %q[vnoff], q11          @ get right offset\n"
+          "vmla.f32   q4, q0, %q[vscale]          @ mul scale\n"
+          "vmla.f32   q5, q1, %q[vscale]          @ mul scale\n"
+          "vmla.f32   q6, q2, %q[vscale]          @ mul scale\n"
+          "vmla.f32   q7, q3, %q[vscale]          @ mul scale\n"
+          "vcvt.s32.f32  q0, q4                   @ cvt to int32\n"
+          "vcvt.s32.f32  q1, q5                   @ cvt to int32\n"
+          "vcvt.s32.f32  q2, q6                   @ cvt to int32\n"
+          "vcvt.s32.f32  q3, q7                   @ cvt to int32\n"
+          "vqmovn.s32 d8, q0                      @ cnt to int16\n"
+          "vqmovn.s32 d9, q1                      @ cnt to int16\n"
+          "vqmovn.s32 d10, q2                     @ cnt to int16\n"
+          "vqmovn.s32 d11, q3                     @ cnt to int16\n"
+          "vld1.32 {d0-d3},    [%[din]]!          @ load in0~in7\n"
+          "vqmovn.s16 d12, q4                     @ cnt to int8\n"
+          "vqmovn.s16 d13, q5                     @ cnt to int8\n"
+          "vld1.32 {d4-d7},    [%[din]]!          @ load in8~in16\n"
+          "vst1.32    {d12-d13},  [%[dout]]!      @ write to output\n"
+          "subs   %[cnt], #1                      @ loop count -1\n"
+          "bne    0b                              @ to main loop\n"
+
+          : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
+          : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
+            [vzero] "w"(vzero)
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+            "q11");
 #endif
-        }
-        const float* din_r = din_c + 16 * cnt;
-        signed char* dout_r = dout_c + 16 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = saturate_cast<int8_t>(roundf(inv_scale * din_r[i]));
-        }
    }
+    const float* din_r = din_c + 16 * cnt;
+    signed char* dout_r = dout_c + 16 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = saturate_cast<int8_t>(roundf(inv_scale * din_r[i]));
+    }
+  }
 }

 void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
-    int axis_size, long long outer_size, long long inner_size) {
-
-    int cnt = inner_size / 8;
-    int remain = inner_size & 7;
-    long long loop_size = outer_size * axis_size;
+                   int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 8;
+  int remain = inner_size & 7;
+  int64_t loop_size = outer_size * axis_size;

 #pragma omp parallel for
-    for (int j = 0; j < loop_size; ++j) {
-        float inv_scale = 1.f / scale[j % axis_size];
-        float32x4_t vzero = vdupq_n_f32(0.f);
-        float32x4_t vscale = vdupq_n_f32(inv_scale);
-        float32x4_t vpoff = vdupq_n_f32(0.5f);
-        float32x4_t vnoff = vdupq_n_f32(-0.5f);
-        const float* din_c = din + j * inner_size;
-        int16_t* dout_c = dout + j * inner_size;
-        if (cnt > 0) {
-            int cnt_loop = cnt;
-            const float* din_ptr = din_c;
-            int16_t* dout_ptr = dout_c;
+  for (int j = 0; j < loop_size; ++j) {
+    float inv_scale = 1.f / scale[j % axis_size];
+    float32x4_t vzero = vdupq_n_f32(0.f);
+    float32x4_t vscale = vdupq_n_f32(inv_scale);
+    float32x4_t vpoff = vdupq_n_f32(0.5f);
+    float32x4_t vnoff = vdupq_n_f32(-0.5f);
+    const float* din_c = din + j * inner_size;
+    int16_t* dout_c = dout + j * inner_size;
+    if (cnt > 0) {
+      int cnt_loop = cnt;
+      const float* din_ptr = din_c;
+      int16_t* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-                "ldp q0, q1, [%[in]], #32                   \n"
-                "0:                                         \n" /* main loop */
-                "fmul v4.4s, v0.4s, %[scale].4s             \n"
-                "fmul v5.4s, v1.4s, %[scale].4s             \n"
-                "ldp q0, q1, [%[in]], #32                   \n"
-                "subs %[cnt], %[cnt], #1                    \n"
-                "FCVTAS v8.4s, v4.4s                        \n"
-                "FCVTAS v9.4s, v5.4s                        \n"
-                "sqxtn    v4.4h, v8.4s                      \n"
-                "sqxtn2   v4.8h, v9.4s                      \n"
-                "str q4, [%[out]], #16                      \n"
-                "bne    0b                                  \n"
-            : [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop)
-            : [scale] "w" (vscale)
-            : "v0", "v1", "v4", "v5", "v8", "v9"
-            );
+      asm volatile(
+          "ldp q0, q1, [%[in]], #32                   \n"
+          "0:                                         \n" /* main loop */
+          "fmul v4.4s, v0.4s, %[scale].4s             \n"
+          "fmul v5.4s, v1.4s, %[scale].4s             \n"
+          "ldp q0, q1, [%[in]], #32                   \n"
+          "subs %[cnt], %[cnt], #1                    \n"
+          "FCVTAS v8.4s, v4.4s                        \n"
+          "FCVTAS v9.4s, v5.4s                        \n"
+          "sqxtn    v4.4h, v8.4s                      \n"
+          "sqxtn2   v4.8h, v9.4s                      \n"
+          "str q4, [%[out]], #16                      \n"
+          "bne    0b                                  \n"
+          : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v4", "v5", "v8", "v9");
 #else
-            asm volatile(
-                "vld1.32 {d0-d3}, [%[din]]!             @ load in0~in7\n"
-                "0:                                     @ main loop\n"
-                "vand.i32   q4, %q[vpoff], %q[vpoff]    @ set offset, 0.5\n"
-                "vand.i32   q5, q4, q4                  @ set offset, 0.5\n"
-                "vand.i32   q6, q4, q4                  @ set offset, 0.5\n"
-                "vand.i32   q7, q4, q4                  @ set offset, 0.5\n"
-                "vcgt.f32   q8, q0, %q[vzero]           @ get mask > 0, in0\n"
-                "vcgt.f32   q9, q1, %q[vzero]           @ get mask > 0, in1\n"
-                "vbif.f32   q4, %q[vnoff], q8           @ get right offset\n"
-                "vbif.f32   q5, %q[vnoff], q9           @ get right offset\n"
-                "vmla.f32   q4, q0, %q[vscale]          @ mul scale\n"
-                "vmla.f32   q5, q1, %q[vscale]          @ mul scale\n"
-                "vcvt.s32.f32  q0, q4                   @ cvt to int32\n"
-                "vcvt.s32.f32  q1, q5                   @ cvt to int32\n"
-                "vqmovn.s32 d8, q0                      @ cnt to int16\n"
-                "vqmovn.s32 d9, q1                      @ cnt to int16\n"
-                "vld1.32 {d0-d3},  [%[din]]!            @ load in0~in7\n"
-                "vst1.32 {d8-d9},  [%[dout]]!           @ write to output\n"
-                "subs   %[cnt], #1                      @ loop count -1\n"
-                "bne    0b                              @ to main loop\n"
-
-            :[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop)
-            :[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero)
-            :"q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"
-            );
+      asm volatile(
+          "vld1.32 {d0-d3}, [%[din]]!             @ load in0~in7\n"
+          "0:                                     @ main loop\n"
+          "vand.i32   q4, %q[vpoff], %q[vpoff]    @ set offset, 0.5\n"
+          "vand.i32   q5, q4, q4                  @ set offset, 0.5\n"
+          "vand.i32   q6, q4, q4                  @ set offset, 0.5\n"
+          "vand.i32   q7, q4, q4                  @ set offset, 0.5\n"
+          "vcgt.f32   q8, q0, %q[vzero]           @ get mask > 0, in0\n"
+          "vcgt.f32   q9, q1, %q[vzero]           @ get mask > 0, in1\n"
+          "vbif.f32   q4, %q[vnoff], q8           @ get right offset\n"
+          "vbif.f32   q5, %q[vnoff], q9           @ get right offset\n"
+          "vmla.f32   q4, q0, %q[vscale]          @ mul scale\n"
+          "vmla.f32   q5, q1, %q[vscale]          @ mul scale\n"
+          "vcvt.s32.f32  q0, q4                   @ cvt to int32\n"
+          "vcvt.s32.f32  q1, q5                   @ cvt to int32\n"
+          "vqmovn.s32 d8, q0                      @ cnt to int16\n"
+          "vqmovn.s32 d9, q1                      @ cnt to int16\n"
+          "vld1.32 {d0-d3},  [%[din]]!            @ load in0~in7\n"
+          "vst1.32 {d8-d9},  [%[dout]]!           @ write to output\n"
+          "subs   %[cnt], #1                      @ loop count -1\n"
+          "bne    0b                              @ to main loop\n"
+
+          : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
+          : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
+            [vzero] "w"(vzero)
+          : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
 #endif
-        }
-        const float* din_r = din_c + 8 * cnt;
-        int16_t* dout_r = dout_c + 8 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = saturate_cast<int16_t>(roundf(inv_scale * din_r[i]));
-        }
    }
+    const float* din_r = din_c + 8 * cnt;
+    int16_t* dout_r = dout_c + 8 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = saturate_cast<int16_t>(roundf(inv_scale * din_r[i]));
+    }
+  }
 }

 void int8_to_fp32(const signed char* in, float* out, const float* scale,
-    int axis_size, long long outer_size, long long inner_size) {
-
-    int cnt = inner_size / 16;
-    int remain = inner_size & 15;
-    long long loop_size = axis_size * outer_size;
+                  int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 16;
+  int remain = inner_size & 15;
+  int64_t loop_size = axis_size * outer_size;
 #pragma omp parallel for
-    for (long long n = 0; n < loop_size; ++n) {
-        float in_scale = scale[n % axis_size];
-        const signed char* din_c = in + n * inner_size;
-        float* dout_c = out + n * inner_size;
-        float32x4_t vscale = vdupq_n_f32(in_scale);
-        if (cnt > 0) {
-            int loop = cnt;
-            const signed char* din_ptr = din_c;
-            float* dout_ptr = dout_c;
+  for (int64_t n = 0; n < loop_size; ++n) {
+    float in_scale = scale[n % axis_size];
+    const signed char* din_c = in + n * inner_size;
+    float* dout_c = out + n * inner_size;
+    float32x4_t vscale = vdupq_n_f32(in_scale);
+    if (cnt > 0) {
+      int loop = cnt;
+      const signed char* din_ptr = din_c;
+      float* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-            "ldp     d0, d1, [%[in]], #16               \n" /* load 16 int8*/
-                    "0:                                 \n" /* main loop */
-                    "sshll   v2.8h, v0.8b, #0           \n" /* trans to int16*/
-                    "sshll   v3.8h, v1.8b, #0           \n" /* trans to int16*/
-
-                    "sshll   v4.4s, v2.4h, #0           \n" /* trans to int32*/
-                    "sshll2  v5.4s, v2.8h, #0           \n" /* trans to int32*/
-                    "sshll   v6.4s, v3.4h, #0           \n" /* trans to int32*/
-                    "sshll2  v7.4s, v3.8h, #0           \n" /* trans to int32*/
-
-                    "ldp     d0, d1, [%[in]], #16       \n" /* load 16 int8*/
-
-                    "scvtf   v8.4s, v4.4s               \n" /* trans to fp32*/
-                    "scvtf   v9.4s, v5.4s               \n" /* trans to fp32*/
-                    "scvtf   v10.4s, v6.4s              \n" /* trans to fp32*/
-                    "scvtf   v11.4s, v7.4s              \n" /* trans to fp32*/
-
-                    "subs    %[loop], %[loop], #1       \n"
-
-                    "fmul    v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
-                    "fmul    v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
-                    "fmul    v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/
-                    "fmul    v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/
-
-                    "stp     q4, q5, [%[out]], #32      \n" /* write to memory*/
-                    "stp     q6, q7, [%[out]], #32      \n" /* write to memory*/
-
-                    "bne     0b                         \n"
-                 :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                 :[scale] "w" (vscale)
-                 :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
-            );
+      asm volatile(
+          "ldp     d0, d1, [%[in]], #16               \n" /* load 16 int8*/
+          "0:                                 \n"         /* main loop */
+          "sshll   v2.8h, v0.8b, #0           \n"         /* trans to int16*/
+          "sshll   v3.8h, v1.8b, #0           \n"         /* trans to int16*/
+
+          "sshll   v4.4s, v2.4h, #0           \n" /* trans to int32*/
+          "sshll2  v5.4s, v2.8h, #0           \n" /* trans to int32*/
+          "sshll   v6.4s, v3.4h, #0           \n" /* trans to int32*/
+          "sshll2  v7.4s, v3.8h, #0           \n" /* trans to int32*/
+
+          "ldp     d0, d1, [%[in]], #16       \n" /* load 16 int8*/
+
+          "scvtf   v8.4s, v4.4s               \n" /* trans to fp32*/
+          "scvtf   v9.4s, v5.4s               \n" /* trans to fp32*/
+          "scvtf   v10.4s, v6.4s              \n" /* trans to fp32*/
+          "scvtf   v11.4s, v7.4s              \n" /* trans to fp32*/
+
+          "subs    %[loop], %[loop], #1       \n"
+
+          "fmul    v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul    v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul    v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/
+          "fmul    v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/
+
+          "stp     q4, q5, [%[out]], #32      \n" /* write to memory*/
+          "stp     q6, q7, [%[out]], #32      \n" /* write to memory*/
+
+          "bne     0b                         \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+            "v11");
 #else
-            asm volatile(
-            "vld1.32    {d0-d1},    [%[in]]!            @ load 16 int8\n"
-                    "0:                                 @ main loop\n"
-                    "vmovl.s8      q2, d0               @ trans to int16\n"
-                    "vmovl.s8      q3, d1               @ trans to int16\n"
-                    "vmovl.s16     q4, d4               @ trans to int32\n"
-                    "vmovl.s16     q5, d5               @ trans to int32\n"
-                    "vmovl.s16     q6, d6               @ trans to int32\n"
-                    "vmovl.s16     q7, d7               @ trans to int32\n"
-                    "vcvt.f32.s32  q0, q4               @ trans to fp32\n"
-                    "vcvt.f32.s32  q1, q5               @ trans to fp32\n"
-                    "vcvt.f32.s32  q2, q6               @ trans to fp32\n"
-                    "vcvt.f32.s32  q3, q7               @ trans to fp32\n"
-                    "vmul.f32      q4, q0, %q[scale]    @ mul with scale\n"
-                    "vmul.f32      q5, q1, %q[scale]    @ mul with scale\n"
-                    "vmul.f32      q6, q2, %q[scale]    @ mul with scale\n"
-                    "vmul.f32      q7, q3, %q[scale]    @ mul with scale\n"
-
-                    "vld1.32    {d0-d1},    [%[in]]!    @ load 16 int8\n"
-
-                    "subs          %[loop], #1            \n"
-
-                    "vst1.f32      {d8-d11}, [%[out]]!  @ write to memory\n"
-                    "vst1.f32      {d12-d15}, [%[out]]! @ write to memory\n"
-
-                    "bne           0b                     \n"
-            :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-            :[scale] "w" (vscale)
-            :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
-            );
-#endif //__aarch64__
-        }
-        const signed char* din_r = din_c + 16 * cnt;
-        float* dout_r = dout_c + 16 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = in_scale * din_r[i];
-        }
+      asm volatile(
+          "vld1.32    {d0-d1},    [%[in]]!            @ load 16 int8\n"
+          "0:                                 @ main loop\n"
+          "vmovl.s8      q2, d0               @ trans to int16\n"
+          "vmovl.s8      q3, d1               @ trans to int16\n"
+          "vmovl.s16     q4, d4               @ trans to int32\n"
+          "vmovl.s16     q5, d5               @ trans to int32\n"
+          "vmovl.s16     q6, d6               @ trans to int32\n"
+          "vmovl.s16     q7, d7               @ trans to int32\n"
+          "vcvt.f32.s32  q0, q4               @ trans to fp32\n"
+          "vcvt.f32.s32  q1, q5               @ trans to fp32\n"
+          "vcvt.f32.s32  q2, q6               @ trans to fp32\n"
+          "vcvt.f32.s32  q3, q7               @ trans to fp32\n"
+          "vmul.f32      q4, q0, %q[scale]    @ mul with scale\n"
+          "vmul.f32      q5, q1, %q[scale]    @ mul with scale\n"
+          "vmul.f32      q6, q2, %q[scale]    @ mul with scale\n"
+          "vmul.f32      q7, q3, %q[scale]    @ mul with scale\n"
+
+          "vld1.32    {d0-d1},    [%[in]]!    @ load 16 int8\n"
+
+          "subs          %[loop], #1            \n"
+
+          "vst1.f32      {d8-d11}, [%[out]]!  @ write to memory\n"
+          "vst1.f32      {d12-d15}, [%[out]]! @ write to memory\n"
+
+          "bne           0b                     \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+#endif  // __aarch64__
+    }
+    const signed char* din_r = din_c + 16 * cnt;
+    float* dout_r = dout_c + 16 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = in_scale * din_r[i];
    }
+  }
 }

-void int16_to_fp32(const short* in, float* out, const float* scale,
-        int axis_size, long long outer_size, long long inner_size) {
-
-    int cnt = inner_size / 16;
-    int remain = inner_size & 15;
-    long long loop_size = axis_size * outer_size;
+void int16_to_fp32(const int16_t* in, float* out, const float* scale,
+                   int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 16;
+  int remain = inner_size & 15;
+  int64_t loop_size = axis_size * outer_size;
 #pragma omp parallel for
-    for (long long n = 0; n < loop_size; ++n) {
-        float in_scale = scale[n % axis_size];
-        const short* din_c = in + n * inner_size;
-        float* dout_c = out + n * inner_size;
-        float32x4_t vscale = vdupq_n_f32(in_scale);
-        if (cnt > 0) {
-            int loop = cnt;
-            const short* din_ptr = din_c;
-            float* dout_ptr = dout_c;
+  for (int64_t n = 0; n < loop_size; ++n) {
+    float in_scale = scale[n % axis_size];
+    const int16_t* din_c = in + n * inner_size;
+    float* dout_c = out + n * inner_size;
+    float32x4_t vscale = vdupq_n_f32(in_scale);
+    if (cnt > 0) {
+      int loop = cnt;
+      const int16_t* din_ptr = din_c;
+      float* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-            "ldp     q0, q1, [%[in]], #32               \n" /* load 16 int16*/
-                    "0:                                 \n" /* main loop */
-                    "sshll   v4.4s, v0.4h, #0           \n" /* trans to int32*/
-                    "sshll2  v5.4s, v0.8h, #0           \n" /* trans to int32*/
-                    "sshll   v6.4s, v1.4h, #0           \n" /* trans to int32*/
-                    "sshll2  v7.4s, v1.8h, #0           \n" /* trans to int32*/
-
-                    "ldp     q0, q1, [%[in]], #32       \n" /* load 16 int16*/
-
-                    "scvtf   v8.4s, v4.4s               \n" /* trans to fp32*/
-                    "scvtf   v9.4s, v5.4s               \n" /* trans to fp32*/
-                    "scvtf   v10.4s, v6.4s              \n" /* trans to fp32*/
-                    "scvtf   v11.4s, v7.4s              \n" /* trans to fp32*/
-
-                    "subs    %[loop], %[loop], #1       \n"
-
-                    "fmul    v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
-                    "fmul    v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
-                    "fmul    v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/
-                    "fmul    v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/
-
-                    "stp     q4, q5, [%[out]], #32      \n" /* write to memory*/
-                    "stp     q6, q7, [%[out]], #32      \n" /* write to memory*/
-
-                    "bne     0b                         \n"
-                 :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                 :[scale] "w" (vscale)
-                 :"v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
-            );
+      asm volatile(
+          "ldp     q0, q1, [%[in]], #32               \n" /* load 16 int16*/
+          "0:                                 \n"         /* main loop */
+          "sshll   v4.4s, v0.4h, #0           \n"         /* trans to int32*/
+          "sshll2  v5.4s, v0.8h, #0           \n"         /* trans to int32*/
+          "sshll   v6.4s, v1.4h, #0           \n"         /* trans to int32*/
+          "sshll2  v7.4s, v1.8h, #0           \n"         /* trans to int32*/
+
+          "ldp     q0, q1, [%[in]], #32       \n" /* load 16 int16*/
+
+          "scvtf   v8.4s, v4.4s               \n" /* trans to fp32*/
+          "scvtf   v9.4s, v5.4s               \n" /* trans to fp32*/
+          "scvtf   v10.4s, v6.4s              \n" /* trans to fp32*/
+          "scvtf   v11.4s, v7.4s              \n" /* trans to fp32*/
+
+          "subs    %[loop], %[loop], #1       \n"
+
+          "fmul    v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul    v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul    v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/
+          "fmul    v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/
+
+          "stp     q4, q5, [%[out]], #32      \n" /* write to memory*/
+          "stp     q6, q7, [%[out]], #32      \n" /* write to memory*/
+
+          "bne     0b                         \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
 #else
-            asm volatile(
-            "vld1.32    {d0-d3},    [%[in]]!            @ load 16 int16\n"
-                    "0:                                 @ main loop\n"
-                    "vmovl.s16     q4, d0               @ trans to int32\n"
-                    "vmovl.s16     q5, d1               @ trans to int32\n"
-                    "vmovl.s16     q6, d2               @ trans to int32\n"
-                    "vmovl.s16     q7, d3               @ trans to int32\n"
-                    "vcvt.f32.s32  q0, q4               @ trans to fp32\n"
-                    "vcvt.f32.s32  q1, q5               @ trans to fp32\n"
-                    "vcvt.f32.s32  q2, q6               @ trans to fp32\n"
-                    "vcvt.f32.s32  q3, q7               @ trans to fp32\n"
-                    "vmul.f32      q4, q0, %q[scale]    @ mul with scale\n"
-                    "vmul.f32      q5, q1, %q[scale]    @ mul with scale\n"
-                    "vmul.f32      q6, q2, %q[scale]    @ mul with scale\n"
-                    "vmul.f32      q7, q3, %q[scale]    @ mul with scale\n"
-
-                    "vld1.32    {d0-d3},    [%[in]]!    @ load 16 int8\n"
-
-                    "subs          %[loop], #1            \n"
-
-                    "vst1.f32      {d8-d11}, [%[out]]!  @ write to memory\n"
-                    "vst1.f32      {d12-d15}, [%[out]]! @ write to memory\n"
-
-                    "bne           0b                     \n"
-            :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-            :[scale] "w" (vscale)
-            :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
-            );
-#endif //__aarch64__
-        }
-        const short* din_r = din_c + 16 * cnt;
-        float* dout_r = dout_c + 16 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = in_scale * din_r[i];
-        }
+      asm volatile(
+          "vld1.32    {d0-d3},    [%[in]]!            @ load 16 int16\n"
+          "0:                                 @ main loop\n"
+          "vmovl.s16     q4, d0               @ trans to int32\n"
+          "vmovl.s16     q5, d1               @ trans to int32\n"
+          "vmovl.s16     q6, d2               @ trans to int32\n"
+          "vmovl.s16     q7, d3               @ trans to int32\n"
+          "vcvt.f32.s32  q0, q4               @ trans to fp32\n"
+          "vcvt.f32.s32  q1, q5               @ trans to fp32\n"
+          "vcvt.f32.s32  q2, q6               @ trans to fp32\n"
+          "vcvt.f32.s32  q3, q7               @ trans to fp32\n"
+          "vmul.f32      q4, q0, %q[scale]    @ mul with scale\n"
+          "vmul.f32      q5, q1, %q[scale]    @ mul with scale\n"
+          "vmul.f32      q6, q2, %q[scale]    @ mul with scale\n"
+          "vmul.f32      q7, q3, %q[scale]    @ mul with scale\n"
+
+          "vld1.32    {d0-d3},    [%[in]]!    @ load 16 int8\n"
+
+          "subs          %[loop], #1            \n"
+
+          "vst1.f32      {d8-d11}, [%[out]]!  @ write to memory\n"
+          "vst1.f32      {d12-d15}, [%[out]]! @ write to memory\n"
+
+          "bne           0b                     \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+#endif  // __aarch64__
+    }
+    const int16_t* din_r = din_c + 16 * cnt;
+    float* dout_r = dout_c + 16 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = in_scale * din_r[i];
    }
+  }
 }

 void int32_to_fp32(const int* din, float* dout, const float* scale,
-    int axis_size, long long outer_size, long long inner_size) {
-    int cnt = inner_size / 16;
-    int remain = inner_size & 15;
-    long long loop_size = axis_size * outer_size;
+                   int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 16;
+  int remain = inner_size & 15;
+  int64_t loop_size = axis_size * outer_size;
 #pragma omp parallel for
-    for (long long n = 0; n < loop_size; ++n) {
-        float in_scale = scale[n % axis_size];
-        const int* din_c = din + n * inner_size;
-        float* dout_c = dout + n * inner_size;
-        float32x4_t vscale = vdupq_n_f32(in_scale);
-        if (cnt > 0) {
-            int loop = cnt;
-            const int* din_ptr = din_c;
-            float* dout_ptr = dout_c;
+  for (int64_t n = 0; n < loop_size; ++n) {
+    float in_scale = scale[n % axis_size];
+    const int* din_c = din + n * inner_size;
+    float* dout_c = dout + n * inner_size;
+    float32x4_t vscale = vdupq_n_f32(in_scale);
+    if (cnt > 0) {
+      int loop = cnt;
+      const int* din_ptr = din_c;
+      float* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-            "ldp     q0, q1, [%[in]], #32               \n"
-                    "ldp  q2, q3, [%[in]], #32          \n"
-                    "0:                                 \n"
-                    "scvtf   v4.4s, v0.4s               \n"
-                    "scvtf   v5.4s, v1.4s               \n"
-                    "scvtf   v6.4s, v2.4s               \n"
-                    "scvtf   v7.4s, v3.4s               \n"
-                    "ldp  q0, q1, [%[in]], #32          \n"
-                    "fmul    v8.4s, v4.4s, %[scale].4s  \n"
-                    "fmul    v9.4s, v5.4s, %[scale].4s  \n"
-                    "fmul    v10.4s, v6.4s, %[scale].4s \n"
-                    "fmul    v11.4s, v7.4s, %[scale].4s \n"
-                    "ldp  q2, q3, [%[in]], #32          \n"
-                    "stp     q8, q9, [%[out]], #32      \n"
-                    "stp     q10, q11, [%[out]], #32    \n"
-                    "subs    %[loop], %[loop], #1       \n"
-                    "bne     0b                         \n"
-                 :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                 :[scale] "w" (vscale)
-                 :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
-                 );
+      asm volatile(
+          "ldp     q0, q1, [%[in]], #32               \n"
+          "ldp  q2, q3, [%[in]], #32          \n"
+          "0:                                 \n"
+          "scvtf   v4.4s, v0.4s               \n"
+          "scvtf   v5.4s, v1.4s               \n"
+          "scvtf   v6.4s, v2.4s               \n"
+          "scvtf   v7.4s, v3.4s               \n"
+          "ldp  q0, q1, [%[in]], #32          \n"
+          "fmul    v8.4s, v4.4s, %[scale].4s  \n"
+          "fmul    v9.4s, v5.4s, %[scale].4s  \n"
+          "fmul    v10.4s, v6.4s, %[scale].4s \n"
+          "fmul    v11.4s, v7.4s, %[scale].4s \n"
+          "ldp  q2, q3, [%[in]], #32          \n"
+          "stp     q8, q9, [%[out]], #32      \n"
+          "stp     q10, q11, [%[out]], #32    \n"
+          "subs    %[loop], %[loop], #1       \n"
+          "bne     0b                         \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+            "v11");
 #else
-            asm volatile(
-            "vld1.s32       {d0-d3}, [%[in]]!               \n"
-                    "vld1.s32       {d4-d7}, [%[in]]!       \n"
-                    "0:                                     \n"
-                    "vcvt.f32.s32   q4, q0                  \n"
-                    "vcvt.f32.s32   q5, q1                  \n"
-                    "vcvt.f32.s32   q6, q2                  \n"
-                    "vcvt.f32.s32   q7, q3                  \n"
-                    "vld1.s32       {d0-d3}, [%[in]]!       \n"
-                    "vmul.f32       q8, q4, %q[scale]       \n"
-                    "vmul.f32       q9, q5, %q[scale]       \n"
-                    "vmul.f32       q10, q6, %q[scale]      \n"
-                    "vmul.f32       q11, q7, %q[scale]      \n"
-                    "vld1.s32       {d4-d7}, [%[in]]!       \n"
-                    "subs           %[loop], #1             \n"
-                    "vst1.f32       {d16-d19}, [%[out]]!    \n"
-                    "vst1.f32       {d20-d23}, [%[out]]!    \n"
-                    "bne            0b                      \n"
-            :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-            :[scale] "w" (vscale)
-            :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"
-            );
-#endif //__aarch64__
-        }
-        const int* din_r = din_c + 16 * cnt;
-        float* dout_r = dout_c + 16 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = in_scale * din_r[i];
-        }
+      asm volatile(
+          "vld1.s32       {d0-d3}, [%[in]]!               \n"
+          "vld1.s32       {d4-d7}, [%[in]]!       \n"
+          "0:                                     \n"
+          "vcvt.f32.s32   q4, q0                  \n"
+          "vcvt.f32.s32   q5, q1                  \n"
+          "vcvt.f32.s32   q6, q2                  \n"
+          "vcvt.f32.s32   q7, q3                  \n"
+          "vld1.s32       {d0-d3}, [%[in]]!       \n"
+          "vmul.f32       q8, q4, %q[scale]       \n"
+          "vmul.f32       q9, q5, %q[scale]       \n"
+          "vmul.f32       q10, q6, %q[scale]      \n"
+          "vmul.f32       q11, q7, %q[scale]      \n"
+          "vld1.s32       {d4-d7}, [%[in]]!       \n"
+          "subs           %[loop], #1             \n"
+          "vst1.f32       {d16-d19}, [%[out]]!    \n"
+          "vst1.f32       {d20-d23}, [%[out]]!    \n"
+          "bne            0b                      \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+            "q11");
+#endif  // __aarch64__
    }
+    const int* din_r = din_c + 16 * cnt;
+    float* dout_r = dout_c + 16 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = in_scale * din_r[i];
+    }
+  }
 }

-void int32_to_int8(const int* din, signed char* dout, const float* scale, \
-    int axis_size, long long outer_size, long long inner_size) {
-    int cnt = inner_size / 16;
-    int remain = inner_size & 15;
-    long long loop_size = outer_size * axis_size;
+void int32_to_int8(const int* din, signed char* dout, const float* scale,
+                   int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 16;
+  int remain = inner_size & 15;
+  int64_t loop_size = outer_size * axis_size;
 #pragma omp parallel for
-    for (long long n = 0; n < loop_size; ++n) {
-        float in_scale = scale[n % axis_size];
-        const int* din_c = din + n * inner_size;
-        signed char* dout_c = dout + n * inner_size;
-        float32x4_t vscale = vdupq_n_f32(in_scale);
-        float32x4_t vzero = vdupq_n_f32(0.f);
-        float32x4_t vpoff = vdupq_n_f32(0.5f);
-        float32x4_t vnoff = vdupq_n_f32(-0.5f);
-        if (cnt > 0) {
-            int loop = cnt;
-            const int* din_ptr = din_c;
-            signed char* dout_ptr = dout_c;
+  for (int64_t n = 0; n < loop_size; ++n) {
+    float in_scale = scale[n % axis_size];
+    const int* din_c = din + n * inner_size;
+    signed char* dout_c = dout + n * inner_size;
+    float32x4_t vscale = vdupq_n_f32(in_scale);
+    float32x4_t vzero = vdupq_n_f32(0.f);
+    float32x4_t vpoff = vdupq_n_f32(0.5f);
+    float32x4_t vnoff = vdupq_n_f32(-0.5f);
+    if (cnt > 0) {
+      int loop = cnt;
+      const int* din_ptr = din_c;
+      signed char* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-                 "0:                                        \n"
-                 "ld1     {v0.4s, v1.4s}, [%[in]], #32      \n"
-                 "ld1     {v2.4s, v3.4s}, [%[in]], #32      \n"
-
-                 "scvtf   v4.4s, v0.4s                      \n"
-                 "scvtf   v5.4s, v1.4s                      \n"
-                 "scvtf   v6.4s, v2.4s                      \n"
-                 "scvtf   v7.4s, v3.4s                      \n"
-
-                 "fmul    v0.4s, v4.4s, %[scale].4s         \n"
-                 "fmul    v1.4s, v5.4s, %[scale].4s         \n"
-                 "fmul    v2.4s, v6.4s, %[scale].4s         \n"
-                 "fmul    v3.4s, v7.4s, %[scale].4s         \n"
-
-                 "fcvtas  v4.4s, v0.4s                      \n"
-                 "fcvtas  v5.4s, v1.4s                      \n"
-                 "fcvtas  v6.4s, v2.4s                      \n"
-                 "fcvtas  v7.4s, v3.4s                      \n"
-
-                 "sqxtn   v0.4h, v4.4s                      \n"
-                 "sqxtn2  v0.8h, v5.4s                      \n"
-                 "sqxtn   v1.4h, v6.4s                      \n"
-                 "sqxtn2  v1.8h, v7.4s                      \n"
-
-                 "sqxtn   v2.8b, v0.8h                      \n"
-                 "sqxtn2  v2.16b, v1.8h                     \n"
-
-                 "st1     {v2.16b}, [%[out]], #16           \n"
-                 "subs    %[loop], %[loop], #1              \n"
-                 "bne     0b                                \n"
-                 :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                 :[scale] "w" (vscale)
-                 :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-                 );
+      asm volatile(
+          "0:                                        \n"
+          "ld1     {v0.4s, v1.4s}, [%[in]], #32      \n"
+          "ld1     {v2.4s, v3.4s}, [%[in]], #32      \n"
+
+          "scvtf   v4.4s, v0.4s                      \n"
+          "scvtf   v5.4s, v1.4s                      \n"
+          "scvtf   v6.4s, v2.4s                      \n"
+          "scvtf   v7.4s, v3.4s                      \n"
+
+          "fmul    v0.4s, v4.4s, %[scale].4s         \n"
+          "fmul    v1.4s, v5.4s, %[scale].4s         \n"
+          "fmul    v2.4s, v6.4s, %[scale].4s         \n"
+          "fmul    v3.4s, v7.4s, %[scale].4s         \n"
+
+          "fcvtas  v4.4s, v0.4s                      \n"
+          "fcvtas  v5.4s, v1.4s                      \n"
+          "fcvtas  v6.4s, v2.4s                      \n"
+          "fcvtas  v7.4s, v3.4s                      \n"
+
+          "sqxtn   v0.4h, v4.4s                      \n"
+          "sqxtn2  v0.8h, v5.4s                      \n"
+          "sqxtn   v1.4h, v6.4s                      \n"
+          "sqxtn2  v1.8h, v7.4s                      \n"
+
+          "sqxtn   v2.8b, v0.8h                      \n"
+          "sqxtn2  v2.16b, v1.8h                     \n"
+
+          "st1     {v2.16b}, [%[out]], #16           \n"
+          "subs    %[loop], %[loop], #1              \n"
+          "bne     0b                                \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 #else
-            asm volatile(
-            "vld1.32 {d0-d3},    [%[din]]!                  @ load in0~in7\n"
-                    "vld1.32    {d4-d7},    [%[din]]!       @ load in8~in16\n"
-                    "0:                                     @ main loop\n"
-                    "vcvt.f32.s32   q4, q0                  @ cvt to float\n"
-                    "vcvt.f32.s32   q5, q1                  @ cvt to float\n"
-                    "vcvt.f32.s32   q6, q2                  @ cvt to float\n"
-                    "vcvt.f32.s32   q7, q3                  @ cvt to float\n"
-                    "vand.i32   q0, %q[vpoff], %q[vpoff]    @ set offset, 0.5\n"
-                    "vand.i32   q1, q0, q0                  @ set offset, 0.5\n"
-                    "vand.i32   q2, q0, q0                  @ set offset, 0.5\n"
-                    "vand.i32   q3, q0, q0                  @ set offset, 0.5\n"
-                    "vcgt.f32   q8, q4, %q[vzero]           @ get mask > 0, in0\n"
-                    "vcgt.f32   q9, q5, %q[vzero]           @ get mask > 0, in1\n"
-                    "vcgt.f32   q10, q6, %q[vzero]          @ get mask > 0, in2\n"
-                    "vcgt.f32   q11, q7, %q[vzero]          @ get mask > 0, in3\n"
-                    "vbif.f32   q0, %q[vnoff], q8           @ get right offset\n"
-                    "vbif.f32   q1, %q[vnoff], q9           @ get right offset\n"
-                    "vbif.f32   q2, %q[vnoff], q10          @ get right offset\n"
-                    "vbif.f32   q3, %q[vnoff], q11          @ get right offset\n"
-                    "vmla.f32   q0, q4, %q[vscale]          @ mul scale\n"
-                    "vmla.f32   q1, q5, %q[vscale]          @ mul scale\n"
-                    "vmla.f32   q2, q6, %q[vscale]          @ mul scale\n"
-                    "vmla.f32   q3, q7, %q[vscale]          @ mul scale\n"
-                    "vcvt.s32.f32  q4, q0                   @ cvt to int32\n"
-                    "vcvt.s32.f32  q5, q1                   @ cvt to int32\n"
-                    "vcvt.s32.f32  q6, q2                   @ cvt to int32\n"
-                    "vcvt.s32.f32  q7, q3                   @ cvt to int32\n"
-                    "vqmovn.s32 d16, q4                     @ cnt to int16\n"
-                    "vqmovn.s32 d17, q5                     @ cnt to int16\n"
-                    "vqmovn.s32 d18, q6                     @ cnt to int16\n"
-                    "vqmovn.s32 d19, q7                     @ cnt to int16\n"
-                    "vld1.32 {d0-d3},    [%[din]]!          @ load in0~in7\n"
-                    "vqmovn.s16 d8, q8                      @ cnt to int8\n"
-                    "vqmovn.s16 d9, q9                      @ cnt to int8\n"
-                    "vld1.32 {d4-d7},    [%[din]]!          @ load in8~in16\n"
-                    "vst1.32 {d8-d9},    [%[dout]]!         @ write to output\n"
-                    "subs   %[loop], #1                     @ loop count -1\n"
-                    "bne    0b                              @ to main loop\n"
-            :[loop] "+r" (loop), [din] "+r" (din_ptr), [dout] "+r" (dout_ptr)
-            :[vscale] "w" (vscale), [vzero] "w"(vzero), [vnoff] "w" (vnoff), [vpoff] "w" (vpoff)
-            :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"
-            );
-#endif //__aarch64__
-        }
-        const int* din_r = din_c + 16 * cnt;
-        int8_t* dout_r = dout_c + 16 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
-        }
+      asm volatile(
+          "vld1.32 {d0-d3},    [%[din]]!                  @ load in0~in7\n"
+          "vld1.32    {d4-d7},    [%[din]]!       @ load in8~in16\n"
+          "0:                                     @ main loop\n"
+          "vcvt.f32.s32   q4, q0                  @ cvt to float\n"
+          "vcvt.f32.s32   q5, q1                  @ cvt to float\n"
+          "vcvt.f32.s32   q6, q2                  @ cvt to float\n"
+          "vcvt.f32.s32   q7, q3                  @ cvt to float\n"
+          "vand.i32   q0, %q[vpoff], %q[vpoff]    @ set offset, 0.5\n"
+          "vand.i32   q1, q0, q0                  @ set offset, 0.5\n"
+          "vand.i32   q2, q0, q0                  @ set offset, 0.5\n"
+          "vand.i32   q3, q0, q0                  @ set offset, 0.5\n"
+          "vcgt.f32   q8, q4, %q[vzero]           @ get mask > 0, in0\n"
+          "vcgt.f32   q9, q5, %q[vzero]           @ get mask > 0, in1\n"
+          "vcgt.f32   q10, q6, %q[vzero]          @ get mask > 0, in2\n"
+          "vcgt.f32   q11, q7, %q[vzero]          @ get mask > 0, in3\n"
+          "vbif.f32   q0, %q[vnoff], q8           @ get right offset\n"
+          "vbif.f32   q1, %q[vnoff], q9           @ get right offset\n"
+          "vbif.f32   q2, %q[vnoff], q10          @ get right offset\n"
+          "vbif.f32   q3, %q[vnoff], q11          @ get right offset\n"
+          "vmla.f32   q0, q4, %q[vscale]          @ mul scale\n"
+          "vmla.f32   q1, q5, %q[vscale]          @ mul scale\n"
+          "vmla.f32   q2, q6, %q[vscale]          @ mul scale\n"
+          "vmla.f32   q3, q7, %q[vscale]          @ mul scale\n"
+          "vcvt.s32.f32  q4, q0                   @ cvt to int32\n"
+          "vcvt.s32.f32  q5, q1                   @ cvt to int32\n"
+          "vcvt.s32.f32  q6, q2                   @ cvt to int32\n"
+          "vcvt.s32.f32  q7, q3                   @ cvt to int32\n"
+          "vqmovn.s32 d16, q4                     @ cnt to int16\n"
+          "vqmovn.s32 d17, q5                     @ cnt to int16\n"
+          "vqmovn.s32 d18, q6                     @ cnt to int16\n"
+          "vqmovn.s32 d19, q7                     @ cnt to int16\n"
+          "vld1.32 {d0-d3},    [%[din]]!          @ load in0~in7\n"
+          "vqmovn.s16 d8, q8                      @ cnt to int8\n"
+          "vqmovn.s16 d9, q9                      @ cnt to int8\n"
+          "vld1.32 {d4-d7},    [%[din]]!          @ load in8~in16\n"
+          "vst1.32 {d8-d9},    [%[dout]]!         @ write to output\n"
+          "subs   %[loop], #1                     @ loop count -1\n"
+          "bne    0b                              @ to main loop\n"
+          : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
+          : [vscale] "w"(vscale), [vzero] "w"(vzero), [vnoff] "w"(vnoff),
+            [vpoff] "w"(vpoff)
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+            "q11");
+#endif  // __aarch64__
+    }
+    const int* din_r = din_c + 16 * cnt;
+    int8_t* dout_r = dout_c + 16 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
    }
+  }
 }

-void int32_to_int32(const int* din, int* dout, const float* scale, \
-    int axis_size, long long outer_size, long long inner_size) {
-    int size_all = outer_size * axis_size * inner_size;
-    memmove(dout, din, size_all*sizeof(int));
+void int32_to_int32(const int* din, int* dout, const float* scale,
+                    int axis_size, int64_t outer_size, int64_t inner_size) {
+  int size_all = outer_size * axis_size * inner_size;
+  memmove(dout, din, size_all * sizeof(int));
 }

 template <>
 void int32_to_dtype(const int* din, float* dout, const float* scale,
-    int axis_size, long long outer_size, long long inner_size) {
-
-    return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size);
+                    int axis_size, int64_t outer_size, int64_t inner_size) {
+  return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size);
 }

 template <>
 void int32_to_dtype(const int* din, signed char* dout, const float* scale,
-    int axis_size, long long outer_size, long long inner_size) {
-
-    return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size);
+                    int axis_size, int64_t outer_size, int64_t inner_size) {
+  return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size);
 }

 template <>
 void int32_to_dtype(const int* din, int* dout, const float* scale,
-    int axis_size, long long outer_size, long long inner_size) {
-
-    return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size);
+                    int axis_size, int64_t outer_size, int64_t inner_size) {
+  return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size);
 }

 }  // namespace math

--- a/paddle/fluid/lite/core/CMakeLists.txt
+++ b/paddle/fluid/lite/core/CMakeLists.txt
@@ -24,13 +24,14 @@ cc_library(variable_lite SRCS variable.cc)
 cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
 cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
 cc_library(cpu_info_lite SRCS cpu_info.cc)
-cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite)
+lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite eigen3)
 cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite
  cpp_op_desc_lite ${tensor_lite})
 cc_library(types_lite SRCS types.cc)
 cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite)

-lite_cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite HVY_DEPS framework_proto)
+lite_cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite HVY_DEPS framework_proto
+    PROFILE_DEPS basic_profiler_lite)
 cc_library(optimizer_lite SRCS optimizer.cc DEPS mir_pass_manager model_parser_lite program_lite)

 add_subdirectory(mir)
@@ -56,3 +57,4 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li
 lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite)
 lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite)
 lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator)
+ 
--- a/paddle/fluid/lite/core/cpu_info.cc
+++ b/paddle/fluid/lite/core/cpu_info.cc
@@ -54,15 +54,15 @@ void DeviceInfo::InitInternal(DeviceInfo* dev) {
              << ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
              << ", CPU ARCH: A" << dev->archs_[i];
  }
-  LOG(INFO) << "L1 DataCache size is: ";
+  VLOG(1) << "L1 DataCache size is: ";
  for (int i = 0; i < dev->compute_core_num_; ++i) {
-    LOG(INFO) << dev->L1_cache_[i] / 1024 << " KB";
+    VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
  }
-  LOG(INFO) << "L2 Cache size is: ";
+  VLOG(1) << "L2 Cache size is: ";
  for (int i = 0; i < dev->compute_core_num_; ++i) {
-    LOG(INFO) << dev->L2_cache_[i] / 1024 << " KB";
+    VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
  }
-  LOG(INFO) << "Total memory: " << dev->max_memory_ << "KB";
+  VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";

  dev->max_freq_ = max_freq[0];
  for (int j = 1; j < dev->compute_core_num_; ++j) {

--- a/paddle/fluid/lite/core/hvy_tensor.h
+++ b/paddle/fluid/lite/core/hvy_tensor.h
@@ -107,6 +107,8 @@ class TensorHvy : public TensorBase<TensorHvy> {
    data_.Resize(framework::make_ddim(dims.Vectorize()));
  }

+  void Resize(const std::vector<int64_t>& x) { Resize(DDimHvy(x)); }
+
  void ShareDataWith(const TensorHvy& other) {
    data_.ShareDataWith(other.data_);
  }

--- a/paddle/fluid/lite/core/mir/CMakeLists.txt
+++ b/paddle/fluid/lite/core/mir/CMakeLists.txt
 cc_library(mir_node SRCS node.cc DEPS framework_proto_lite)
-cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node)
+cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node program_lite)
 cc_library(mir_pass SRCS pass.cc DEPS mir_ssa_graph)
 cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir_passes)
 cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager)
@@ -20,14 +20,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    return()
 endif()
 cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes)
-cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
-        mir_ssa_graph scope_lite op_lite
-        fc_op_lite
-        ${host_kernels}
-        mir_passes
-        mir_pass_manager
-        program_fake_utils
-        )
+#cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
+        #mir_ssa_graph scope_lite op_lite
+        #fc_op_lite
+        #${host_kernels}
+        #mir_passes
+        #mir_pass_manager
+        #program_fake_utils
+        #)
 # lite_cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc
 #   DEPS
 #       mul_op_lite
@@ -59,3 +59,4 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
        pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite
        mir_passes compatible_pb_lite program_lite ${ops_lite})
 endif()
+ 
--- a/paddle/fluid/lite/core/naive_test_model.py
+++ b/paddle/fluid/lite/core/naive_test_model.py
@@ -18,10 +18,10 @@ import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid.backward import append_backward

-a = fluid.layers.data(name="a", shape=[100], dtype='float32')
-label = fluid.layers.data(name="label", shape=[100], dtype='float32')
+a = fluid.layers.data(name="a", shape=[2], dtype='float32')
+label = fluid.layers.data(name="label", shape=[10], dtype='float32')

-a1 = fluid.layers.fc(input=a, size=500, act=None, bias_attr=False)
+a1 = fluid.layers.fc(input=a, size=3, act=None, bias_attr=False)

 cost = fluid.layers.square_error_cost(a1, label)
 avg_cost = fluid.layers.mean(cost)
@@ -36,7 +36,7 @@ exe.run(fluid.default_startup_program())
 with open('startup_program.pb', 'wb') as f:
    f.write(fluid.default_startup_program().desc.serialize_to_string())

-data_1 = np.array(numpy.random.random([100, 100]), dtype='float32')
+#data_1 = np.array(numpy.random.random([100, 100]), dtype='float32')

 #fluid.default_main_program().desc.

@@ -50,7 +50,7 @@ with open('main_program.pb', 'wb') as f:

 #outs = exe.run(program=prog, feed={'a':data_1, }, fetch_list=[cost])

-sys.exit(0)
+#sys.exit(0)
 fluid.io.save_inference_model("./model2", [a.name], [a1], exe)

-print(numpy.array(outs))
+#print(numpy.array(outs))
--- a/paddle/fluid/lite/core/profile/CMakeLists.txt
+++ b/paddle/fluid/lite/core/profile/CMakeLists.txt
@@ -4,3 +4,4 @@ endif()

 lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc)
 lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite)
+ 
--- a/paddle/fluid/lite/cuda/CMakeLists.txt
+++ b/paddle/fluid/lite/cuda/CMakeLists.txt
@@ -4,3 +4,4 @@ endif()

 nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
 nv_library(cuda_blas_lite SRCS blas.cc)
+ 
--- a/paddle/fluid/lite/gen_code/CMakeLists.txt
+++ b/paddle/fluid/lite/gen_code/CMakeLists.txt
@@ -18,10 +18,11 @@ if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
        DEPS scope_lite op_lite kernel_lite paddle_infer_gencode
    )

-    lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__
-      ${ops_lite} ${host_kernels}
-      X86_DEPS ${x86_kernels}
-      )
+  #    lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__
+  #    ${ops_lite} ${host_kernels}
+  #    X86_DEPS ${x86_kernels}
+  #    )

-    add_dependencies(__generated_code__ test_gen_code_lite)
+  #  add_dependencies(__generated_code__ test_gen_code_lite)
 endif()
+ 
--- a/paddle/fluid/lite/host/CMakeLists.txt
+++ b/paddle/fluid/lite/host/CMakeLists.txt
 cc_library(target_wrapper_host SRCS target_wrapper.cc)
+ 
--- a/paddle/fluid/lite/kernels/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/CMakeLists.txt
@@ -5,3 +5,4 @@ add_subdirectory(arm)
 add_subdirectory(cuda)
 add_subdirectory(x86)
 
+ 
--- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -6,10 +6,11 @@ message(STATUS "compile with lite ARM kernels")

 cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
-cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
+cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
+cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -18,8 +19,10 @@ lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm mat
 lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
 lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
 lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
+lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm)
 lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
 lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
+lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
 lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)

 set(arm_kernels
@@ -29,6 +32,7 @@ set(arm_kernels
    scale_compute_arm
    softmax_compute_arm
    conv_compute_arm
+    batch_norm_compute_arm
    elementwise_add_compute_arm
    pool_compute_arm
    split_compute_arm
@@ -36,3 +40,4 @@ set(arm_kernels

 set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
 
+ 
--- a/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/type_system.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+void BatchNormCompute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+  auto x_dims = param.x->dims();
+  bool global_stats = param.is_test || param.use_global_stats;
+  if (global_stats) {
+    int64_t channel_size = 0;
+    switch (param.data_layout) {
+      case DATALAYOUT(kNCHW):
+        channel_size = x_dims[1];
+        break;
+      // case DATALAYOUT(kNHWC):
+      //   channel_size = x_dims[x_dims.size() - 1];
+      //   break;
+      default:
+        LOG(FATAL) << "Unknown storage order: "
+                   << DataLayoutToStr(param.data_layout);
+        break;
+    }
+    new_scale.Resize({channel_size});
+    new_bias.Resize({channel_size});
+    auto* scale_data = param.scale->mutable_data<float>();
+    auto* bias_data = param.bias->mutable_data<float>();
+    auto* mean_data = param.mean->mutable_data<float>();
+    auto* variance_data = param.variance->mutable_data<float>();
+    auto* new_scale_data = new_scale.mutable_data<float>();
+    auto* new_bias_data = new_bias.mutable_data<float>();
+    for (int c = 0; c < channel_size; c++) {
+      float inv_scale = 1.f / (std::sqrt(variance_data[c] + param.epsilon));
+      new_bias_data[c] =
+          bias_data[c] - inv_scale * scale_data[c] * mean_data[c];
+      new_scale_data[c] = inv_scale * scale_data[c];
+    }
+  }
+}
+
+void BatchNormCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto x_dims = param.x->dims();
+  auto x_data = param.x->mutable_data<float>();
+  auto y_data = param.y->mutable_data<float>();
+  bool global_stats = param.is_test || param.use_global_stats;
+  if (global_stats) {
+    auto* new_scale_data = new_scale.mutable_data<float>();
+    auto* new_bias_data = new_bias.mutable_data<float>();
+    int64_t outer_size = 0;
+    int64_t channel_size = 0;
+    int64_t inner_size = 0;
+    switch (param.data_layout) {
+      case DATALAYOUT(kNCHW):
+        outer_size = x_dims[0];
+        channel_size = x_dims[1];
+        inner_size = x_dims.Slice(2, x_dims.size()).production();
+        lite::arm::math::scale(x_data, y_data, outer_size, channel_size,
+                               inner_size, new_scale_data, new_bias_data);
+        break;
+      // case DATALAYOUT(kNHWC):
+      //   outer_size = x_dims.Slice(0, x_dims.size() - 1).production();
+      //   channel_size = x_dims[x_dims.size() - 1];
+      //   lite::arm::math::scale(x_data, y_data, outer_size, channel_size,
+      //                          new_scale_data, new_bias_data);
+      //   break;
+      default:
+        LOG(FATAL) << "Unknown storage order: "
+                   << DataLayoutToStr(param.data_layout);
+        break;
+    }
+  } else {
+    // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and
+    // saved_variance
+  }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW,
+                     paddle::lite::kernels::arm::BatchNormCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/arm/batch_norm_compute.h
+++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+class BatchNormCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::BatchNormParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  virtual ~BatchNormCompute() = default;
+
+ private:
+  Tensor new_scale;
+  Tensor new_bias;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h"
+#include <gtest/gtest.h>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+template <typename dtype>
+void batch_norm_compute_ref(const operators::BatchNormParam& param) {
+  DDim x_dims = param.x->dims();
+  auto x_data = param.x->mutable_data<dtype>();
+  auto scale_data = param.scale->mutable_data<dtype>();
+  auto bias_data = param.bias->mutable_data<dtype>();
+  auto mean_data = param.mean->mutable_data<dtype>();
+  auto variance_data = param.variance->mutable_data<dtype>();
+  auto y_data = param.y->mutable_data<dtype>();
+  float epsilon = param.epsilon;
+  float momentum = param.momentum;
+  DataLayoutType data_layout = param.data_layout;
+
+  bool global_stats = param.is_test || param.use_global_stats;
+  if (global_stats) {
+    int64_t outer_size = 0;
+    int64_t channel_size = 0;
+    int64_t inner_size = 0;
+    switch (data_layout) {
+      case DATALAYOUT(kNCHW):
+        outer_size = x_dims[0];
+        channel_size = x_dims[1];
+        inner_size = x_dims.Slice(2, x_dims.size()).production();
+        break;
+      // case DATALAYOUT(kNHWC):
+      //   outer_size = x_dims.Slice(0, x_dims.size() - 1).production();
+      //   channel_size = x_dims[x_dims.size() - 1];
+      //   inner_size = 1;
+      //   break;
+      default:
+        LOG(FATAL) << "Unknown storage order: " << DataLayoutToStr(data_layout);
+        break;
+    }
+    auto x_ptr = x_data;
+    auto y_ptr = y_data;
+    for (int o = 0; o < outer_size; o++) {
+      for (int c = 0; c < channel_size; c++) {
+        for (int i = 0; i < inner_size; i++) {
+          dtype norm_x =
+              (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon);
+          *y_ptr = norm_x * scale_data[c] + bias_data[c];
+          x_ptr++;
+          y_ptr++;
+        }
+      }
+    }
+  } else {
+    // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and
+    // saved_variance
+  }
+}
+
+TEST(batch_norm_arm, retrive_op) {
+  auto batch_norm =
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
+          "batch_norm");
+  ASSERT_FALSE(batch_norm.empty());
+  ASSERT_TRUE(batch_norm.front());
+}
+
+TEST(batch_norm_arm, init) {
+  BatchNormCompute batch_norm;
+  ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat));
+  ASSERT_EQ(batch_norm.target(), TARGET(kARM));
+}
+
+TEST(batch_norm_arm, compute) {
+  DeviceInfo::Init();
+  for (auto n : {1, 2}) {
+    for (auto c : {6, 32 /*, 128*/}) {
+      for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) {
+        for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
+          for (auto is_test : {/*false, */ true}) {
+            for (auto use_global_stats : {false, true}) {
+              for (auto epsilon : {1e-4f, 1e-5f}) {
+                for (auto momentum : {0.9f, 0.99f}) {
+                  for (auto data_layout :
+                       {DATALAYOUT(kNCHW) /*, DATALAYOUT(kNHWC)*/}) {
+                    Tensor x;
+                    Tensor scale;
+                    Tensor bias;
+                    Tensor mean;
+                    Tensor variance;
+                    Tensor y;
+                    Tensor mean_out;
+                    Tensor variance_out;
+                    Tensor saved_mean;
+                    Tensor saved_variance;
+                    Tensor y_ref;
+                    Tensor mean_out_ref;
+                    Tensor variance_out_ref;
+                    Tensor saved_mean_ref;
+                    Tensor saved_variance_ref;
+                    // set the dims of input, output, ref output tensors
+                    std::vector<int64_t> in_out_shape;
+                    switch (data_layout) {
+                      case DATALAYOUT(kNCHW):
+                        in_out_shape = {n, c, h, w};
+                        break;
+                      // case DATALAYOUT(kNHWC):
+                      //   in_out_shape = {n, h, w, c};
+                      //   break;
+                      default:
+                        LOG(FATAL) << "Unknown storage order: "
+                                   << DataLayoutToStr(data_layout);
+                        break;
+                    }
+                    x.Resize(in_out_shape);
+                    scale.Resize({c});
+                    bias.Resize({c});
+                    mean.Resize({c});
+                    variance.Resize({c});
+                    y.Resize(in_out_shape);
+                    mean_out.Resize({c});
+                    variance_out.Resize({c});
+                    saved_mean.Resize({c});
+                    saved_variance.Resize({c});
+                    y_ref.Resize(in_out_shape);
+                    mean_out_ref.Resize({c});
+                    variance_out_ref.Resize({c});
+                    saved_mean_ref.Resize({c});
+                    saved_variance_ref.Resize({c});
+                    // initialize the data of input tensors
+                    auto* x_data = x.mutable_data<float>();
+                    auto* scale_data = scale.mutable_data<float>();
+                    auto* bias_data = bias.mutable_data<float>();
+                    auto* mean_data = mean.mutable_data<float>();
+                    auto* variance_data = variance.mutable_data<float>();
+                    auto* y_data = y.mutable_data<float>();
+                    for (int i = 0; i < x.dims().production(); i++) {
+                      x_data[i] = static_cast<float>(i % 64);
+                    }
+                    for (int i = 0; i < scale.dims().production(); i++) {
+                      scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
+                    }
+                    for (int i = 0; i < bias.dims().production(); i++) {
+                      bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
+                    }
+                    for (int i = 0; i < mean.dims().production(); i++) {
+                      mean_data[i] = static_cast<float>(i) * 0.0565f;
+                    }
+                    for (int i = 0; i < variance.dims().production(); i++) {
+                      variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
+                    }
+                    // prepare kernel params and run
+                    BatchNormCompute batch_norm;
+                    std::unique_ptr<KernelContext> ctx(new KernelContext);
+                    ctx->As<ARMContext>();
+                    batch_norm.SetContext(std::move(ctx));
+                    operators::BatchNormParam param;
+                    param.x = &x;
+                    param.scale = &scale;
+                    param.bias = &bias;
+                    param.mean = &mean;
+                    param.variance = &variance;
+                    param.is_test = is_test;
+                    param.use_global_stats = use_global_stats;
+                    param.epsilon = epsilon;
+                    param.momentum = momentum;
+                    param.data_layout = data_layout;
+                    param.y = &y;
+                    param.mean_out = &mean_out;
+                    param.variance_out = &variance_out;
+                    param.saved_mean = &saved_mean;
+                    param.saved_variance = &saved_variance;
+                    batch_norm.SetParam(param);
+                    batch_norm.Launch();
+                    // invoking ref implementation and compare results
+                    param.y = &y_ref;
+                    param.mean_out = &mean_out_ref;
+                    param.variance_out = &variance_out_ref;
+                    param.saved_mean = &saved_mean_ref;
+                    param.saved_variance = &saved_variance_ref;
+                    batch_norm_compute_ref<float>(param);
+                    auto* y_ref_data = y_ref.mutable_data<float>();
+                    for (int i = 0; i < y.dims().production(); i++) {
+                      EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
--- a/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
@@ -124,6 +124,20 @@ TEST(conv_arm, init) {

 TEST(conv_arm, compute) {
  DeviceInfo::Init();
+#if 1
+  for (auto n : {2}) {
+    for (auto ic : {6}) {
+      for (auto oc : {6}) {
+        for (auto ih : {9}) {
+          for (auto iw : {9}) {
+            for (auto flag_bias : {false, true}) {
+              for (auto flag_relu : {false, true}) {
+                for (auto depthwise : {false, true}) {
+                  for (auto dilation : {1}) {
+                    for (auto stride : {1, 2}) {
+                      for (auto padding : {0, 1, 2}) {
+                        for (auto ks : {1, 3, 5}) {
+#else
  for (auto n : {1, 2}) {
    for (auto ic : {6, 32 /*, 128*/}) {
      for (auto oc : {6, 32 /*, 128*/}) {
@@ -136,6 +150,7 @@ TEST(conv_arm, compute) {
                    for (auto stride : {1, 2}) {
                      for (auto padding : {0, 1, 2}) {
                        for (auto ks : {1, 3, 5}) {
+#endif
                          int group = 1;
                          if (depthwise) {  // depthwise convolution ?
                            group = oc = ic;

--- a/paddle/fluid/lite/kernels/arm/fc_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc
@@ -22,7 +22,7 @@ namespace lite {
 namespace kernels {
 namespace arm {

-void FcCompute::Run() {
+void FcCompute::PrepareForRun() {
  auto& param = this->Param<operators::FcParam>();
  auto x_dims = param.input->dims();
  auto w_dims = param.w->dims();
@@ -31,39 +31,56 @@ void FcCompute::Run() {
  CHECK_EQ(w_dims.size(), 2UL);
  CHECK_EQ(param.output->dims().size(), 2UL);

+  m_ = x_dims.Slice(0, param.in_num_col_dims).production();
+  k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
+  n_ = w_dims[1];
+  CHECK_EQ(k_, static_cast<int>(w_dims[0]));
+
+  if (m_ == 1) {
+    if (!transed_weight_) {
+      transed_weight_ = new Tensor;
+    }
+    transed_weight_->Resize({n_, k_});
+    const auto* w_data = param.w->data<float>();
+    auto* t_data = transed_weight_->mutable_data<float>();
+    int i = 0;
+
+    for (int nn = 0; nn < n_; ++nn) {
+      for (int kk = 0; kk < k_; ++kk) {
+        t_data[i++] = w_data[kk * n_ + nn];
+      }
+    }
+  }
+}
+
+void FcCompute::Run() {
+  auto& param = this->Param<operators::FcParam>();
+
  const auto* i_data = param.input->data<float>();
  const auto* w_data = param.w->data<float>();
  const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
  auto* o_data = param.output->mutable_data<float>();

-  int x_h = x_dims.Slice(0, param.in_num_col_dims).production();
-  int x_w = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
-  int n = w_dims[1];
-  CHECK_EQ(x_w, static_cast<int>(w_dims[0]));
  auto& ctx = this->ctx_->template As<ARMContext>();
-  if (x_h > 1) {
+  if (m_ > 1) {
    float* packed_in = static_cast<float*>(ctx.workspace_data<float>()) +
                       ctx.l2_cache_size() / sizeof(float);
-    lite::arm::math::prepackA(packed_in, i_data, x_w, 0, x_h, 0, x_w, false,
-                              &ctx);
-    lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n,
-                                   x_w, false, false, false, &ctx);
-
+    lite::arm::math::prepackA(packed_in, i_data, k_, 0, m_, 0, k_, false, &ctx);
+    lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, m_, n_,
+                                   k_, false, false, false, &ctx);
    if (param.bias) {
-      CHECK_EQ(param.bias->numel(), n);
-      lite::arm::math::fill_bias_fc(o_data, b_data, x_h, n);
+      CHECK_EQ(param.bias->numel(), n_);
+      lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_);
    }
  } else {
-    // use sgemmv
-    // sgemv((const float*)weights, (const float*)din, (float*)dout,
-    //       false, n, x_w, _param->_flag_bias, (float*)bias, false);
+    CHECK(transed_weight_);
+    const auto* t_data = transed_weight_->data<float>();
+
+    lite::arm::math::sgemv(t_data, i_data, o_data, false, n_, k_,
+                           b_data != nullptr, b_data, false);
  }
 }

-TargetType FcCompute::target() const { return TARGET(kARM); }
-
-PrecisionType FcCompute::precision() const { return PRECISION(kFloat); }
-
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite

--- a/paddle/fluid/lite/kernels/arm/fc_compute.h
+++ b/paddle/fluid/lite/kernels/arm/fc_compute.h
@@ -25,12 +25,19 @@ class FcCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
 public:
  using param_t = operators::FcParam;

+  void PrepareForRun() override;
+
  void Run() override;

-  TargetType target() const override;
-  PrecisionType precision() const override;
+  ~FcCompute() override {
+    if (transed_weight_) {
+      delete transed_weight_;
+    }
+  };

-  virtual ~FcCompute() = default;
+ private:
+  lite::Tensor* transed_weight_{nullptr};
+  int m_, n_, k_;
 };

 }  // namespace arm

--- a/paddle/fluid/lite/kernels/arm/fc_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/fc_compute_test.cc
@@ -14,6 +14,11 @@

 #include "paddle/fluid/lite/kernels/arm/fc_compute.h"
 #include <gtest/gtest.h>
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/lite/arm/math/funcs.h"
 #include "paddle/fluid/lite/core/op_registry.h"
@@ -23,6 +28,17 @@ namespace lite {
 namespace kernels {
 namespace arm {

+template <typename T>
+void FillData(T* a, const int n, const T lower = static_cast<T>(-2.f),
+              const T upper = static_cast<T>(2.f)) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  for (int i = 0; i < n; ++i) {
+    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+  }
+}
+
 TEST(fc_arm, retrive_op) {
  auto fc =
      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("fc");
@@ -37,108 +53,117 @@ TEST(fc_arm, init) {
 }

 TEST(fc_arm, compare_test) {
-  lite::Tensor x, w, b, out, ref;
-  constexpr int batch_size = 2;
-  x.Resize({batch_size, 3});
-  w.Resize({3, 4});
-  b.Resize({1, 4});
-  out.Resize({batch_size, 4});
-  ref.Resize({batch_size, 4});
-
-  auto x_data = x.mutable_data<float>();
-  auto w_data = w.mutable_data<float>();
-  auto b_data = b.mutable_data<float>();
-  auto out_data = out.mutable_data<float>();
-  auto ref_data = ref.mutable_data<float>();
-
-  for (int64_t i = 0; i < x.dims().product(); i++) {
-    x_data[i] = static_cast<float>(i);
-  }
-  for (int64_t i = 0; i < w.dims().product(); i++) {
-    w_data[i] = static_cast<float>(i);
-  }
-  for (int64_t i = 0; i < b.dims().product(); i++) {
-    b_data[i] = static_cast<float>(i);
-  }
-
-  lite::arm::math::fc_compute_eigen(x_data, batch_size, 3,  //
-                                    w_data, 3, 4,           //
-                                    b_data, ref_data);
-
-  // fc compute kernel
-  FcCompute fc;
-  operators::FcParam param;
-
-  param.in_num_col_dims = 1;
-  param.input = &x;
-  param.w = &w;
-  param.bias = &b;
-  param.output = &out;
-  param.in_mat_dims = x.dims();
-
-  DeviceInfo::Init();
-  std::unique_ptr<KernelContext> ctx(new KernelContext);
-  ctx->As<ARMContext>();
-  fc.SetParam(param);
-  fc.SetContext(std::move(ctx));
-  fc.Run();
-
-  VLOG(3) << "output vs ref";
-  for (int i = 0; i < out.dims().product(); i++) {
-    VLOG(3) << out_data[i] << " vs " << ref_data[i];
-  }
-
-  for (int i = 0; i < out.dims().product(); ++i) {
-    EXPECT_NEAR(out_data[i], ref_data[i], 1e-5);
+  using T = float;
+
+  for (int m : {1, 2, 3, 4}) {
+    for (int n : {1, 2, 3, 4}) {
+      for (int k : {1, 2, 3, 4}) {
+        for (bool with_bias : {true, false}) {
+          VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k
+                  << (with_bias ? ", with bias" : "");
+          lite::Tensor x, w, b, out, ref;
+
+          x.Resize({m, k});
+          w.Resize({k, n});
+          b.Resize({1, n});
+          out.Resize({m, n});
+          ref.Resize({m, n});
+
+          auto* x_data = x.mutable_data<T>();
+          auto* w_data = w.mutable_data<T>();
+          auto* b_data = with_bias ? b.mutable_data<T>() : nullptr;
+
+          auto* out_data = out.mutable_data<T>();
+          auto* ref_data = ref.mutable_data<T>();
+
+          FillData<T>(x_data, x.dims().production());
+          FillData<T>(w_data, w.dims().production());
+          FillData<T>(out_data, out.dims().production(), 0, 0);
+          FillData<T>(ref_data, ref.dims().production(), 0, 0);
+
+          if (with_bias) {
+            FillData<T>(b_data, b.dims().production());
+          }
+
+          FcCompute fc;
+          operators::FcParam param;
+
+          param.input = &x;
+          param.w = &w;
+          param.bias = with_bias ? &b : nullptr;
+          param.output = &out;
+          param.in_num_col_dims = 1;
+          param.in_mat_dims = x.dims();
+
+          DeviceInfo::Init();
+          std::unique_ptr<KernelContext> ctx(new KernelContext);
+          ctx->As<ARMContext>();
+          fc.SetParam(param);
+          fc.SetContext(std::move(ctx));
+          fc.PrepareForRun();
+          fc.Run();
+
+          lite::arm::math::fc_compute_eigen(x_data, m, k, w_data, k, n, b_data,
+                                            ref_data);
+          for (int i = 0; i < out.dims().production(); i++) {
+            EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
+          }
+        }
+      }
+    }
  }
 }

 TEST(fc_arm, num_col_dims) {
-  FcCompute fc;
-  operators::FcParam param;
-
-  lite::Tensor x;
-  lite::Tensor w;
-  lite::Tensor bias;
-  lite::Tensor output;
-
-  x.Resize({1, 2, 3});
-  w.Resize({3, 4});
-  bias.Resize({1, 4});
-  output.Resize({2, 4});
-
-  auto* x_data = x.mutable_data<float>();
-  auto* w_data = w.mutable_data<float>();
-  auto* bias_data = bias.mutable_data<float>();
-  auto* output_data = output.mutable_data<float>();
-
-  for (int64_t i = 0; i < x.dims().product(); i++) {
-    x_data[i] = static_cast<float>(i);
-  }
-  for (int64_t i = 0; i < w.dims().product(); i++) {
-    w_data[i] = static_cast<float>(i);
+  using T = float;
+
+  for (bool with_bias : {true, false}) {
+    lite::Tensor x, w, b, out, ref;
+
+    x.Resize({1, 2, 3});
+    w.Resize({3, 4});
+    b.Resize({1, 4});
+    out.Resize({2, 4});
+    ref.Resize({2, 4});
+
+    auto* x_data = x.mutable_data<float>();
+    auto* w_data = w.mutable_data<float>();
+    auto* b_data = with_bias ? b.mutable_data<T>() : nullptr;
+
+    auto* out_data = out.mutable_data<T>();
+    auto* ref_data = ref.mutable_data<T>();
+
+    FillData<T>(x_data, x.dims().production());
+    FillData<T>(w_data, w.dims().production());
+    FillData<T>(out_data, out.dims().production(), 0, 0);
+    FillData<T>(ref_data, ref.dims().production(), 0, 0);
+    if (with_bias) {
+      FillData<T>(b_data, b.dims().production());
+    }
+    FcCompute fc;
+    operators::FcParam param;
+    param.input = &x;
+    param.w = &w;
+    param.bias = with_bias ? &b : nullptr;
+    param.output = &out;
+    param.in_num_col_dims = 2;
+    param.in_mat_dims = x.dims();
+
+    std::unique_ptr<KernelContext> ctx(new KernelContext);
+    ctx->As<ARMContext>();
+    DeviceInfo::Init();
+
+    fc.SetParam(param);
+    fc.SetContext(std::move(ctx));
+    fc.PrepareForRun();
+    fc.Run();
+
+    lite::arm::math::fc_compute_eigen(x_data, 2, 3, w_data, 3, 4, b_data,
+                                      ref_data);
+    for (int i = 0; i < out.dims().production(); i++) {
+      EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
+    }
  }
-  for (int64_t i = 0; i < bias.dims().product(); i++) {
-    bias_data[i] = static_cast<float>(i);
-  }
-  for (int64_t i = 0; i < output.dims().product(); i++) {
-    output_data[i] = static_cast<float>(i);
-  }
-
-  param.in_num_col_dims = 2;
-  param.input = &x;
-  param.w = &w;
-  param.bias = &bias;
-  param.output = &output;
-  param.in_mat_dims = x.dims();
-
-  std::unique_ptr<KernelContext> ctx(new KernelContext);
-  ctx->As<ARMContext>();
-  DeviceInfo::Init();
-
-  fc.SetParam(param);
-  fc.SetContext(std::move(ctx));
-  fc.Run();
 }

 }  // namespace arm

--- a/paddle/fluid/lite/kernels/arm/mul_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc
@@ -12,57 +12,57 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <Eigen/Core>
-#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/kernels/arm/mul_compute.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"
 #include "paddle/fluid/lite/core/op_registry.h"
-#include "paddle/fluid/lite/core/types.h"
+#include "paddle/fluid/lite/core/type_system.h"

 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace arm {

-template <typename T>
-void mul_compute_eigen(const T* x, int x_h, int x_w, const T* y, int y_h,
-                       int y_w, T* out) {
-  using matrix_t =
-      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+void MulCompute::PrepareForRun() {
+  // TODO(TJ): transpose x or y if necessary
+}

-  Eigen::Map<const matrix_t> X(x, x_h, x_w);
-  Eigen::Map<const matrix_t> Y(y, y_h, y_w);
-  Eigen::Map<matrix_t> Out(out, x_h, y_w);
+void MulCompute::Run() {
+  auto& param = Param<param_t>();

-  Out = X * Y;
-}
+  const auto* x_data = param.x->data<float>();
+  const auto* y_data = param.y->data<float>();
+  auto* o_data = param.output->mutable_data<float>();

-class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  using param_t = operators::MulParam;
+  int m = static_cast<int>(
+      param.x->dims().Slice(0, param.x_num_col_dims).production());
+  int x_w =
+      static_cast<int>(param.x->dims()
+                           .Slice(param.x_num_col_dims, param.x->dims().size())
+                           .production());
+  int y_h = static_cast<int>(
+      param.y->dims().Slice(0, param.y_num_col_dims).production());
+  int n =
+      static_cast<int>(param.y->dims()
+                           .Slice(param.y_num_col_dims, param.y->dims().size())
+                           .production());

-  void Run() override {
-    auto& param = Param<operators::MulParam>();
-    core::dim2 x_shape(
-        {static_cast<int>(
-             param.x->dims().Slice(0, param.x_num_col_dims).production()),
-         static_cast<int>(
-             param.x->dims()
-                 .Slice(param.x_num_col_dims, param.x->dims().size())
-                 .production())});
-    core::dim2 y_shape(
-        {static_cast<int>(
-             param.y->dims().Slice(0, param.y_num_col_dims).production()),
-         static_cast<int>(
-             param.y->dims()
-                 .Slice(param.y_num_col_dims, param.y->dims().size())
-                 .production())});
+  CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h";
+  auto k = x_w;
+  if (n == 1) {
+    lite::arm::math::sgemv(x_data, y_data, o_data, false, m, k, false, nullptr,
+                           false);

-    mul_compute_eigen(param.x->data<float>(), x_shape.x, x_shape.y,  //
-                      param.y->data<float>(), y_shape.x, y_shape.y,  //
-                      param.output->mutable_data<float>());
-  }
+  } else {
+    constexpr bool is_tranposed_y = false;
+    auto& ctx = this->ctx_->template As<ARMContext>();

-  virtual ~MulCompute() = default;
-};
+    float* packed_x = static_cast<float*>(ctx.workspace_data<float>()) +
+                      ctx.l2_cache_size() / sizeof(float);
+    lite::arm::math::prepackA(packed_x, x_data, k, 0, m, 0, k, false, &ctx);
+    lite::arm::math::sgemm_prepack(packed_x, y_data, nullptr, o_data, m, n, k,
+                                   false, false, is_tranposed_y, &ctx);
+  }
+}

 }  // namespace arm
 }  // namespace kernels

--- a/paddle/fluid/lite/kernels/arm/mul_compute.h
+++ b/paddle/fluid/lite/kernels/arm/mul_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/types.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MulParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  virtual ~MulCompute() = default;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/kernels/arm/mul_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/mul_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/arm/mul_compute.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/lite/arm/math/funcs.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+template <typename T>
+void FillData(T* a, const int n, const T lower = static_cast<T>(-2.f),
+              const T upper = static_cast<T>(2.f)) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+  for (int i = 0; i < n; ++i) {
+    a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+  }
+}
+
+TEST(mul_arm, retrive_op) {
+  auto mul =
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("mul");
+  ASSERT_FALSE(mul.empty());
+  ASSERT_TRUE(mul.front());
+}
+
+TEST(mul_arm, init) {
+  MulCompute mul;
+  ASSERT_EQ(mul.precision(), PRECISION(kFloat));
+  ASSERT_EQ(mul.target(), TARGET(kARM));
+}
+
+TEST(mul_arm, compare_test) {
+  using T = float;
+
+  for (int m : {1, 2, 3, 4}) {
+    for (int n : {1, 2, 3, 4}) {
+      for (int k : {1, 2, 3, 4}) {
+        VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k;
+        lite::Tensor x, y, out, ref;
+        x.Resize({m, k});
+        y.Resize({k, n});
+        out.Resize({m, n});
+        ref.Resize({m, n});
+
+        auto* x_data = x.mutable_data<T>();
+        auto* y_data = y.mutable_data<T>();
+        auto* out_data = out.mutable_data<T>();
+        auto* ref_data = ref.mutable_data<T>();
+
+        FillData<T>(x_data, x.dims().production());
+        FillData<T>(y_data, y.dims().production());
+        FillData<T>(out_data, out.dims().production(), 0, 0);
+        FillData<T>(ref_data, ref.dims().production(), 0, 0);
+
+        MulCompute mul;
+        operators::MulParam param;
+
+        param.x = &x;
+        param.y = &y;
+        param.output = &out;
+
+        DeviceInfo::Init();
+        std::unique_ptr<KernelContext> ctx(new KernelContext);
+        ctx->As<ARMContext>();
+        mul.SetParam(param);
+        mul.SetContext(std::move(ctx));
+        mul.PrepareForRun();
+
+        mul.Run();
+
+        lite::arm::math::mul_compute_eigen(x_data, m, k, y_data, k, n,
+                                           ref_data);
+        for (int i = 0; i < out.dims().production(); i++) {
+          EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
+        }
+      }
+    }
+  }
+}
+
+TEST(mul_arm, num_col_dims) {
+  using T = float;
+
+  lite::Tensor x, y, out, ref;
+  x.Resize({2, 3, 4});
+  y.Resize({3, 4, 5});
+  out.Resize({2, 5});
+  ref.Resize({2, 5});
+
+  auto* x_data = x.mutable_data<T>();
+  auto* y_data = y.mutable_data<T>();
+  auto* out_data = out.mutable_data<T>();
+  auto* ref_data = ref.mutable_data<T>();
+
+  FillData<T>(x_data, x.dims().production());
+  FillData<T>(y_data, y.dims().production());
+  FillData<T>(out_data, out.dims().production());
+  FillData<T>(ref_data, out.dims().production());
+
+  MulCompute mul;
+  operators::MulParam param;
+
+  param.x = &x;
+  param.y = &y;
+  param.output = &out;
+  param.x_num_col_dims = 1;
+  param.y_num_col_dims = 2;
+
+  DeviceInfo::Init();
+  std::unique_ptr<KernelContext> ctx(new KernelContext);
+  ctx->As<ARMContext>();
+  mul.SetParam(param);
+  mul.SetContext(std::move(ctx));
+  mul.PrepareForRun();
+
+  mul.Run();
+
+  lite::arm::math::mul_compute_eigen(x_data, 2, 12, y_data, 12, 5, ref_data);
+  for (int i = 0; i < out.dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
+  }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
--- a/paddle/fluid/lite/kernels/arm/pool_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc
@@ -182,7 +182,7 @@ TEST(pool_arm, compute) {
      for (auto stride : {2}) {
        for (auto pad : {0}) {
          for (auto n : {1, 3, 4, 11}) {
-            for (auto c : {1, 3, 11, 4, 1024}) {
+            for (auto c : {1, 3, 11 /* ,1024 */}) {  // speedup for ci
              for (auto h : {3, 1, 11, 4, 1}) {
                for (auto w : {1, 3, 4, 12, 1}) {
                  VLOG(3) << "n:" << n << " c:" << c << " h:" << h << " w:" << w

--- a/paddle/fluid/lite/kernels/arm/scale_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/scale_compute_test.cc
@@ -54,6 +54,15 @@ TEST(scale_arm, compute) {
  lite::Tensor output;
  lite::Tensor output_ref;

+#if 1  // for ci speedup
+  for (auto n : {1, 3}) {
+    for (auto c : {1, 3}) {
+      for (auto h : {3, 4}) {
+        for (auto w : {4, 3}) {
+          for (auto bias_after_scale : {true, false}) {
+            for (auto s : {-1.0f, 0.13f}) {
+              for (auto b : {-15.f, 0.11234f}) {
+#else
  for (auto n : {1, 3, 4, 11}) {
    for (auto c : {1, 3, 11, 4}) {
      for (auto h : {3, 1, 11, 4}) {
@@ -61,6 +70,8 @@ TEST(scale_arm, compute) {
          for (auto bias_after_scale : {true, false}) {
            for (auto s : {-100.25f, -1.0f, 0.13f, 3840.975f}) {
              for (auto b : {-3075.495f, -15.f, 0.11234f, 128.15f}) {
+#endif
+
                x.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
                output.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
                output_ref.Resize(DDim(std::vector<int64_t>({n, c, h, w})));

--- a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt
@@ -9,3 +9,4 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite})

 nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite)
 
+ 
--- a/paddle/fluid/lite/kernels/host/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/host/CMakeLists.txt
@@ -13,3 +13,4 @@ set(host_kernels
    )

 set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels")
+ 
--- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt
@@ -35,3 +35,4 @@ set(x86_kernels
    )

 set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
+ 
--- a/paddle/fluid/lite/kernels/x86/sgd_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/sgd_compute.cc
@@ -29,9 +29,9 @@ class SGDCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
  using param_t = operators::ActivationParam;

  void Run() override {
-    auto &context = context_->As<X86Context>();
+    auto &context = ctx_->As<X86Context>();
    auto &sgd_param = *param_.get_mutable<operators::SGDParam>();
-    CHECK(context.x86_device_context);
+    CHECK(context.x86_device_context());

    // param.Out->template mutable_data<T>();

@@ -45,12 +45,12 @@ class SGDCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
    PADDLE_ENFORCE_EQ(grad->numel(), sz);

    paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1);
-    const T *lr = learning_rate->data<T>();
-    const T *param_data = param->data<T>();
-    const T *grad_data = grad->data<T>();
+    const T *lr = learning_rate->template data<T>();
+    const T *param_data = param->template data<T>();
+    const T *grad_data = grad->template data<T>();
    int64_t rows_idx = 0;
-    T *out_data =
-        param_out->mutable_data<T>(context.x86_device_context->GetPlace());
+    T *out_data = param_out->template mutable_data<T>(
+        context.x86_device_context()->GetPlace());

    auto sgd =
        paddle::operators::jit::KernelFuncs<paddle::operators::jit::SgdTuple<T>,

--- a/paddle/fluid/lite/model_parser/CMakeLists.txt
+++ b/paddle/fluid/lite/model_parser/CMakeLists.txt
@@ -27,3 +27,4 @@ lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_des

 add_subdirectory(pb)
 add_subdirectory(cpp)
+ 
--- a/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt
+++ b/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt
 cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite)
+ 
--- a/paddle/fluid/lite/model_parser/pb/CMakeLists.txt
+++ b/paddle/fluid/lite/model_parser/pb/CMakeLists.txt
 cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite)
 cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite)
+ 
--- a/paddle/fluid/lite/operators/CMakeLists.txt
+++ b/paddle/fluid/lite/operators/CMakeLists.txt
@@ -8,6 +8,7 @@ cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS})
 cc_library(scale_op_lite SRCS scale_op.cc DEPS ${op_DEPS})
 cc_library(softmax_op_lite SRCS softmax_op.cc DEPS ${op_DEPS})
 cc_library(reshape_op_lite SRCS reshape_op.cc DEPS ${op_DEPS} )
+cc_library(batch_norm_op_lite SRCS batch_norm_op.cc DEPS ${op_DEPS})
 cc_library(feed_op_lite SRCS feed_op.cc DEPS ${op_DEPS})
 cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS})
 cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS})
@@ -30,6 +31,7 @@ set(ops_lite
        scale_op_lite
        softmax_op_lite
        reshape_op_lite
+        batch_norm_op_lite
        feed_op_lite
        fetch_op_lite
        io_copy_op_lite
@@ -52,4 +54,6 @@ lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc
 lite_cc_test(test_scale_op_lite SRCS scale_op_test.cc DEPS scale_op_lite memory_lite)
 lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite)
 lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite)
+lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite)
 lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite)
+ 
--- a/paddle/fluid/lite/operators/batch_norm_op.cc
+++ b/paddle/fluid/lite/operators/batch_norm_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/operators/batch_norm_op.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool BatchNormOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.x);
+  CHECK_OR_FALSE(param_.bias);
+  CHECK_OR_FALSE(param_.scale);
+  CHECK_OR_FALSE(param_.mean);
+  CHECK_OR_FALSE(param_.variance);
+  CHECK_OR_FALSE(param_.y);
+  if (!param_.is_test) {
+    CHECK_OR_FALSE(param_.mean_out);
+    CHECK_OR_FALSE(param_.variance_out);
+    CHECK_OR_FALSE(param_.saved_mean);
+    CHECK_OR_FALSE(param_.saved_variance);
+  }
+  auto x_dims = param_.x->dims();
+  auto scale_dims = param_.scale->dims();
+  auto bias_dims = param_.bias->dims();
+  auto mean_dims = param_.mean->dims();
+  auto variance_dims = param_.variance->dims();
+  CHECK(x_dims.size() >= 2 && x_dims.size() <= 5)
+      << "Input X must have 2 to 5 dimensions.";
+  CHECK_EQ(scale_dims.size(), 1UL) << "Input Scale must have 1 dimensions.";
+  CHECK_EQ(bias_dims.size(), 1UL) << "Input Bias must have 1 dimensions.";
+  CHECK_EQ(mean_dims.size(), 1UL) << "Input Mean must have 1 dimensions.";
+  CHECK_EQ(variance_dims.size(), 1UL)
+      << "Input Variance must have 1 dimensions.";
+  return true;
+}
+
+bool BatchNormOp::InferShape() const {
+  auto x_dims = param_.x->dims();
+  int64_t channel_size = 0;
+  switch (param_.data_layout) {
+    case DATALAYOUT(kNCHW):
+      channel_size = x_dims[1];
+      break;
+    // case DATALAYOUT(kNHWC):
+    //   channel_size = x_dims[x_dims.size() - 1];
+    //   break;
+    default:
+      LOG(FATAL) << "Unknown storage order: "
+                 << DataLayoutToStr(param_.data_layout);
+      break;
+  }
+  if (!param_.is_test) {
+    param_.mean_out->Resize({channel_size});
+    param_.variance_out->Resize({channel_size});
+    param_.saved_mean->Resize({channel_size});
+    param_.saved_variance->Resize({channel_size});
+  }
+  param_.y->Resize(x_dims);
+  return true;
+}
+
+bool BatchNormOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+  param_.x = scope->FindVar(op_desc.Input("X").front())->GetMutable<Tensor>();
+  param_.bias =
+      scope->FindVar(op_desc.Input("Bias").front())->GetMutable<Tensor>();
+  param_.scale =
+      scope->FindVar(op_desc.Input("Scale").front())->GetMutable<Tensor>();
+  param_.mean =
+      scope->FindVar(op_desc.Input("Mean").front())->GetMutable<Tensor>();
+  param_.variance =
+      scope->FindVar(op_desc.Input("Variance").front())->GetMutable<Tensor>();
+  param_.y = scope->FindVar(op_desc.Output("Y").front())->GetMutable<Tensor>();
+  param_.is_test = op_desc.GetAttr<bool>("is_test");
+  param_.use_global_stats = op_desc.GetAttr<bool>("use_global_stats");
+  if (!param_.is_test) {
+    param_.mean_out =
+        scope->FindVar(op_desc.Output("MeanOut").front())->GetMutable<Tensor>();
+    param_.variance_out = scope->FindVar(op_desc.Output("VarianceOut").front())
+                              ->GetMutable<Tensor>();
+    param_.saved_mean = scope->FindVar(op_desc.Output("SavedMean").front())
+                            ->GetMutable<Tensor>();
+    param_.saved_variance =
+        scope->FindVar(op_desc.Output("SavedVariance").front())
+            ->GetMutable<Tensor>();
+  }
+  param_.epsilon = op_desc.GetAttr<float>("epsilon");
+  param_.momentum = op_desc.GetAttr<float>("momentum");
+  std::string data_layout = op_desc.GetAttr<std::string>("data_layout");
+  CHECK_EQ(data_layout, "NCHW") << "TODO(hong19860320): Only support NCHW.";
+  // param_.data_layout = StringToDataLayout(data_layout);
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(batch_norm, paddle::lite::operators::BatchNormOp);
--- a/paddle/fluid/lite/operators/batch_norm_op.h
+++ b/paddle/fluid/lite/operators/batch_norm_op.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/scope.h"
+#include "paddle/fluid/lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class BatchNormOp : public OpLite {
+ public:
+  BatchNormOp() {}
+  explicit BatchNormOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShape() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "batch_norm"; }
+
+ private:
+  mutable BatchNormParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/operators/batch_norm_op_test.cc
+++ b/paddle/fluid/lite/operators/batch_norm_op_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/operators/batch_norm_op.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+TEST(batch_norm_op_lite, test) {
+  // prepare variables
+  Scope scope;
+  auto* x = scope.Var("x")->GetMutable<Tensor>();
+  auto* scale = scope.Var("scale")->GetMutable<Tensor>();
+  auto* bias = scope.Var("bias")->GetMutable<Tensor>();
+  auto* mean = scope.Var("mean")->GetMutable<Tensor>();
+  auto* variance = scope.Var("variance")->GetMutable<Tensor>();
+  auto* y = scope.Var("y")->GetMutable<Tensor>();
+  x->Resize({2, 32, 10, 20});
+  auto x_dims = x->dims();
+  const int64_t channel_size = x_dims[1];  // NCHW
+  scale->Resize({channel_size});
+  bias->Resize({channel_size});
+  mean->Resize({channel_size});
+  variance->Resize({channel_size});
+
+  // prepare op desc
+  cpp::OpDesc desc;
+  desc.SetType("batch_norm");
+  desc.SetInput("X", {"x"});
+  desc.SetInput("Scale", {"scale"});
+  desc.SetInput("Bias", {"bias"});
+  desc.SetInput("Mean", {"mean"});
+  desc.SetInput("Variance", {"variance"});
+  desc.SetOutput("Y", {"y"});
+  desc.SetAttr("is_test", true);
+  desc.SetAttr("use_global_stats", false);
+  desc.SetAttr("epsilon", 1e-5f);
+  desc.SetAttr("momentum", 0.9f);
+  desc.SetAttr("data_layout", std::string("NCHW"));
+
+  BatchNormOp batch_norm("batch_norm");
+
+  batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}});
+  batch_norm.Attach(desc, &scope);
+  batch_norm.CheckShape();
+  batch_norm.InferShape();
+
+  // check output dims
+  auto y_dims = y->dims();
+  CHECK_EQ(y_dims.size(), x_dims.size());
+  for (size_t i = 0; i < y_dims.size(); i++) {
+    CHECK_EQ(y_dims[i], x_dims[i]);
+  }
+}
+
+TEST(batch_norm_op_lite, test_enable_is_test) {
+  // prepare variables
+  Scope scope;
+  auto* x = scope.Var("x")->GetMutable<Tensor>();
+  auto* scale = scope.Var("scale")->GetMutable<Tensor>();
+  auto* bias = scope.Var("bias")->GetMutable<Tensor>();
+  auto* mean = scope.Var("mean")->GetMutable<Tensor>();
+  auto* variance = scope.Var("variance")->GetMutable<Tensor>();
+  auto* y = scope.Var("y")->GetMutable<Tensor>();
+  auto* mean_out = scope.Var("mean_out")->GetMutable<Tensor>();
+  auto* variance_out = scope.Var("variance_out")->GetMutable<Tensor>();
+  auto* saved_mean = scope.Var("saved_mean")->GetMutable<Tensor>();
+  auto* saved_variance = scope.Var("saved_variance")->GetMutable<Tensor>();
+  x->Resize({2, 32, 10, 20});
+  auto x_dims = x->dims();
+  const int64_t channel_size = x_dims[1];  // NCHW
+  scale->Resize({channel_size});
+  bias->Resize({channel_size});
+  mean->Resize({channel_size});
+  variance->Resize({channel_size});
+
+  // prepare op desc
+  cpp::OpDesc desc;
+  desc.SetType("batch_norm");
+  desc.SetInput("X", {"x"});
+  desc.SetInput("Scale", {"scale"});
+  desc.SetInput("Bias", {"bias"});
+  desc.SetInput("Mean", {"mean"});
+  desc.SetInput("Variance", {"variance"});
+  desc.SetOutput("Y", {"y"});
+  desc.SetOutput("MeanOut", {"mean_out"});
+  desc.SetOutput("VarianceOut", {"variance_out"});
+  desc.SetOutput("SavedMean", {"saved_mean"});
+  desc.SetOutput("SavedVariance", {"saved_variance"});
+  desc.SetAttr("is_test", false);
+  desc.SetAttr("use_global_stats", false);
+  desc.SetAttr("epsilon", 1e-5f);
+  desc.SetAttr("momentum", 0.9f);
+  desc.SetAttr("data_layout", std::string("NCHW"));
+
+  BatchNormOp batch_norm("batch_norm");
+
+  batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}});
+  batch_norm.Attach(desc, &scope);
+  batch_norm.CheckShape();
+  batch_norm.InferShape();
+
+  // check output dims
+  auto y_dims = y->dims();
+  CHECK_EQ(y_dims.size(), x_dims.size());
+  for (size_t i = 0; i < y_dims.size(); i++) {
+    CHECK_EQ(y_dims[i], x_dims[i]);
+  }
+  auto mean_out_dims = mean_out->dims();
+  auto variance_out_dims = variance_out->dims();
+  auto saved_mean_dims = saved_mean->dims();
+  auto saved_variance_dims = saved_variance->dims();
+  CHECK_EQ(mean_out_dims.size(), 1UL);
+  CHECK_EQ(variance_out_dims.size(), 1UL);
+  CHECK_EQ(saved_mean_dims.size(), 1UL);
+  CHECK_EQ(saved_variance_dims.size(), 1UL);
+  CHECK_EQ(mean_out_dims[0], channel_size);
+  CHECK_EQ(variance_out_dims[0], channel_size);
+  CHECK_EQ(saved_mean_dims[0], channel_size);
+  CHECK_EQ(saved_variance_dims[0], channel_size);
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/operators/op_params.h
+++ b/paddle/fluid/lite/operators/op_params.h
@@ -57,6 +57,7 @@ struct FcParam {
  lite::Tensor* output{};
  lite::DDim in_mat_dims;
  int in_num_col_dims{1};
+  bool weight_transposed{false};
 };

 struct ReluParam {
@@ -145,6 +146,25 @@ struct ConvParam {
  std::string data_format{"Anylayout"};
 };

+// For BatchNorm op
+struct BatchNormParam {
+  lite::Tensor* x{};
+  lite::Tensor* bias{};
+  lite::Tensor* scale{};
+  lite::Tensor* mean{};
+  lite::Tensor* variance{};
+  lite::Tensor* y{};
+  lite::Tensor* mean_out{};
+  lite::Tensor* variance_out{};
+  lite::Tensor* saved_mean{};
+  lite::Tensor* saved_variance{};
+  bool is_test{true};
+  bool use_global_stats{false};
+  float epsilon;
+  float momentum;
+  DataLayoutType data_layout{DATALAYOUT(kNCHW)};
+};
+
 // For Pooling op
 struct PoolParam {
  lite::Tensor* x{};

--- a/paddle/fluid/lite/operators/pool_op_test.cc
+++ b/paddle/fluid/lite/operators/pool_op_test.cc
@@ -74,7 +74,11 @@ TEST(pool_op_lite, test) {
  pool.Attach(desc, &scope);
  auto kernels = pool.CreateKernels({Place{TARGET(kARM), PRECISION(kFloat)}});
  LOG(INFO) << "kernels.size(): " << kernels.size();
+#ifdef LITE_WITH_ARM
  ASSERT_FALSE(kernels.empty());
+#else
+  ASSERT_TRUE(kernels.empty());
+#endif
 }

 }  // namespace operators

--- a/paddle/fluid/lite/operators/split_op.cc
+++ b/paddle/fluid/lite/operators/split_op.cc
@@ -37,7 +37,7 @@ bool SplitOp::InferShape() const {
  const auto &sections = param_.sections;

  const int outs_number = outs.size();
-  std::vector<lite::DDimLite> outs_dims;
+  std::vector<lite::DDimHvy> outs_dims;
  outs_dims.reserve(outs_number);

  if (num > 0) {

--- a/paddle/fluid/lite/tools/Dockerfile.mobile
+++ b/paddle/fluid/lite/tools/Dockerfile.mobile
@@ -88,4 +88,4 @@ RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel
 RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit
 RUN apt-get autoremove -y && apt-get clean
 RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz
- 
+ 
\ No newline at end of file
--- a/paddle/fluid/lite/tools/build.sh
+++ b/paddle/fluid/lite/tools/build.sh
@@ -13,6 +13,11 @@ function prepare_for_codegen {
    mkdir -p ./paddle/fluid/lite/gen_code
    touch ./paddle/fluid/lite/gen_code/__generated_code__.cc
 }
+
+function check_need_ci {
+    git log -1 --oneline | grep "test=develop" || exit -1
+}
+
 function cmake_x86 {
    prepare_for_codegen
    cmake ..  -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags}
@@ -28,6 +33,17 @@ function cmake_gpu {
    cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON"
 }

+function check_style {
+    export PATH=/usr/bin:$PATH
+    #pre-commit install
+    clang-format --version
+
+    if ! pre-commit run -a ; then
+        git diff
+        exit 1
+    fi
+}
+
 function cmake_arm {
    # $1: ARM_TARGET_OS in "android" , "armlinux"
    # $2: ARM_TARGET_ARCH_ABI in "arm64-v8a", "armeabi-v7a" ,"armeabi-v7a-hf"
@@ -43,10 +59,15 @@ function cmake_arm {
        -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2
 }

+function build_single {
+    #make $1 -j$(expr $(nproc) - 2)
+    make $1 -j8
+}
+
 function build {
    file=$1
    for _test in $(cat $file); do
-        make $_test -j$(expr $(nproc) - 2)
+        build_single $_test
    done
 }

@@ -58,44 +79,12 @@ function test_lite {
    for _test in $(cat $file); do
        # We move the build phase here to make the 'gen_code' test compiles after the
        # corresponding test is executed and the C++ code generates.
-        make $_test -j$(expr $(nproc) - 2)
+        #make $_test -j$(expr $(nproc) - 2)
+        make $_test -j8
        ctest -R $_test -V
    done
 }

-port_armv8=5554
-port_armv7=5556
-
-# Run test on android
-function test_lite_android {
-    local file=$1
-    local adb_abi=$2
-    local port=
-    if [[ ${adb_abi} == "armeabi-v7a" ]]; then
-        port=${port_armv7}
-    fi
-
-    if [[ ${adb_abi} == "arm64-v8a" ]]; then
-        port=${port_armv8}
-    fi
-    if [[ "${port}x" == "x" ]]; then
-        echo "Port can not be empty"
-        exit 1
-    fi
-
-    echo "file: ${file}"
-    # push all to adb and test
-    adb_work_dir="/data/local/tmp"
-    skip_list="test_model_parser_lite"
-    for _test in $(cat $file); do
-        [[ $skip_list =~ (^|[[:space:]])$_test($|[[:space:]]) ]] && continue || echo 'skip $_test'
-        testpath=$(find ./paddle/fluid -name ${_test})
-        adb -s emulator-${port} push ${testpath} ${adb_work_dir}
-        adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${_test}"
-        adb -s emulator-${port} shell "./${adb_work_dir}/${_test}"
-    done
-}
-
 # Build the code and run lite server tests. This is executed in the CI system.
 function build_test_server {
    mkdir -p ./build
@@ -108,8 +97,34 @@ function build_test_server {
    build $LIBS_FILE
 }

-# Build the code and run lite server tests. This is executed in the CI system.
+# test_arm_android <some_test_name> <adb_port_number>
+function test_arm_android {
+    test_name=$1
+    port=$2
+    if [[ "${test_name}x" == "x" ]]; then
+        echo "test_name can not be empty"
+        exit 1
+    fi
+    if [[ "${port}x" == "x" ]]; then
+        echo "Port can not be empty"
+        exit 1
+    fi
+
+    echo "test name: ${test_name}"
+    adb_work_dir="/data/local/tmp"
+    skip_list="test_model_parser_lite" # add more with space
+    [[ $skip_list =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && continue || echo 'skip $test_name'
+    testpath=$(find ./paddle/fluid -name ${test_name})
+    adb -s emulator-${port} push ${testpath} ${adb_work_dir}
+    adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
+    adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}"
+}
+
+# Build the code and run lite arm tests. This is executed in the CI system.
 function build_test_arm {
+    port_armv8=5554
+    port_armv7=5556
+
    adb kill-server
    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
    # start android arm64-v8a armeabi-v7a emulators first
@@ -122,6 +137,7 @@ function build_test_arm {

    for os in "android" "armlinux" ; do
        for abi in "arm64-v8a" "armeabi-v7a" "armeabi-v7a-hf" ; do
+            # TODO(TJ): enable compile on v7-hf on andorid and all v7 on armlinux
            if [[ ${abi} == "armeabi-v7a-hf" ]]; then
                echo "armeabi-v7a-hf is not supported on both android and armlinux"
                continue
@@ -138,17 +154,30 @@ function build_test_arm {
            cmake_arm ${os} ${abi}
            build $TESTS_FILE

+            # armlinux need in another docker
+            # TODO(TJ): enable test with armlinux
            if [[ ${os} == "android" ]]; then
                adb_abi=${abi}
                if [[ ${adb_abi} == "armeabi-v7a-hf" ]]; then
                    adb_abi="armeabi-v7a"
                fi
                if [[ ${adb_abi} == "armeabi-v7a" ]]; then
-                    # skip v7 tests
+                    # skip all armv7 tests
+                    # TODO(TJ): enable test with armv7
                    continue
                fi
-                test_lite_android $TESTS_FILE ${adb_abi}
-                # armlinux need in another docker
+                local port=
+                if [[ ${adb_abi} == "armeabi-v7a" ]]; then
+                    port=${port_armv7}
+                fi
+
+                if [[ ${adb_abi} == "arm64-v8a" ]]; then
+                    port=${port_armv8}
+                fi
+                echo "test file: ${TESTS_FILE}"
+                for _test in $(cat $TESTS_FILE); do
+                    test_arm_android $_test $port
+                done
            fi
            cd -
        done
@@ -164,12 +193,13 @@ function print_usage {
    echo "----------------------------------------"
    echo -e "cmake_x86: run cmake with X86 mode"
    echo -e "cmake_cuda: run cmake with CUDA mode"
-    echo -e "cmake_arm: run cmake with ARM mode"
+    echo -e "--arm_os=<os> --arm_abi=<abi> cmake_arm: run cmake with ARM mode"
    echo
    echo -e "build: compile the tests"
+    echo -e "--test_name=<test_name> build_single: compile single test"
    echo
    echo -e "test_server: run server tests"
-    echo -e "test_mobile: run mobile tests"
+    echo -e "--test_name=<test_name> --adb_port_number=<adb_port_number> test_arm_android: run arm test"
    echo "----------------------------------------"
    echo
 }
@@ -182,11 +212,31 @@ function main {
                TESTS_FILE="${i#*=}"
                shift
                ;;
+            --test_name=*)
+                TEST_NAME="${i#*=}"
+                shift
+                ;;
+            --arm_os=*)
+                ARM_OS="${i#*=}"
+                shift
+                ;;
+            --arm_abi=*)
+                ARM_ABI="${i#*=}"
+                shift
+                ;;
+            --arm_port=*)
+                ARM_PORT="${i#*=}"
+                shift
+                ;;
            build)
                build $TESTS_FILE
                build $LIBS_FILE
                shift
                ;;
+            build_single)
+                build_single $TEST_NAME
+                shift
+                ;;
            cmake_x86)
                cmake_x86
                shift
@@ -196,15 +246,15 @@ function main {
                shift
                ;;
            cmake_arm)
-                cmake_arm $2 $3
+                cmake_arm $ARM_OS $ARM_ABI
                shift
                ;;
            test_server)
                test_lite $TESTS_FILE
                shift
                ;;
-            test_mobile)
-                test_lite $TESTS_FILE
+            test_arm_android)
+                test_arm_android $TEST_NAME $ARM_PORT
                shift
                ;;
            build_test_server)
@@ -215,6 +265,14 @@ function main {
                build_test_arm
                shift
                ;;
+            check_style)
+                check_style
+                shift
+                ;;
+            check_need_ci)
+                check_need_ci
+                shift
+                ;;
            *)
                # unknown option
                print_usage
@@ -224,7 +282,5 @@ function main {
    done
 }

-print_usage
-
 main $@
 
--- a/paddle/fluid/lite/utils/CMakeLists.txt
+++ b/paddle/fluid/lite/utils/CMakeLists.txt
@@ -9,3 +9,4 @@ set(utils_DEPS glog)
 lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite)
 cc_library(any_lite SRCS any.cc)
 cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite)
+ 
--- a/paddle/fluid/lite/x86/CMakeLists.txt
+++ b/paddle/fluid/lite/x86/CMakeLists.txt
@@ -4,3 +4,4 @@ endif()

 cc_library(target_wrapper_x86 SRCS target_wrapper.cc)
 
+ 
--- a/python/paddle/proto/__init__.py
+++ b/python/paddle/proto/__init__.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.proto.TrainerConfig_pb2 import OptimizationConfig, TrainerConfig
+from paddle.proto.ModelConfig_pb2 import ModelConfig