Commit 5f833603 authored by: tensor-tang

Merge branch 'tangjian/incubate/lite' into 'incubate/lite'

Add ARM backends

See merge request inference/paddlelite!4
......@@ -10,7 +10,10 @@ paddle/fluid/operators/distributed/send_recv.proto
*.vs
build/
build_doc/
build.*
*.user
*.sh
*.bkp
.vscode
.idea
......
......@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_desc.h"
#include <glog/logging.h>
#include <algorithm>
#include <functional>
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include <utility>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
......
......@@ -172,3 +172,4 @@ add_subdirectory(model_parser)
add_subdirectory(utils)
add_subdirectory(api)
add_subdirectory(gen_code)
......@@ -54,3 +54,4 @@ lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
mir_passes
${ops_lite} ${host_kernels}
ARM_DEPS ${arm_kernels})
......@@ -32,9 +32,9 @@ void Run(const char* model_dir) {
valid_places);
auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<DDim::value_type>({100, 100})));
+  input_tensor->Resize(DDim(std::vector<DDim::value_type>({3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < 100 * 100; i++) {
+  for (int i = 0; i < 3 * 224 * 224; i++) {
data[i] = i;
}
......@@ -65,6 +65,14 @@ USE_LITE_OP(feed);
USE_LITE_OP(fetch);
USE_LITE_OP(io_copy);
USE_LITE_OP(conv2d);
// USE_LITE_OP(batch_norm);
USE_LITE_OP(relu);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(pool2d);
USE_LITE_OP(elementwise_add);
USE_LITE_OP(softmax);
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
......@@ -72,7 +80,15 @@ USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
// USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
// USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
#endif // LITE_WITH_ARM
......
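For context: the USE_LITE_OP / USE_LITE_KERNEL macros above do no work at run time; they reference a symbol emitted next to each op's static registrar so the linker cannot discard the object file that performs the registration. A minimal sketch of that link-time pattern, with illustrative names (not the actual Lite macros):

    #include <iostream>

    // Defined in the op's translation unit, next to its static registrar.
    #define DEFINE_TOUCH(name) int touch_##name() { return 0; }

    // Placed in the binary that needs the op: the extern reference forces
    // the linker to keep the defining object file, so its static
    // registrar runs at startup.
    #define USE_TOUCH(name)            \
      extern int touch_##name();       \
      static int use_##name = touch_##name();

    DEFINE_TOUCH(conv2d)  // would normally live in conv_op.cc
    USE_TOUCH(conv2d)     // would normally live in the application binary

    int main() { std::cout << use_conv2d << "\n"; }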
......@@ -72,8 +72,9 @@ class LightPredictor {
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
-    for (auto& op : program.ops()) {
-      auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
+    for (auto& op : program.ops_) {
+      lite::pb::OpDesc desc(op->op_info()->desc());
+      auto kernel_type = desc.GetAttr(kKernelTypeAttr).get<std::string>();
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
......@@ -88,8 +89,8 @@ class LightPredictor {
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
-    CHECK(program.exec_scope());
-    program_->set_exec_scope(program.exec_scope());
+    CHECK(program.exec_scope_);
+    program_->set_exec_scope(program.exec_scope_);
}
private:
......
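For context, kKernelTypeAttr stores a serialized kernel identity that ParseKernelType splits back into op type, alias, and place. A hedged sketch of such a parser, assuming a '/'-separated "op_type/alias/place" encoding (the real Lite serialization may differ):

    #include <sstream>
    #include <string>

    // Assumed encoding: "op_type/alias/place". Field order and separator
    // are illustrative, not Lite's actual format.
    inline void ParseKernelTypeSketch(const std::string& kernel_type,
                                      std::string* op_type,
                                      std::string* alias,
                                      std::string* place) {
      std::stringstream ss(kernel_type);
      std::getline(ss, *op_type, '/');
      std::getline(ss, *alias, '/');
      std::getline(ss, *place, '/');
    }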
......@@ -6,4 +6,33 @@ if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
return()
endif()
-cc_library(math_arm SRCS funcs.cc packed_sgemm.cc softmax.cc scale.cc elementwise.cc DEPS ${lite_kernel_deps} eigen3)
+# TODO(xxx): separate them
+cc_library(math_arm SRCS
+    funcs.cc
+    packed_sgemm.cc
+    softmax.cc
+    scale.cc
+    pooling.cc
+    elementwise.cc
+    sgemv.cc
+    type_trans.cpp
+    conv_impl.cc
+    conv_direct_3x3s1.cc
+    conv_direct_3x3s2.cc
+    conv_direct.cc
+    conv_depthwise_3x3_int7.cc
+    conv_depthwise_3x3_int8.cc
+    conv_depthwise_5x5s1_int8.cc
+    conv_depthwise_3x3p0.cc
+    conv_depthwise_3x3p1.cc
+    conv_depthwise_5x5s1.cc
+    conv_depthwise_5x5s2.cc
+    conv_depthwise.cc
+    conv_gemmlike.cc
+    conv_winograd_3x3.cc
+    conv_winograd.cc
+    split.cc
+    DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
+# TODO(TJ): fix me, do not depend on proto
This diff is collapsed.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
//! pooling fp32 ops
void pooling_basic(const void* din, void* dout, int num, int chout, int hout,
int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling_global(const void* din, void* dout, int num, int chout, int hout,
int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling2x2s2_max(const void* din, void* dout, int num, int chout, int hout,
int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling2x2s2_ave(const void* din, void* dout, int num, int chout, int hout,
int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s1p1_max(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s1p1_ave(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s2p1_max(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s2p0_max(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s2p1_ave(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
void pooling3x3s2p0_ave(const void* din, void* dout, int num, int chout,
int hout, int wout, int chin, int hin, int win,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, bool global_pooling,
bool exclusive, bool adaptive, bool ceil_mode,
bool use_quantizer, const std::string& pooling_type);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
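All of the variants above share one signature; the suffix encodes the specialization (kernel size, stride s, padding p, max vs. average, e.g. pooling3x3s2p1_max). A hedged usage sketch of the generic entry point, assuming the shapes below, the shown header path, and a link against math_arm:

    #include <string>
    #include <vector>
    #include "paddle/fluid/lite/arm/math/pooling.h"  // assumed header path

    // 2x2 max pooling, stride 2, no padding, over a 1x8x32x32 NCHW input;
    // the output is 1x8x16x16. Buffer sizes follow num * ch * h * w.
    void pooling_example() {
      std::vector<float> in(1 * 8 * 32 * 32, 1.f);
      std::vector<float> out(1 * 8 * 16 * 16, 0.f);
      paddle::lite::arm::math::pooling_basic(
          in.data(), out.data(), /*num=*/1, /*chout=*/8, /*hout=*/16,
          /*wout=*/16, /*chin=*/8, /*hin=*/32, /*win=*/32,
          /*ksize=*/{2, 2}, /*strides=*/{2, 2}, /*paddings=*/{0, 0},
          /*global_pooling=*/false, /*exclusive=*/true, /*adaptive=*/false,
          /*ceil_mode=*/false, /*use_quantizer=*/false,
          /*pooling_type=*/"max");
    }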
......@@ -58,6 +58,111 @@ void scale<float>(const float* din, float* dout, int num, float scale,
}
}
template <>
void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
int inner_dim, const float* scale_data,
const float* bias_data) {
int cnt = inner_dim >> 4;
int remain = inner_dim % 16;
int size = inner_dim * scale_dim;
for (int n = 0; n < outer_dim; n++) {
const float* din_ptr_n = din + n * size;
float* dout_ptr_n = dout + n * size;
#pragma omp parallel for
for (int i = 0; i < scale_dim; i++) {
const float* din_ptr = din_ptr_n + i * inner_dim;
float* dout_ptr = dout_ptr_n + i * inner_dim;
float scale = scale_data[i];
float32x4_t vscale = vdupq_n_f32(scale);
float bias = bias_data[i];
float32x4_t vbias = vdupq_n_f32(bias);
for (int j = 0; j < cnt; j++) {
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
float32x4_t din2 = vld1q_f32(din_ptr + 8);
float32x4_t din3 = vld1q_f32(din_ptr + 12);
float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
din_ptr += 16;
vst1q_f32(dout_ptr, vsum1);
vst1q_f32(dout_ptr + 4, vsum2);
vst1q_f32(dout_ptr + 8, vsum3);
vst1q_f32(dout_ptr + 12, vsum4);
dout_ptr += 16;
}
for (int j = 0; j < remain; j++) {
*dout_ptr = *din_ptr * scale + bias;
dout_ptr++;
din_ptr++;
}
}
}
}
template <>
void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
const float* scale_data, const float* bias_data) {
int cnt = scale_dim >> 4;
int remain = scale_dim % 16;
for (int n = 0; n < outer_dim; n++) {
const float* din_ptr_n = din + n * scale_dim;
float* dout_ptr_n = dout + n * scale_dim;
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
int idx = i << 4;
const float* din_ptr = din_ptr_n + idx;
const float* scale_ptr = scale_data + idx;
const float* bias_ptr = bias_data + idx;
float* dout_ptr = dout_ptr_n + idx;
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t vscale0 = vld1q_f32(scale_ptr);
float32x4_t vbias0 = vld1q_f32(bias_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
float32x4_t vscale1 = vld1q_f32(scale_ptr + 4);
float32x4_t vbias1 = vld1q_f32(bias_ptr + 4);
float32x4_t din2 = vld1q_f32(din_ptr + 8);
float32x4_t vscale2 = vld1q_f32(scale_ptr + 8);
float32x4_t vbias2 = vld1q_f32(bias_ptr + 8);
float32x4_t vsum1 = vmlaq_f32(vbias0, din0, vscale0);
float32x4_t vsum2 = vmlaq_f32(vbias1, din1, vscale1);
float32x4_t din3 = vld1q_f32(din_ptr + 12);
float32x4_t vscale3 = vld1q_f32(scale_ptr + 12);
float32x4_t vbias3 = vld1q_f32(bias_ptr + 12);
vst1q_f32(dout_ptr, vsum1);
vst1q_f32(dout_ptr + 4, vsum2);
float32x4_t vsum3 = vmlaq_f32(vbias2, din2, vscale2);
float32x4_t vsum4 = vmlaq_f32(vbias3, din3, vscale3);
vst1q_f32(dout_ptr + 8, vsum3);
vst1q_f32(dout_ptr + 12, vsum4);
}
int idx = cnt << 4;
const float* din_ptr = din_ptr_n + idx;
float* dout_ptr = dout_ptr_n + idx;
const float* scale_ptr = scale_data + idx;
const float* bias_ptr = bias_data + idx;
for (int j = 0; j < remain; j++) {
*dout_ptr = *din_ptr * (*scale_ptr) + (*bias_ptr);
dout_ptr++;
din_ptr++;
scale_ptr++;
bias_ptr++;
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
......
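Both specializations above compute the same affine map, dout = din * scale + bias, differing only in how scale and bias are broadcast; a plain scalar reference for the first (per-channel) form, useful as a sketch for checking the NEON path:

    // Scalar reference: for each (outer, channel) slice of inner_dim
    // elements, dout = din * scale_data[channel] + bias_data[channel].
    void scale_ref(const float* din, float* dout, int outer_dim,
                   int scale_dim, int inner_dim, const float* scale_data,
                   const float* bias_data) {
      for (int n = 0; n < outer_dim; ++n) {
        for (int c = 0; c < scale_dim; ++c) {
          const float s = scale_data[c];
          const float b = bias_data[c];
          const float* in = din + (n * scale_dim + c) * inner_dim;
          float* out = dout + (n * scale_dim + c) * inner_dim;
          for (int j = 0; j < inner_dim; ++j) out[j] = in[j] * s + b;
        }
      }
    }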
......@@ -22,6 +22,14 @@ namespace math {
template <typename T>
void scale(const T* din, T* dout, int num, float scale, float bias);
template <typename T>
void scale(const T* din, T* dout, int outer_dim, int scale_dim, int inner_dim,
const float* scale_data, const float* bias_data);
template <typename T>
void scale(const T* din, T* dout, int outer_dim, int scale_dim,
const float* scale_data, const float* bias_data);
} // namespace math
} // namespace arm
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/arm/math/split.h"
#include <algorithm>
#include "paddle/fluid/lite/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <>
void split_cpy<float>(const float* din, float* dout, int num) {
int cnt = num >> 4;
int remain = num % 16;
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
const float* din_ptr = din + (i << 4);
float* dout_ptr = dout + (i << 4);
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
float32x4_t din2 = vld1q_f32(din_ptr + 8);
float32x4_t din3 = vld1q_f32(din_ptr + 12);
vst1q_f32(dout_ptr, din0);
vst1q_f32(dout_ptr + 4, din1);
vst1q_f32(dout_ptr + 8, din2);
vst1q_f32(dout_ptr + 12, din3);
}
if (remain > 0) {
const float* din_ptr = din + (cnt << 4);
float* dout_ptr = dout + (cnt << 4);
for (int i = 0; i < remain; i++) {
*dout_ptr = *din_ptr;
dout_ptr++;
din_ptr++;
}
}
}
template <>
void split<float>(const float* din, const std::vector<lite::Tensor*>& dout,
const int axis, const std::vector<int>& in_strides) {
int input_offset = 0;
for (auto out : dout) {
auto out_dim = out->dims();
std::vector<int> out_strides(out_dim.size());
out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1];
for (int i = out_dim.size() - 2; i >= 0; --i) {
out_strides[i] = out_strides[i + 1] * out_dim[i];
}
float* out_data = out->mutable_data<float>();
int before = out_strides[0] / out_strides[axis];
int in_after = in_strides[axis];
int out_after = out_strides[axis];
for (int i = 0; i < before; ++i) {
split_cpy(din + input_offset + i * in_after, out_data + i * out_after,
out_after);
}
input_offset += out_strides[axis];
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
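The stride arithmetic in split<float> reduces to copying `before` fixed-length runs per output tensor; a hedged scalar sketch of that inner copy (an illustrative helper, not part of the Lite API):

    #include <cstring>

    // For one output slice: `before` outer rows, each taking `out_after`
    // floats from an input row of `in_after` floats, starting at
    // `input_offset` in the flattened input.
    void copy_split_slice(const float* din, float* dout, int before,
                          int in_after, int out_after, int input_offset) {
      for (int i = 0; i < before; ++i) {
        std::memcpy(dout + i * out_after,
                    din + input_offset + i * in_after,
                    sizeof(float) * out_after);
      }
    }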
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <typename T>
void split_cpy(const T* din, T* dout, int num);
template <typename T>
void split(const T* din, const std::vector<lite::Tensor*>& dout, const int axis,
const std::vector<int>& in_strides);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
This diff is collapsed.
......@@ -57,3 +57,4 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li
lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite)
lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite)
lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator)
......@@ -54,15 +54,15 @@ void DeviceInfo::InitInternal(DeviceInfo* dev) {
<< ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
<< ", CPU ARCH: A" << dev->archs_[i];
}
LOG(INFO) << "L1 DataCache size is: ";
VLOG(1) << "L1 DataCache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
LOG(INFO) << dev->L1_cache_[i] / 1024 << " KB";
VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "L2 Cache size is: ";
VLOG(1) << "L2 Cache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
LOG(INFO) << dev->L2_cache_[i] / 1024 << " KB";
VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "Total memory: " << dev->max_memory_ << "KB";
VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";
dev->max_freq_ = max_freq[0];
for (int j = 1; j < dev->compute_core_num_; ++j) {
......
......@@ -107,6 +107,8 @@ class TensorHvy : public TensorBase<TensorHvy> {
data_.Resize(framework::make_ddim(dims.Vectorize()));
}
void Resize(const std::vector<int64_t>& x) { Resize(DDimHvy(x)); }
void ShareDataWith(const TensorHvy& other) {
data_.ShareDataWith(other.data_);
}
......
......@@ -65,6 +65,8 @@ class Buffer {
TargetCopy(target_, data_, other.data_, nbytes);
}
~Buffer() { Free(); }
private:
// memory it actually allocated.
size_t space_{0};
......
......@@ -59,3 +59,4 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite
mir_passes compatible_pb_lite program_lite ${ops_lite})
endif()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/pattern_matcher.h"
#include <gtest/gtest.h>
namespace paddle {
namespace lite {
namespace mir {
void BuildGraph(SSAGraph* g) {
g->mutable_nodes().emplace_back();
Node& o1 = g->mutable_nodes().back();
o1.AsStmt().op_type = "op1";
g->mutable_nodes().emplace_back();
Node& o2 = g->mutable_nodes().back();
o2.AsStmt().op_type = "op2";
g->mutable_nodes().emplace_back();
Node& o3 = g->mutable_nodes().back();
o3.AsStmt().op_type = "op3";
g->mutable_nodes().emplace_back();
Node& o4 = g->mutable_nodes().back();
o4.AsStmt().op_type = "op4";
g->mutable_nodes().emplace_back();
Node& o5 = g->mutable_nodes().back();
o5.AsStmt().op_type = "op5";
g->mutable_nodes().emplace_back();
Node& v1 = g->mutable_nodes().back();
v1.AsArg("var1");
g->mutable_nodes().emplace_back();
Node& v2 = g->mutable_nodes().back();
v2.AsArg("var2");
g->mutable_nodes().emplace_back();
Node& v3 = g->mutable_nodes().back();
v3.AsArg("var3");
g->mutable_nodes().emplace_back();
Node& v4 = g->mutable_nodes().back();
v4.AsArg("var4");
// o1->v1->o2
o1.outlinks.push_back(&v1);
o2.inlinks.push_back(&v1);
v1.inlinks.push_back(&o1);
v1.outlinks.push_back(&o2);
// o2->v2->o3
// o2->v2->o4
o2.outlinks.push_back(&v2);
o3.inlinks.push_back(&v2);
o4.inlinks.push_back(&v2);
v2.inlinks.push_back(&o2);
v2.outlinks.push_back(&o3);
v2.outlinks.push_back(&o4);
// o2->v3->o5
o2.outlinks.push_back(&v3);
o5.inlinks.push_back(&v3);
v3.inlinks.push_back(&o2);
v3.outlinks.push_back(&o5);
// o3-v4->o5
o3.outlinks.push_back(&v4);
o5.inlinks.push_back(&v4);
v4.inlinks.push_back(&o3);
v4.outlinks.push_back(&o5);
}
TEST(PMPattern, NewNode) {
PMPattern x;
auto* n = x.NewNode([](const Node* x) { return true; });
ASSERT_TRUE(n);
ASSERT_EQ(x.nodes_.size(), 1UL);
}
TEST(PMPattern, AddEdge) {
PMPattern x;
auto* a = x.NewNode([](const Node* x) { return true; });
auto* b = x.NewNode([](const Node* x) { return true; });
ASSERT_TRUE(a);
ASSERT_TRUE(b);
x.AddEdge(a, b);
ASSERT_EQ(x.nodes_.size(), 2UL);
ASSERT_EQ(x.edges_.size(), 1UL);
ASSERT_EQ(x.edges_.front().first, a);
ASSERT_EQ(x.edges_.front().second, b);
ASSERT_EQ(x.nodes().size(), 2UL);
ASSERT_EQ(x.edges().size(), 1UL);
ASSERT_EQ(x.edges().front().first, a);
ASSERT_EQ(x.edges().front().second, b);
}
TEST(PatternMatcher, MarkPMNodesInGraph) {
PatternMatcher x;
// mark o2, o3, v2
// The pattern is a graph:
// o2(a node named o2) -> v2(a node named v2)
// v2 -> o3(a node named o3)
auto* o2 = x.pattern_.NewNode([](const Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->IsStmt() && node->stmt()->op_type == "op2";
});
auto* o3 = x.pattern_.NewNode([](const Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->IsStmt() && node->stmt()->op_type == "op3";
});
auto* v2 = x.pattern_.NewNode([](const Node* node) {
// The teller can be any condition, such as op type, or variable's shape.
return node && node->IsArg() && node->arg()->name == "var2";
});
ASSERT_FALSE(o2->Tell(nullptr));
ASSERT_FALSE(o3->Tell(nullptr));
ASSERT_FALSE(v2->Tell(nullptr));
x.pattern_.AddEdge(o2, v2);
x.pattern_.AddEdge(v2, o3);
ASSERT_EQ(x.pattern_.edges().size(), 2UL);
ASSERT_EQ(x.pattern_.edges()[0].first, o2);
ASSERT_EQ(x.pattern_.edges()[0].second, v2);
ASSERT_EQ(x.pattern_.edges()[1].first, v2);
ASSERT_EQ(x.pattern_.edges()[1].second, o3);
SSAGraph graph;
BuildGraph(&graph);
x.MarkPMNodesInGraph(&graph);
ASSERT_EQ(x.pmnodes2nodes_.size(), 3UL);
auto subgraphs = x.DetectPatterns();
ASSERT_EQ(subgraphs.size(), 1UL);
}
TEST(PatternMatcher, MultiSubgraph) {
SSAGraph graph;
BuildGraph(&graph);
PatternMatcher x;
// The pattern is a graph:
// op -> var
auto* any_op = x.mutable_pattern()->NewNode(
[](const Node* node) {
return node->IsStmt() && (node->stmt()->op_type == "op2" ||
node->stmt()->op_type == "op3");
},
"OP0");
auto* any_var =
x.mutable_pattern()
->NewNode([](const Node* node) { return node->IsArg(); }, "VAR")
->AsIntermediate();
auto* any_op1 = x.mutable_pattern()->NewNode(
[](const Node* node) { return node->IsStmt(); }, "OP1");
x.mutable_pattern()->AddEdge(any_op, any_var);
x.mutable_pattern()->AddEdge(any_var, any_op1);
int count = 0;
PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s,
SSAGraph* g) {
LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> "
<< s.at(any_var)->arg()->name << " -> "
<< s.at(any_op1)->stmt()->op_type;
count++;
};
x(&graph, handle);
// 1. Detect op3 -> var4 -> op5
// 2. Detect op2 -> var2 -> op3
// 3. Detect op2 -> var2 -> op4
// 4. Detect op2 -> var3 -> op5
// But 2, 3, and 4 overlap, so only 2 is kept; the final choices are 1 and 2.
ASSERT_GE(count, 1);
ASSERT_LE(count, 2);
}
TEST(PatternMatcher, IntermediateCheck) {
SSAGraph graph;
BuildGraph(&graph);
// o2->v2->o3
// o2->v2->o4
// check o2+o3 fuse, should fail because v2 also link to o4.
PatternMatcher matcher;
auto* op2 = matcher.mutable_pattern()->NewNode(
[](const Node* x) {
return x && x->IsStmt() && x->stmt()->op_type == "op2";
},
"op2");
auto* op3 = matcher.mutable_pattern()->NewNode(
[](const Node* x) {
return x && x->IsStmt() && x->stmt()->op_type == "op3";
},
"op3");
auto* v2 = matcher.mutable_pattern()
->NewNode(
[](const Node* x) {
return x && x->IsArg() && x->arg()->name == "var2";
},
"var2")
->AsIntermediate();
v2->LinksFrom({op2}).LinksTo({op3});
int count = 0;
matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) {
++count;
});
EXPECT_EQ(count, 0);
count = 0;
v2->AsInput();
matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) {
++count;
});
ASSERT_EQ(count, 1);
}
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -91,9 +91,9 @@ class KernelRegistry final {
void Register(const std::string &name,
typename KernelRegistryForTarget<Target, Precision,
Layout>::creator_t &&creator) {
// VLOG(3) << "register for " << TargetToStr(Target) << ":"
//<< PrecisionToStr(Precision) << "//"
//<< GetKernelOffset<Target, Precision, Layout>();
VLOG(3) << "register for " << TargetToStr(Target) << ":"
<< PrecisionToStr(Precision) << "//"
<< GetKernelOffset<Target, Precision, Layout>();
using kernel_registor_t =
KernelRegistryForTarget<Target, Precision, Layout>;
auto &varient = registries_[GetKernelOffset<Target, Precision, Layout>()];
......@@ -153,6 +153,9 @@ class KernelRegistor : public lite::Registor<KernelType> {
public:
KernelRegistor(const std::string &op_type, const std::string &alias)
: Registor<KernelType>([=] {
VLOG(3) << "Register kernel " << op_type << " for "
<< TargetToStr(target) << " " << PrecisionToStr(precision)
<< " " << DataLayoutToStr(layout) << " alias " << alias;
KernelRegistry::Global().Register<target, precision, layout>(
op_type, [=]() -> std::unique_ptr<KernelType> {
std::unique_ptr<KernelType> x(new KernelType);
......
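GetKernelOffset<Target, Precision, Layout>() maps the type triple to a slot in registries_; a hedged sketch of such a mixed-radix flattening (the enum sizes here are illustrative, not Lite's actual counts):

    // Each (target, precision, layout) triple gets a unique flat index.
    constexpr int kNumPrecisions = 4;  // assumed size of the precision enum
    constexpr int kNumLayouts = 2;     // assumed size of the layout enum

    constexpr int KernelOffsetSketch(int target, int precision, int layout) {
      return target * kNumPrecisions * kNumLayouts +
             precision * kNumLayouts + layout;
    }

    static_assert(KernelOffsetSketch(0, 0, 0) == 0, "first slot");
    static_assert(KernelOffsetSketch(1, 0, 0) == 8, "one full target block");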
......@@ -4,3 +4,4 @@ endif()
lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc)
lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite)
......@@ -21,6 +21,7 @@
* looks the same.
*/
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/target_wrapper.h"
......
......@@ -4,3 +4,4 @@ endif()
nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
nv_library(cuda_blas_lite SRCS blas.cc)
......@@ -18,10 +18,11 @@ if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
DEPS scope_lite op_lite kernel_lite paddle_infer_gencode
)
-    lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__
-        ${ops_lite} ${host_kernels}
-        X86_DEPS ${x86_kernels}
-        )
+    # lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__
+    #     ${ops_lite} ${host_kernels}
+    #     X86_DEPS ${x86_kernels}
+    #     )
-    add_dependencies(__generated_code__ test_gen_code_lite)
+    # add_dependencies(__generated_code__ test_gen_code_lite)
endif()
cc_library(target_wrapper_host SRCS target_wrapper.cc)
......@@ -5,3 +5,4 @@ add_subdirectory(arm)
add_subdirectory(cuda)
add_subdirectory(x86)
......@@ -6,15 +6,24 @@ message(STATUS "compile with lite ARM kernels")
cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
-cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
+cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm)
lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
set(arm_kernels
fc_compute_arm
......@@ -22,6 +31,13 @@ set(arm_kernels
mul_compute_arm
scale_compute_arm
softmax_compute_arm
-    elementwise_add_compute_arm)
+    conv_compute_arm
+    batch_norm_compute_arm
+    elementwise_add_compute_arm
+    pool_compute_arm
+    split_compute_arm
+    )
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h"
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void BatchNormCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
bool global_stats = param.is_test || param.use_global_stats;
if (global_stats) {
int64_t channel_size = 0;
switch (param.data_layout) {
case DATALAYOUT(kNCHW):
channel_size = x_dims[1];
break;
// case DATALAYOUT(kNHWC):
// channel_size = x_dims[x_dims.size() - 1];
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param.data_layout);
break;
}
new_scale.Resize({channel_size});
new_bias.Resize({channel_size});
auto* scale_data = param.scale->mutable_data<float>();
auto* bias_data = param.bias->mutable_data<float>();
auto* mean_data = param.mean->mutable_data<float>();
auto* variance_data = param.variance->mutable_data<float>();
auto* new_scale_data = new_scale.mutable_data<float>();
auto* new_bias_data = new_bias.mutable_data<float>();
for (int c = 0; c < channel_size; c++) {
float inv_scale = 1.f / (std::sqrt(variance_data[c] + param.epsilon));
new_bias_data[c] =
bias_data[c] - inv_scale * scale_data[c] * mean_data[c];
new_scale_data[c] = inv_scale * scale_data[c];
}
}
}
void BatchNormCompute::Run() {
auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
auto x_data = param.x->mutable_data<float>();
auto y_data = param.y->mutable_data<float>();
bool global_stats = param.is_test || param.use_global_stats;
if (global_stats) {
auto* new_scale_data = new_scale.mutable_data<float>();
auto* new_bias_data = new_bias.mutable_data<float>();
int64_t outer_size = 0;
int64_t channel_size = 0;
int64_t inner_size = 0;
switch (param.data_layout) {
case DATALAYOUT(kNCHW):
outer_size = x_dims[0];
channel_size = x_dims[1];
inner_size = x_dims.Slice(2, x_dims.size()).production();
lite::arm::math::scale(x_data, y_data, outer_size, channel_size,
inner_size, new_scale_data, new_bias_data);
break;
// case DATALAYOUT(kNHWC):
// outer_size = x_dims.Slice(0, x_dims.size() - 1).production();
// channel_size = x_dims[x_dims.size() - 1];
// lite::arm::math::scale(x_data, y_data, outer_size, channel_size,
// new_scale_data, new_bias_data);
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param.data_layout);
break;
}
} else {
// TODO(hong19860320) calculate mean_out, variance_out, saved_mean and
// saved_variance
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW,
paddle::lite::kernels::arm::BatchNormCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Mean", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Variance", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
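The folding done in PrepareForRun above is the standard inference-time rewrite of batch norm into a per-channel affine map; in the notation of the code (scale = gamma, bias = beta, per channel c):

    \text{new\_scale}_c = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \epsilon}}, \qquad
    \text{new\_bias}_c = \beta_c - \text{new\_scale}_c \,\mu_c, \qquad
    y = \text{new\_scale}_c \cdot x + \text{new\_bias}_c

This is exactly the per-channel transform that lite::arm::math::scale then applies in Run().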
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class BatchNormCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::BatchNormParam;
void PrepareForRun() override;
void Run() override;
virtual ~BatchNormCompute() = default;
private:
Tensor new_scale;
Tensor new_bias;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
This diff is collapsed.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/operators/conv_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class ConvCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ConvParam;
void PrepareForRun() override;
void Run() override;
~ConvCompute() {
if (impl_ != nullptr) {
delete impl_;
}
}
private:
lite::arm::math::ImplBase<TARGET(kARM), PRECISION(kFloat), param_t>* impl_{
nullptr};
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
This diff is collapsed.
......@@ -25,10 +25,9 @@ class FcCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::FcParam;
void Run() override;
void PrepareForRun() override;
TargetType target() const override;
PrecisionType precision() const override;
void Run() override;
virtual ~FcCompute() = default;
};
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.