move arm op kernels to central_arm_func

ddfa00ac · wangliu · 6d5281de · ddfa00ac · ddfa00ac · ddfa00ac
21 changed file
--- a/doc/design_doc.md
+++ b/doc/design_doc.md
@@ -3,7 +3,6 @@
 #### 以下是 paddle-mobile 代码的执行流程图:
 ![执行流程图](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
@@ -15,7 +14,6 @@
 先来看一下模型, 模型分为两种结构:
 一种为参数文件是散开的, 如下图, 红框为模型结构的 protobuf 文件, 其余为参数文件
 ![模型描述](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
@@ -23,6 +21,7 @@
 ![模型描述combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
 loader 模块的作用是将模型结构信息 load 进内存, 将红框内的 protobuf 文件 load 进内存, 并对模型结构进行优化(如将几个细粒度的 op 融合成 粗粒度的 op, 如将 conv、 add、 batchnorm、 relu 融合为 conv\_add\_batchnorm\_relu).
 方便进行算法优化.

--- a/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
+++ b/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
--- a/src/framework/scope.h
+++ b/src/framework/scope.h
@@ -23,7 +23,17 @@ namespace framework {
 class Scope {
 public:
  Scope() = default;
-  ~Scope() = default;
+  // Tears the scope down: frees every variable held in vars_, then every
+  // child scope held in kids_, leaving both containers empty.
+  ~Scope() {
+    for (auto var_it = vars_.begin(); var_it != vars_.end(); ++var_it) {
+      delete var_it->second;
+    }
+    vars_.clear();
+    for (auto kid_it = kids_.begin(); kid_it != kids_.end(); ++kid_it) {
+      delete *kid_it;
+    }
+    kids_.clear();
+  }
  Scope &NewScope() const;

--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
@@ -54,13 +54,14 @@ string jstring2cppstring(JNIEnv *env, jstring jstr) {
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                                          jclass thiz,
                                                          jstring modelPath) {
+  ANDROIDLOGI("load invoked");
  bool optimize = true;
  return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
                                         optimize);
 }
-JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
+JNIEXPORT jfloatArray JNICALL
-    JNIEnv *env, jclass thiz, jfloatArray buf) {
+Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf) {
  jfloatArray result = NULL;
  int count = 0;
  float *dataPointer = nullptr;
@@ -78,6 +79,7 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
  count = output->numel();
  result = env->NewFloatArray(count);
  env->SetFloatArrayRegion(result, 0, count, output->data<float>());
+  ANDROIDLOGI("predict finished");
  return result;
 }

--- a/src/jni/paddle_mobile_jni.h
+++ b/src/jni/paddle_mobile_jni.h
@@ -31,8 +31,8 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
 /**
 * object detection for android
 */
-JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
+JNIEXPORT jfloatArray JNICALL
-    JNIEnv *env, jclass thiz, jfloatArray buf);
+Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf);
 /**
 * clear data of the net when destroy for android

--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP
 #include "operators/kernel/conv_add_kernel.h"
+#include "../central-arm-func/conv_add_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -23,111 +24,9 @@ bool ConvAddKernel<CPU, float>::Init(FusionConvAddParam *param) {
  return true;
 }
-void ConvAddBasic(const FusionConvAddParam &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  int axis = param.Axis();
-  Tensor *output = param.Output();
-  math::expand_bias(bias, axis, output->dims());
-  output->ShareDataWith(bias);
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  const int batch_size = static_cast<int>(input->dims()[0]);
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(1));
-    }
-  }
-}
 template <>
 void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
-  if (param.Groups() == param.Input()->dims()[1] &&
+  ConvAddCompute<float>(param);
-      param.Input()->dims()[1] == param.Output()->dims()[1] &&
-      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
-    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
-                               param.Bias(), true);
-  } else if (param.Groups() == param.Input()->dims()[1] &&
-             param.Input()->dims()[1] == param.Output()->dims()[1] &&
-             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3) {
-    math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
-                           param.Filter(), param.Bias(), param.Output(), true);
-  } else {
-    ConvAddBasic(param);
-  }
 }
 template class ConvAddKernel<CPU, float>;

--- a/src/operators/kernel/arm/pool_kernel.cpp
+++ b/src/operators/kernel/arm/pool_kernel.cpp
@@ -14,27 +14,11 @@ limitations under the License. */
 #ifdef POOL_OP
-#include <operators/kernel/pool_kernel.h>
+#include "operators/kernel/pool_kernel.h"
-#include "common/log.h"
+#include "../central-arm-func/pool_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
-inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
-                      std::vector<int> strides, std::vector<int> paddings,
-                      const Tensor *in_x, Tensor *out) {
-  if (pooling_type == "max") {
-    math::PoolFunctor<CPU, math::MaxPool<float>, float> pool2d_forward;
-    math::MaxPool<float> pool_process;
-    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
-  } else if (pooling_type == "avg") {
-    math::PoolFunctor<CPU, math::AvgPool<float>, float> pool2d_forward;
-    math::AvgPool<float> pool_process;
-    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
-  }
-}
 template <>
 bool PoolKernel<CPU, float>::Init(PoolParam *param) {
  return true;
@@ -42,54 +26,7 @@ bool PoolKernel<CPU, float>::Init(PoolParam *param) {
 template <>
 void PoolKernel<CPU, float>::Compute(const PoolParam &param) const {
-  const Tensor *in_x = param.Input();
+  PoolCompute<float>(param);
-  Tensor *out = param.Output();
-  std::string pooling_type = param.PoolingType();
-  std::vector<int> ksize = param.Ksize();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  if (ksize.size() != 2) {
-    LOG(paddle_mobile::LogLevel::kLOG_ERROR)
-        << "Pool op only supports 2D and 3D input.";
-  }
-  if (param.isGlobalPooling()) {
-    for (size_t i = 0; i < ksize.size(); ++i) {
-      paddings[i] = 0;
-      ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
-    }
-  } else if (ksize[0] == 3 && ksize[0] == ksize[1]) {
-    if (pooling_type == "max") {
-      if (strides[0] == strides[1] && strides[0] == 1 &&
-          paddings[0] == paddings[1] && paddings[1] == 1) {
-        math::Pool3x3Maxs1p1(in_x, out);
-      } else {
-        math::Pool3x3Max(strides, paddings, in_x, out);
-      }
-      math::Pool3x3Max(strides, paddings, in_x, out);
-    } else if (pooling_type == "avg") {
-      if (strides[0] == strides[1] && strides[0] == 1 &&
-          paddings[0] == paddings[1] && paddings[1] == 1) {
-        math::Pool3x3Avgs1p1(in_x, out);
-      } else {
-        math::Pool3x3Avg(strides, paddings, in_x, out);
-      }
-      math::Pool3x3Avg(strides, paddings, in_x, out);
-    }
-  } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
-    if (pooling_type == "max") {
-      math::Pool2x2Max(strides, paddings, in_x, out);
-    } else if (pooling_type == "avg") {
-      math::Pool2x2Avg(strides, paddings, in_x, out);
-    }
-  } else {
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-  }
 }
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/arm/sigmoid_kernel.cpp
+++ b/src/operators/kernel/arm/sigmoid_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef SIGMOID_OP
 #include "../sigmoid_kernel.h"
+#include "../central-arm-func/sigmoid_arm_func.h"
 #if __ARM_NEON
 #include "../../math/math_func_neon.h"
 #endif
@@ -25,52 +26,6 @@ namespace operators {
 using framework::DDim;
 using framework::Tensor;
-void sigmoid(const Tensor *X, Tensor *Y) {
-#if __ARM_NEON
-  const float *input = X->data<float>();
-  float *output = Y->mutable_data<float>();
-  const DDim &dDim = X->dims();
-  int axis_index = 1;
-  if (dDim.size() < 4) {
-    axis_index = 0;
-  }
-  DDim outer_ddim =
-      paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
-  DDim inner_ddim =
-      paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
-  int out_size = paddle_mobile::framework::product(outer_ddim);
-  int inner_size = paddle_mobile::framework::product(inner_ddim);
-  DLOG << "outsize=" << out_size;
-  DLOG << "innersize=" << inner_size;
-  #pragma omp parallel for
-  for (int i = 0; i < out_size; ++i) {
-    const float *input_outer_ptr = input + i * inner_size;
-    float *output_outer_ptr = output + i * inner_size;
-    int nn = inner_size >> 2;
-    int remain = inner_size - (nn << 2);
-    float32x4_t _one = vdupq_n_f32(1.f);
-    for (; nn > 0; nn--) {
-      float32x4_t data = vld1q_f32(input_outer_ptr);
-      data = vnegq_f32(data);
-      data = exp_ps(data);
-      data = vaddq_f32(data, _one);
-      float32x4_t out_data = vrecpeq_f32(data);
-      out_data = vmulq_f32(vrecpsq_f32(data, out_data), out_data);
-      vst1q_f32(output_outer_ptr, out_data);
-      input_outer_ptr += 4;
-      output_outer_ptr += 4;
-    }
-    for (; remain > 0; remain--) {
-      *output_outer_ptr = 1.f / (1.f + exp(-*input_outer_ptr));
-      output_outer_ptr++;
-      input_outer_ptr++;
-    }
-  }
-#endif
-}
 template <>
 bool SigmoidKernel<CPU, float>::Init(SigmoidParam *param) {
  return true;
@@ -78,11 +33,7 @@ bool SigmoidKernel<CPU, float>::Init(SigmoidParam *param) {
 template <>
 void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const {
-  const Tensor *in_x = param.InputX();
+  SigmoidCompute<float>(param);
-  Tensor *out = param.Out();
-  auto x_dims = in_x->dims();
-  out->Resize(x_dims);
-  sigmoid(in_x, out);
 }
 template class SigmoidKernel<CPU, float>;

--- a/src/operators/kernel/arm/softmax_kernel.cpp
+++ b/src/operators/kernel/arm/softmax_kernel.cpp
@@ -15,7 +15,8 @@ limitations under the License. */
 #ifdef SOFTMAX_OP
 #include "../softmax_kernel.h"
-#include "../../math/softmax.h"
+#include "../central-arm-func/softmax_arm_func.h"
+#include "operators/math/softmax.h"
 namespace paddle_mobile {
 namespace operators {
@@ -26,11 +27,7 @@ bool SoftmaxKernel<CPU, float>::Init(SoftmaxParam *param) {
 template <>
 void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam &param) const {
-  const Tensor *in_x = param.InputX();
+  SoftmaxCompute<float>(param);
-  Tensor *out = param.Out();
-  auto x_dims = in_x->dims();
-  out->Resize(x_dims);
-  math::SoftmaxFuntor<CPU, float>()(in_x, out);
 }
 template class SoftmaxKernel<CPU, float>;

--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADD_OP
+#pragma once
+#include <vector>
+#include "operators/math/conv_func.h"
+#include "operators/math/depthwise_conv_3x3.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// Generic fused convolution + bias-add, implemented as
+// im2col (2-D) / vol2col (3-D) followed by one GEMM per batch item and group.
+//
+// Declared `inline` because this definition lives in a header: without it,
+// including the header from more than one translation unit produces
+// multiple-definition link errors (ConvBasic in conv_arm_func.h is inline for
+// the same reason).
+//
+// param: supplies Input, Filter, Bias, Axis, Output, Groups, Strides,
+//        Paddings and Dilations. Filter and Bias are copied into local
+//        Tensor objects before being reshaped/expanded.
+inline void ConvAddBasic(const FusionConvAddParam &param) {
+  const Tensor *input = param.Input();
+  Tensor filter = *param.Filter();
+  Tensor bias = *param.Bias();
+  int axis = param.Axis();
+  Tensor *output = param.Output();
+  // NOTE(review): presumably broadcasts the bias to the output shape so the
+  // GEMM below can accumulate on top of it -- confirm against
+  // math::expand_bias. The output then shares the expanded bias storage.
+  math::expand_bias(bias, axis, output->dims());
+  output->ShareDataWith(bias);
+  int groups = param.Groups();
+  std::vector<int> strides = param.Strides();
+  std::vector<int> paddings = param.Paddings();
+  std::vector<int> dilations = param.Dilations();
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+  // Number of spatial dimensions: filter rank minus its two leading dims.
+  size_t data_dim = filter_shape_vec.size() - 2;
+  // Column buffer shape: {input dim 1 / groups, filter spatial dims...,
+  // output spatial dims...}.
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = input->dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+  // 2-D view of the column buffer used as the GEMM right-hand operand.
+  framework::DDim col_matrix_shape =
+      framework::flatten_to_2d(col_shape, data_dim + 1);
+  bool is_expand =
+      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
+  Tensor col;
+  Tensor col_matrix;
+  // A dedicated column buffer is only allocated when expansion is needed;
+  // otherwise the input slice itself is shared inside the loop below.
+  if (is_expand) {
+    col.mutable_data<float>(col_shape);
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+  // Shape of one batch item: the input dims with dim 0 dropped.
+  framework::DDim input_shape = framework::slice_ddim(
+      input->dims(), 1, static_cast<int>(input->dims().size()));
+  // Flatten the (local copy of the) filter to {dim 0, everything else}.
+  framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                         filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+  // Per-batch output viewed as {dim 1, remaining elements per batch item}.
+  framework::DDim output_matrix_shape = {
+      output->dims()[1],
+      output->numel() / (output->dims()[0] * output->dims()[1])};
+  // convolution operator: im2col(or vol2col) + gemm
+  int in_step = static_cast<int>(input->dims()[1]) / groups;
+  int out_step = static_cast<int>(output->dims()[1]) / groups;
+  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+    for (int g = 0; g < groups; g++) {
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+      if (!is_expand) {
+        // No expansion required: reuse the input slice as the column matrix.
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        // im2col
+        im2col(in_slice, dilations, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &col);
+      } else if (data_dim == 3U) {
+        // vol2col
+        vol2col(in_slice, dilations, strides, paddings, &col);
+      }
+      // gemm
+      // NOTE(review): the trailing 1 looks like beta = 1, i.e. the product is
+      // added onto the bias already present in the output -- confirm against
+      // math::matmul.
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      math::matmul<float>(filter_slice, false, col_matrix, false,
+                          static_cast<float>(1), &out_slice,
+                          static_cast<float>(1));
+    }
+  }
+}
+// Entry point for the fused conv+add kernel. 3x3 depthwise configurations are
+// routed to the specialised depthwise kernels; every other configuration goes
+// through ConvAddBasic.
+template <typename P>
+void ConvAddCompute(const FusionConvAddParam &param) {
+  // Holds when Groups equals dim 1 of the input, input and output agree on
+  // dim 1, and filter dims 2 and 3 are both 3.
+  const bool depthwise_3x3 =
+      param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3;
+  if (!depthwise_3x3) {
+    ConvAddBasic(param);
+    return;
+  }
+  // NOTE(review): only Strides()[0] is inspected before choosing the s1p1
+  // kernel; paddings are not checked here -- confirm that is intended.
+  if (param.Strides()[0] == 1) {
+    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
+                               param.Bias(), true);
+  } else {
+    math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
+                           param.Filter(), param.Bias(), param.Output(), true);
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -15,19 +15,21 @@ limitations under the License. */
 #ifdef CONV_OP
 #pragma once
-#include <operators/math/depthwise_conv_3x3.h>
 #include <vector>
+#include "operators/math/conv_func.h"
+#include "operators/math/depthwise_conv_3x3.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
 inline void ConvBasic(const ConvParam &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor *output = param.Output();
+  output->mutable_data<float>();
  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
@@ -111,20 +113,18 @@ inline void ConvBasic(const ConvParam &param) {
 template <typename P>
 void ConvCompute(const ConvParam &param) {
-  Tensor Bias;
-  Bias.mutable_data<float>({param.Groups()});
  if (param.Groups() == param.Input()->dims()[1] &&
      param.Input()->dims()[1] == param.Output()->dims()[1] &&
      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
-                               &Bias, false);
+                               nullptr, false);
  } else if (param.Groups() == param.Input()->dims()[1] &&
             param.Input()->dims()[1] == param.Output()->dims()[1] &&
             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
+             param.Filter()->dims()[2] == 3) {
    math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
-                           param.Filter(), &Bias, param.Output(), false);
+                           param.Filter(), nullptr, param.Output(), false);
  } else {
    ConvBasic(param);
  }

--- a/src/operators/kernel/central-arm-func/pool_arm_func.h
+++ b/src/operators/kernel/central-arm-func/pool_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POOL_OP
+#pragma once
+#include <string>
+#include <vector>
+#include "operators/math/pooling.h"
+namespace paddle_mobile {
+namespace operators {
+using framework::Tensor;
+// Generic pooling fallback: runs math::PoolFunctor with either an average or
+// a max pooling process, selected by pooling_type ("avg" / "max"). Any other
+// pooling_type leaves `out` untouched.
+inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
+                      std::vector<int> strides, std::vector<int> paddings,
+                      const Tensor *in_x, Tensor *out) {
+  if (pooling_type == "avg") {
+    math::AvgPool<float> avg_process;
+    math::PoolFunctor<CPU, math::AvgPool<float>, float> avg_forward;
+    avg_forward(*in_x, ksize, strides, paddings, avg_process, out);
+    return;
+  }
+  if (pooling_type == "max") {
+    math::MaxPool<float> max_process;
+    math::PoolFunctor<CPU, math::MaxPool<float>, float> max_forward;
+    max_forward(*in_x, ksize, strides, paddings, max_process, out);
+  }
+}
+// Entry point for the pooling kernel.
+//
+// Reads the pooling type, kernel size, strides and paddings from `param`
+// and dispatches to:
+//   - PoolBasic with a whole-window kernel for global pooling,
+//   - the specialised 3x3 / 2x2 kernels when the kernel size matches,
+//   - PoolBasic for every other kernel size.
+//
+// Fixes over the previous revision:
+//   - Global pooling used to rewrite ksize/paddings and then return without
+//     running any kernel, so the output was never computed. It now runs the
+//     generic PoolBasic path with the adjusted window.
+//   - A kernel-size vector whose length is not 2 used to be logged and then
+//     indexed anyway (ksize[1]); the function now returns after logging.
+template <typename P>
+void PoolCompute(const PoolParam &param) {
+  const Tensor *in_x = param.Input();
+  Tensor *out = param.Output();
+  std::string pooling_type = param.PoolingType();
+  std::vector<int> ksize = param.Ksize();
+  std::vector<int> strides = param.Strides();
+  std::vector<int> paddings = param.Paddings();
+  if (ksize.size() != 2) {
+    LOG(paddle_mobile::LogLevel::kLOG_ERROR)
+        << "Pool op only supports 2D and 3D input.";
+    // Everything below indexes ksize[0] and ksize[1]; bail out instead of
+    // reading past the end of the vector.
+    return;
+  }
+  if (param.isGlobalPooling()) {
+    // Global pooling: the window spans input dims 2 and 3 with no padding.
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      paddings[i] = 0;
+      ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+    }
+    // The generic functor accepts any window size, so use it for the
+    // whole-input window rather than the fixed-size kernels.
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+    return;
+  }
+  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max") {
+      if (strides[0] == strides[1] && strides[0] == 1 &&
+          paddings[0] == paddings[1] && paddings[1] == 1) {
+        math::Pool3x3Maxs1p1(in_x, out);
+      } else {
+        math::Pool3x3Max(strides, paddings, in_x, out);
+      }
+    } else if (pooling_type == "avg") {
+      if (strides[0] == strides[1] && strides[0] == 1 &&
+          paddings[0] == paddings[1] && paddings[1] == 1) {
+        math::Pool3x3Avgs1p1(in_x, out);
+      } else {
+        math::Pool3x3Avg(strides, paddings, in_x, out);
+      }
+    }
+  } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max") {
+      math::Pool2x2Max(strides, paddings, in_x, out);
+    } else if (pooling_type == "avg") {
+      math::Pool2x2Avg(strides, paddings, in_x, out);
+    }
+  } else {
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
+++ b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SIGMOID_OP
+#pragma once
+#include "operators/op_param.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#include "operators/math/math_func_neon.h"
+#endif
+namespace paddle_mobile {
+namespace operators {
+using framework::DDim;
+// Element-wise sigmoid, Y = 1 / (1 + exp(-X)), computed with NEON intrinsics.
+//
+// Declared `inline` because this definition lives in a header: without it,
+// including the header from more than one translation unit produces
+// multiple-definition link errors.
+//
+// X: input tensor, read as float.
+// Y: output tensor, written as float; the code assumes it holds at least as
+//    many elements as X.
+//
+// NOTE(review): when __ARM_NEON is not defined the body is empty and Y is
+// never written -- confirm no non-NEON build relies on this function.
+inline void sigmoid(const Tensor *X, Tensor *Y) {
+#if __ARM_NEON
+  const float *input = X->data<float>();
+  float *output = Y->mutable_data<float>();
+  const DDim &dDim = X->dims();
+  // Split the dims into an outer part (parallelised) and an inner part
+  // (vectorised): rank >= 4 keeps dims 0..1 outer, lower ranks keep dim 0.
+  int axis_index = 1;
+  if (dDim.size() < 4) {
+    axis_index = 0;
+  }
+  DDim outer_ddim =
+      paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
+  DDim inner_ddim =
+      paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
+  int out_size = paddle_mobile::framework::product(outer_ddim);
+  int inner_size = paddle_mobile::framework::product(inner_ddim);
+  DLOG << "outsize=" << out_size;
+  DLOG << "innersize=" << inner_size;
+  #pragma omp parallel for
+  for (int i = 0; i < out_size; ++i) {
+    const float *input_outer_ptr = input + i * inner_size;
+    float *output_outer_ptr = output + i * inner_size;
+    // nn: number of full 4-float vectors; remain: scalar tail elements.
+    int nn = inner_size >> 2;
+    int remain = inner_size - (nn << 2);
+    float32x4_t _one = vdupq_n_f32(1.f);
+    for (; nn > 0; nn--) {
+      float32x4_t data = vld1q_f32(input_outer_ptr);
+      data = vnegq_f32(data);
+      data = exp_ps(data);
+      data = vaddq_f32(data, _one);
+      // Reciprocal estimate refined by one vrecpsq_f32 step.
+      float32x4_t out_data = vrecpeq_f32(data);
+      out_data = vmulq_f32(vrecpsq_f32(data, out_data), out_data);
+      vst1q_f32(output_outer_ptr, out_data);
+      input_outer_ptr += 4;
+      output_outer_ptr += 4;
+    }
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; remain > 0; remain--) {
+      *output_outer_ptr = 1.f / (1.f + exp(-*input_outer_ptr));
+      output_outer_ptr++;
+      input_outer_ptr++;
+    }
+  }
+#endif
+}
+// Sigmoid entry point: resizes Out to InputX's dims, then applies sigmoid()
+// from InputX into Out.
+template <typename P>
+void SigmoidCompute(const SigmoidParam &param) {
+  Tensor *result = param.Out();
+  const Tensor *source = param.InputX();
+  // Copy the dims first so the resize does not depend on the source tensor.
+  auto source_dims = source->dims();
+  result->Resize(source_dims);
+  sigmoid(source, result);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/central-arm-func/softmax_arm_func.h
+++ b/src/operators/kernel/central-arm-func/softmax_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SOFTMAX_OP
+#pragma once
+#include "../../math/softmax.h"
+namespace paddle_mobile {
+namespace operators {
+// Softmax entry point: resizes Out to InputX's dims, then applies
+// math::SoftmaxFuntor<CPU, float> from InputX into Out.
+template <typename P>
+void SoftmaxCompute(const SoftmaxParam &param) {
+  Tensor *result = param.Out();
+  const Tensor *source = param.InputX();
+  auto source_dims = source->dims();
+  result->Resize(source_dims);
+  math::SoftmaxFuntor<CPU, float> softmax;
+  softmax(source, result);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/pool_kernel.h
+++ b/src/operators/kernel/pool_kernel.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #pragma once
 #include "framework/operator.h"
-#include "operators/math/pooling.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/softmax_kernel.h
+++ b/src/operators/kernel/softmax_kernel.h
@@ -23,8 +23,6 @@ namespace paddle_mobile {
 namespace operators {
 using framework::OpKernelBase;
-void simoid(Tensor *X, Tensor *Y);
 template <typename DeviceType, typename T>
 class SoftmaxKernel : public OpKernelBase<DeviceType, SoftmaxParam> {
 public:

--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -245,7 +245,10 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
  const float *input_data = input->data<float>();
  const float *filter_data = filter->data<float>();
  float *output_data = output->data<float>();
-  const float *bias_data = bias->data<float>();
+  const float *bias_data;
+  if (if_bias) {
+    bias_data = bias->data<float>();
+  }
  const int h = static_cast<int>(input->dims()[2]);
  const int w = static_cast<int>(input->dims()[3]);

--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
@@ -13,9 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef POOL_OP
-#include "operators/math/pool_3x3.h"
+#include "pool_3x3.h"
-#include <climits>
 #include "framework/tensor.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif  // __ARM_NEON
+#include <climits>
 namespace paddle_mobile {
 namespace operators {
 namespace math {

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -195,8 +195,7 @@ class OpParam {
 class ConvParam : OpParam {
 public:
  ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const framework::AttributeMap &attrs,
+            const AttributeMap &attrs, const Scope &scope) {
-            const framework::Scope &scope) {
    filter_ = FilterFrom<LoDTensor>(inputs, scope);
    input_ = InputFrom<LoDTensor>(inputs, scope);
    output_ = OutputFrom<LoDTensor>(outputs, scope);
@@ -237,12 +236,11 @@ Print &operator<<(Print &printer, const ConvParam &conv_param);
 class ElementwiseAddParam : OpParam {
 public:
  ElementwiseAddParam(const VariableNameMap &inputs,
-                      const VariableNameMap &outputs,
+                      const VariableNameMap &outputs, const AttributeMap &attrs,
-                      const framework::AttributeMap &attrs,
+                      const Scope &scope) {
-                      const framework::Scope &scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
+    input_y_ = InputYFrom<LoDTensor>(inputs, scope);
-    input_y_ = InputYFrom<framework::LoDTensor>(inputs, scope);
+    out_ = OutFrom<LoDTensor>(outputs, scope);
-    out_ = OutFrom<framework::LoDTensor>(outputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
  }
@@ -267,11 +265,10 @@ class ElementwiseAddParam : OpParam {
 class MulParam : OpParam {
 public:
  MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-           const framework::AttributeMap &attrs,
+           const AttributeMap &attrs, const Scope &scope) {
-           const framework::Scope &scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
+    input_y_ = InputYFrom<LoDTensor>(inputs, scope);
-    input_y_ = InputYFrom<framework::LoDTensor>(inputs, scope);
+    out_ = OutFrom<LoDTensor>(outputs, scope);
-    out_ = OutFrom<framework::LoDTensor>(outputs, scope);
    x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
    y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
  }
@@ -299,10 +296,9 @@ class MulParam : OpParam {
 class ConcatParam : public OpParam {
 public:
  ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-              const framework::AttributeMap &attrs,
+              const AttributeMap &attrs, const Scope &scope) {
-              const framework::Scope &scope) {
    inputs_ = InputMultiFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<framework::LoDTensor>(outputs, scope);
+    out_ = OutFrom<LoDTensor>(outputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
  }
@@ -323,11 +319,10 @@ class ConcatParam : public OpParam {
 class LrnParam : public OpParam {
 public:
  LrnParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-           const framework::AttributeMap &attrs,
+           const AttributeMap &attrs, const Scope &scope) {
-           const framework::Scope &scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
+    out_ = OutFrom<LoDTensor>(outputs, scope);
-    out_ = OutFrom<framework::LoDTensor>(outputs, scope);
+    mid_out_ = MidOutFrom<LoDTensor>(outputs, scope);
-    mid_out_ = MidOutFrom<framework::LoDTensor>(outputs, scope);
    n_ = GetAttr<int>("n", attrs);
    alpha_ = GetAttr<float>("alpha", attrs);
    beta_ = GetAttr<float>("beta", attrs);
@@ -367,14 +362,13 @@ class LrnParam : public OpParam {
 class BatchNormParam : OpParam {
 public:
  BatchNormParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs,
+                 const AttributeMap &attrs, const Scope &scope) {
-                 const framework::Scope &scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
+    output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
-    output_y_ = OutputYFrom<framework::LoDTensor>(outputs, scope);
+    input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
-    input_bias_ = InputBiasFrom<framework::LoDTensor>(inputs, scope);
+    input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
-    input_mean_ = InputMeanFrom<framework::LoDTensor>(inputs, scope);
+    input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
-    input_scale_ = InputScaleFrom<framework::LoDTensor>(inputs, scope);
+    input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
-    input_variance_ = InputVarianceFrom<framework::LoDTensor>(inputs, scope);
    epsilon_ = GetAttr<float>("epsilon", attrs);
    momentum_ = GetAttr<float>("momentum", attrs);
    is_test_ = GetAttr<bool>("is_test", attrs);
@@ -418,11 +412,10 @@ class BatchNormParam : OpParam {
 class PoolParam : public OpParam {
 public:
  PoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const framework::AttributeMap &attrs,
+            const AttributeMap &attrs, const Scope &scope) {
-            const framework::Scope &scope) {
+    input_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_ = InputXFrom<framework::LoDTensor>(inputs, scope);
-    output_ = OutFrom<framework::LoDTensor>(outputs, scope);
+    output_ = OutFrom<LoDTensor>(outputs, scope);
    pooling_type_ = GetAttr<string>("pooling_type", attrs);
    ksize_ = GetAttr<vector<int>>("ksize", attrs);
    strides_ = GetAttr<vector<int>>("strides", attrs);
@@ -464,13 +457,11 @@ class PoolParam : public OpParam {
 class PriorBoxParam : public OpParam {
 public:
  PriorBoxParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-                const framework::AttributeMap &attrs,
+                const AttributeMap &attrs, const Scope &scope) {
-                const framework::Scope &scope) {
+    input_ = InputFrom<LoDTensor>(inputs, scope);
-    input_ = InputFrom<framework::LoDTensor>(inputs, scope);
+    input_image_ = InputImageFrom<LoDTensor>(inputs, scope);
-    input_image_ = InputImageFrom<framework::LoDTensor>(inputs, scope);
+    output_boxes_ = OutputBoxesFrom<LoDTensor>(outputs, scope);
-    output_boxes_ = OutputBoxesFrom<framework::LoDTensor>(outputs, scope);
+    output_variances_ = OutputVariancesFrom<LoDTensor>(outputs, scope);
-    output_variances_ =
-        OutputVariancesFrom<framework::LoDTensor>(outputs, scope);
    min_sizes_ = GetAttr<vector<float>>("min_sizes", attrs);
    max_sizes_ = GetAttr<vector<float>>("max_sizes", attrs);
    aspect_ratios_ = GetAttr<vector<float>>("aspect_ratios", attrs);
@@ -528,13 +519,11 @@ class PriorBoxParam : public OpParam {
 class BoxCoderParam : public OpParam {
 public:
  BoxCoderParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-                const framework::AttributeMap &attrs,
+                const AttributeMap &attrs, const Scope &scope) {
-                const framework::Scope &scope) {
+    input_priorbox_ = InputPriorBoxFrom<LoDTensor>(inputs, scope);
-    input_priorbox_ = InputPriorBoxFrom<framework::LoDTensor>(inputs, scope);
+    input_priorboxvar_ = InputPriorBoxVarFrom<LoDTensor>(inputs, scope);
-    input_priorboxvar_ =
+    input_targetbox_ = InputTargetBoxFrom<LoDTensor>(inputs, scope);
-        InputPriorBoxVarFrom<framework::LoDTensor>(inputs, scope);
+    output_box_ = OutputBoxFrom<LoDTensor>(outputs, scope);
-    input_targetbox_ = InputTargetBoxFrom<framework::LoDTensor>(inputs, scope);
-    output_box_ = OutputBoxFrom<framework::LoDTensor>(outputs, scope);
    code_type_ = GetAttr<std::string>("code_type", attrs);
  }
  const Tensor *InputPriorBox() const { return input_priorbox_; }
@@ -560,10 +549,9 @@ class BoxCoderParam : public OpParam {
 class SoftmaxParam : public OpParam {
 public:
  SoftmaxParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-               const framework::AttributeMap &attrs,
+               const AttributeMap &attrs, const Scope &scope) {
-               const framework::Scope &scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
+    out_ = OutFrom<LoDTensor>(outputs, scope);
-    out_ = OutFrom<framework::LoDTensor>(outputs, scope);
  }
  const Tensor *InputX() const { return input_x_; }
  Tensor *Out() const { return out_; }
@@ -578,10 +566,9 @@ class SoftmaxParam : public OpParam {
 class SigmoidParam : public OpParam {
 public:
  SigmoidParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-               const framework::AttributeMap &attrs,
+               const AttributeMap &attrs, const Scope &scope) {
-               const framework::Scope &scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
+    out_ = OutFrom<LoDTensor>(outputs, scope);
-    out_ = OutFrom<framework::LoDTensor>(outputs, scope);
  }
  const Tensor *InputX() const { return input_x_; }
  Tensor *Out() const { return out_; }
@@ -643,9 +630,9 @@ class MultiClassNMSParam : public OpParam {
 class FeedParam : public OpParam {
 public:
  FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const framework::AttributeMap &attrs, framework::Scope &scope) {
+            const AttributeMap &attrs, Scope &scope) {
-    input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<framework::LoDTensor>(outputs, scope);
+    out_ = OutFrom<LoDTensor>(outputs, scope);
    auto var = scope.Var("batch_size");
    batch_size = var->GetValue<int>();
  }
@@ -662,10 +649,9 @@ class FeedParam : public OpParam {
 class FetchParam : public OpParam {
 public:
  FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-             const framework::AttributeMap &attrs,
+             const AttributeMap &attrs, const Scope &scope) {
-             const framework::Scope &scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_x_ = InputXFrom<framework::LoDTensor>(inputs, scope);
+    out_ = OutFrom<LoDTensor>(outputs, scope);
-    out_ = OutFrom<framework::LoDTensor>(outputs, scope);
  }
  const Tensor *InputX() const { return input_x_; }
  Tensor *Out() const { return out_; }
@@ -863,10 +849,10 @@ class FusionConvAddBNReluParam : public OpParam {
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
    dilations_ = GetAttr<vector<int>>("dilations", attrs);
    groups = GetAttr<int>("groups", attrs);
-    input_bias_ = InputBiasFrom<framework::LoDTensor>(inputs, scope);
+    input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
-    input_mean_ = InputMeanFrom<framework::LoDTensor>(inputs, scope);
+    input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
-    input_scale_ = InputScaleFrom<framework::LoDTensor>(inputs, scope);
+    input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
-    input_variance_ = InputVarianceFrom<framework::LoDTensor>(inputs, scope);
+    input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
    epsilon_ = GetAttr<float>("epsilon", attrs);
    momentum_ = GetAttr<float>("momentum", attrs);
    is_test_ = GetAttr<bool>("is_test", attrs);

--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -17,25 +17,25 @@ limitations under the License. */
 #include "../test_include.h"
 int main() {
-  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  bool optimize = true;
  auto time1 = time();
  //  auto program = loader.Load(g_googlenet, optimize);
-  if (paddle_mobile.Load(g_googlenet_combine + "/model",
+  auto program = loader.Load(g_googlenet_combine + "/model",
-                         g_googlenet_combine + "/params", optimize)) {
+                             g_googlenet_combine + "/params", optimize);
-    auto time2 = time();
+  auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
+  DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
-    std::vector<float> input;
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
-    std::vector<int64_t> dims{1, 3, 224, 224};
+  std::vector<float> input;
-    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+  std::vector<int64_t> dims{1, 3, 224, 224};
-    auto time3 = time();
+  GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+  auto time3 = time();
-    for (int i = 0; i < 10; ++i) {
+  for (int i = 0; i < 10; ++i) {
-      paddle_mobile.Predict(input, dims);
+    executor.Predict(input, dims);
-    }
-    auto time4 = time();
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
  }
+  auto time4 = time();
+  DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
  return 0;
 }
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -32,8 +32,8 @@ build_for_mac() {
 build_for_android() {
    #rm -rf "../build"
-    if [ -z "${ANDROID_NDK}" ]; then
+    if [ -z "${NDK_ROOT}" ]; then
-        echo "ANDROID_NDK not found!"
+        echo "NDK_ROOT not found!"
        exit -1
    fi