diff --git a/CMakeLists.txt b/CMakeLists.txt
index a9382f9697e5c39c7e7195b94fc5332d39dc18a9..f60846e98aa9ca36bd6bd68cccdda6e3d2ff616a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
 cmake_minimum_required(VERSION 3.0)
 project(paddle-mobile)
-add_definitions(-DPADDLE_MOBILE_DEBUG="true")
+add_definitions(-DPADDLE_MOBILE_DEBUG)
+add_definitions(-DENABLE_EXCEPTION)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 set(CMAKE_BUILD_TYPE RelWithDebInfo)
diff --git a/scripts/push2android.sh b/scripts/push2android.sh
new file mode 100644
index 0000000000000000000000000000000000000000..44b0ee32e99ccddf5cc6060882dc37158c149693
--- /dev/null
+++ b/scripts/push2android.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env sh
+
+push_fn () {
+MODELS_PATH="../test/models/*"
+EXE_FILE="../test/build/*"
+EXE_DIR="data/local/tmp/bin"
+MODELS_DIR="data/local/tmp/models"
+LIB_PATH="../build/release/arm-v7a/build/*"
+adb push ${EXE_FILE} ${EXE_DIR}
+adb push ${LIB_PATH} ${EXE_DIR}
+adb push ${MODELS_PATH} ${MODELS_DIR}
+echo "test files sync completed"
+}
+push_fn
diff --git a/src/common/enforce.h b/src/common/enforce.h
index abd6217fbeb62a09a7d4a9fe9b6d85d0ab1cf2d8..52bda2258a00c7444762fe8297380c1c7752dd42 100644
--- a/src/common/enforce.h
+++ b/src/common/enforce.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_MOBILE_DEBUG
+#ifdef ENABLE_EXCEPTION
 #include <stdio.h>
 #include <exception>
 #include <sstream>
@@ -25,7 +25,7 @@ limitations under the License. */
 
 namespace paddle_mobile {
 
-#ifdef PADDLE_MOBILE_DEBUG
+#ifdef ENABLE_EXCEPTION
 struct PaddleMobileException : public std::exception {
   const std::string exception_prefix = "paddle mobile C++ Exception: \n";
   std::string message;
@@ -64,7 +64,7 @@ struct PaddleMobileException : public std::exception {
   }
 #else
 #define PADDLE_MOBILE_THROW_EXCEPTION(...)
-#define PADDLE_MOBILE_ASSERT(stat, ...)
+#define PADDLE_MOBILE_ENFORCE(stat, ...)
 #endif
 
 }  // namespace paddle_mobile
diff --git a/src/common/types.h b/src/common/types.h
index b25ae329931a79e9dd73b4a213a86aacc2464787..ca9e64cc60f067e952dd481ce978b7b1a5bc5f13 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -95,17 +95,23 @@ static const std::string G_OP_TYPE_FETCH = "fetch";
 
 static std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
-    op_input_output_key = {{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
-                           {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
-                           {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
-                           {G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
-                           {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}};
-
+    op_input_output_key = {
+        {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
+        {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
+        {G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_BOX_CODER,
+         {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
+        {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
+        {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
+        {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}};
 }  // namespace paddle_mobile
diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp
index 808002d4c8f3193744ef68c1db881a787d19b133..46feb97cb879bb092e1a8f6955f7f50712f9ea3b 100644
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -28,18 +28,6 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
   return it->second.second;
 }
 
-template <typename T>
-static T *GetVarValue(const string &key, const VariableNameMap &var_map,
-                      const Scope &scope) {
-  auto var_vec = var_map.at(key);
-  if (!var_vec.empty()) {
-    auto var = scope.FindVar(var_vec[0]);
-    return var->GetMutable<T>();
-  } else {
-    return nullptr;
-  }
-}
-
 template <typename Dtype>
 OperatorBase<Dtype>::OperatorBase(const std::string &type,
                                   const VariableNameMap &inputs,
@@ -60,7 +48,7 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
 template <typename Dtype>
 void OperatorBase<Dtype>::Run() const {
   RunImpl();
-#ifdef PADDLE_MOBILE_DEBUG
+#if (PADDLE_MOBILE_DEBUG)
   vector<string> output_keys = GetOutKeys();
   for (const auto key : output_keys) {
     Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);
diff --git a/src/framework/operator.h b/src/framework/operator.h
index 6e5a2b089db780b3b52ca6b0c2caa68f35fdfc9e..cb27985244a1dd9e92a54edce9f15fd3d8defaad 100644
--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -38,6 +38,19 @@ namespace paddle_mobile {
 namespace framework {
 using std::string;
 using std::vector;
+
+template <typename T>
+static T *GetVarValue(const string &key, const VariableNameMap &var_map,
+                      const Scope &scope) {
+  auto var_vec = var_map.at(key);
+  if (!var_vec.empty()) {
+    auto var = scope.FindVar(var_vec[0]);
+    return var->GetMutable<T>();
+  } else {
+    return nullptr;
+  }
+}
+
 template <typename Dtype>
 class OperatorBase : PaddleMobileObject {
  public:
diff --git a/src/io.cpp b/src/io.cpp
index 002e73b79648320c229786f8492f4c0e8b299d83..ac89106e4988b35d54a59991009a98739e50d024 100644
--- a/src/io.cpp
+++ b/src/io.cpp
@@ -220,7 +220,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
       }
     }
   }
-  //  originProgramDesc->Description("program: ");
+  originProgramDesc->Description("program: ");
 
   if (optimize) {
     framework::ProgramOptimize program_optimize;
@@ -371,31 +371,47 @@ void Executor<Dtype, P>::InitMemory() {
 }
 
 template <typename Dtype, Precision P>
-void Executor<Dtype, P>::Predict(const framework::Tensor &t, int block_id) {
+std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
+    const framework::Tensor &t) {
   framework::Variable *g_feed_value = program_.scope->Var("feed");
   framework::Tensor *feed_tensor =
       g_feed_value->GetMutable<framework::LoDTensor>();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
   std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(block_id);
+      to_predict_program_->Block(0);
   for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
     auto op = ops_of_block_[*to_predict_block.get()][j];
     op->Run();
   }
+  const auto &ops = ops_of_block_[*to_predict_program_->Block(0)];
+  auto last_op = ops.rbegin();
+  auto output_map = (*last_op)->Outputs();
+  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
+  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
+  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
+      out_keys[0], output_map, *(program_.scope));
+  // The scope owns output_tensor: return a copy, never adopt the raw pointer.
+  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
+}
+template <typename Dtype, Precision P>
+std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
+    const framework::Tensor &t, int block_id) {
+  return Predict(t);
 }
 
 template <typename Dtype, Precision P>
 std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
     const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
   framework::Tensor tensor(input, framework::make_ddim(dims));
-
-  Predict(tensor, 0);
-
-  framework::Variable *g_feed_value = program_.scope->Var("col");
-  auto feed_tensor = g_feed_value->GetMutable<framework::Tensor>();
-
-  return {};
+  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
+  Executor<Dtype, P>::Ptype *output_ptr =
+      output_tensor->data<typename Executor<Dtype, P>::Ptype>();
+  std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
+  for (int j = 0; j < output_tensor->numel(); ++j) {
+    result_vector.push_back(output_ptr[j]);
+  }
+  return result_vector;
 }
 
 template class Executor<CPU, Precision::FP32>;
diff --git a/src/io.h b/src/io.h
index de2d359bf58d1ad328defd2f51e87e2d6bfe6295..ae99197baa97c07d2a883f8721d533b85ab7873a 100644
--- a/src/io.h
+++ b/src/io.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <memory.h>
+#include <map>
 #include <string>
 #include <vector>
 
@@ -44,24 +45,25 @@ class Executor {
  public:
   typedef typename PrecisionTrait<P>::ptype Ptype;
 
-  Executor() = default;
-
   Executor(const framework::Program<Dtype> p, int batch_size = 1,
            bool use_optimize = true);
 
-  //  std::shared_ptr<framework::Tensor> Predict(framework::Tensor &t);
+  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
 
   std::vector<Ptype> Predict(const std::vector<Ptype> &input,
                              const std::vector<int64_t> &dims);
 
  protected:
+  Executor() = default;
+
   void InitMemory();
   void LoadMemory(const framework::VarDesc var_desc,
                   framework::LoDTensor *tensor, const std::string &file_path);
   framework::Program<Dtype> program_;
   int batch_size_ = 1;
   std::shared_ptr<framework::ProgramDesc> to_predict_program_;
-  void Predict(const framework::Tensor &t, int block_id);
+  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
+                                             int block_id);
   std::map<framework::BlockDesc,
            std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
       ops_of_block_;
diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp
index 148b0f69f9633f1d82979ab324c5997fb6fcb1c1..bfddcf14acbba016c4e4333e05fcc7dd6eebc509 100644
--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -21,13 +21,6 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 
-int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
-                   int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
-  return output_size;
-}
-
 template <typename Dtype, typename T>
 void ConvOp<Dtype, T>::InferShape() const {
   //  std::cout << " begin get dims: " << std::endl;
diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h
index 1557f2f06eed8237f7b7e9ff44adc233129a49a3..f15f286b606db1403b0e0e609bfc38caac2c5105 100644
--- a/src/operators/conv_op.h
+++ b/src/operators/conv_op.h
@@ -44,5 +44,12 @@ class ConvOp : public framework::OperatorWithKernel<DeviceType> {
   ConvParam param_;
 };
 
+inline int ConvOutputSize(int input_size, int filter_size, int dilation,
+                          int padding, int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  return output_size;
+}
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2538298175c5ea40d7e44338caee853a73c089c4
--- /dev/null
+++ b/src/operators/depthwise_conv_op.cpp
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/depthwise_conv_op.h"
+#include <vector>
+#include "framework/data_type.h"
+#include "framework/op_proto_maker.h"
+#include "framework/op_registry.h"
+#include "operators/conv_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void DepthwiseConvOp<Dtype, T>::InferShape() const {
+  auto in_dims = param_.Input()->dims();
+  auto filter_dims = param_.Filter()->dims();
+  const std::vector<int> &strides = param_.Strides();
+  std::vector<int> paddings = param_.Paddings();
+  int groups = param_.Groups();
+  std::vector<int> dilations = param_.Dilations();
+
+  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
+                         dilations.size() == paddings.size() &&
+                         paddings.size() == strides.size()),
+                        "ConvParam is not suitable");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
+  }
+
+  framework::DDim ddim = framework::make_ddim(output_shape);
+  param_.Output()->Resize(ddim);
+}
+
+template class DepthwiseConvOp<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+USE_OP(depthwise_conv2d);
+REGISTER_OPERATOR(depthwise_conv2d, ops::DepthwiseConvOp);
diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c47fa0ffcacd54a5ddf7280419ca1170173bde1b
--- /dev/null
+++ b/src/operators/depthwise_conv_op.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/depthwise_conv_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class DepthwiseConvOp : public framework::OperatorWithKernel<DeviceType> {
+ public:
+  DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs,
+                  const VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs,
+                  std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs,
+                                                  scope),
+        param_(inputs, outputs, attrs, *scope) {}
+
+  using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel;
+  void InferShape() const override;
+
+  void RunImpl() const {
+    operators::DepthwiseConvKernel<DeviceType, T> kernel;
+    kernel.Compute(param_);
+    this->ClearVariables({"Filter", "Input"});
+  }
+
+ private:
+  ConvParam param_;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp
index 1e2572b984734dcd88be7c1c750fc0f07448e66d..f04b8156c9d3c88520b1c74b60a20f41e7fedc98 100644
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -17,19 +17,6 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 
-bool IsExpand(const std::vector<int64_t> &filter_dim,
-              const std::vector<int> &strides, const std::vector<int> &paddings,
-              const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
-
 template <>
 void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
   LOG(kLOG_DEBUG) << param;
diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1da52fa8d469bd81d043843d7bcca3a7b01f6663
--- /dev/null
+++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
@@ -0,0 +1,126 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/kernel/depthwise_conv_kernel.h"
+#include "operators/kernel/conv_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
+  LOG(kLOG_DEBUG) << param;
+
+  const Tensor *input = param.Input();
+  Tensor filter = *param.Filter();
+  Tensor *output = param.Output();
+  output->mutable_data<float>();
+
+  int groups = param.Groups();
+  std::vector<int> strides = param.Strides();
+  std::vector<int> paddings = param.Paddings();
+  std::vector<int> dilations = param.Dilations();
+
+  //  DLOG << " compute end get Attrs " << strides[0];
+
+  const int batch_size = static_cast<int>(input->dims()[0]);
+
+  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = input->dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+  framework::DDim col_matrix_shape =
+      framework::flatten_to_2d(col_shape, data_dim + 1);
+
+  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+  Tensor col;
+  Tensor col_matrix;
+  if (is_expand) {
+    col.mutable_data<float>(col_shape);
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+  //  DLOG << " col_shape = " << col_shape;
+  //  DLOG << " col_matrix_shape = " << col_matrix_shape;
+
+  framework::DDim input_shape = framework::slice_ddim(
+      input->dims(), 1, static_cast<int>(input->dims().size()));
+  //  DLOG << " input_shape = " << input_shape;
+
+  framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                         filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+  //  DLOG << " filter.dims() = " << filter.dims();
+
+  framework::DDim output_matrix_shape = {
+      output->dims()[1],
+      output->numel() / (output->dims()[0] * output->dims()[1])};
+
+  // convolution operator: im2col(or vol2col) + gemm
+  int in_step = static_cast<int>(input->dims()[1]) / groups;
+  int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+    //    DLOG << " in_batch.dims() = " << in_batch.dims();
+    //    DLOG << " out_batch.dims() = " << out_batch.dims();
+
+    for (int g = 0; g < groups; g++) {
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+      if (!is_expand) {
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        // im2col
+        im2col(in_slice, dilations, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &col);
+      } else if (data_dim == 3U) {
+        // vol2col
+        vol2col(in_slice, dilations, strides, paddings, &col);
+      }
+
+      // gemm
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      //      DLOG << " out_slice " << out_slice.dims();
+      //      DLOG << " filter_slice " << filter_slice.dims();
+      //      DLOG << " col_matrix " << col_matrix.dims();
+      math::matmul<float>(filter_slice, false, col_matrix, false,
+                          static_cast<float>(1), &out_slice,
+                          static_cast<float>(0));
+      auto filter_ptr = filter_slice.data<float>();
+    }
+  }
+}
+
+template class DepthwiseConvKernel<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h
index a756e2d2417cc147cb0559f946a6a70085860ecb..d43a174ffdbf0ca6dbb39e463b8e97652c7b0daf 100644
--- a/src/operators/kernel/conv_kernel.h
+++ b/src/operators/kernel/conv_kernel.h
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <vector>
 #include "framework/operator.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
@@ -23,12 +24,28 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 
-using namespace framework;
+using framework::OpKernelBase;
 
 template <typename DeviceType, typename T>
-class ConvKernel : public framework::OpKernelBase<DeviceType, ConvParam> {
+class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
  public:
   void Compute(const ConvParam &param) const;
 };
+
+inline bool IsExpand(const std::vector<int64_t> &filter_dim,
+                     const std::vector<int> &strides,
+                     const std::vector<int> &paddings,
+                     const std::vector<int> &dilations) {
+  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
+  for (size_t j = 0; j < strides.size(); ++j) {
+    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
+    strides_1 = strides_1 && (strides[j] == 1);
+    padding_0 = padding_0 && (paddings[j] == 0);
+    dilation_1 = dilation_1 && (dilations[j] == 1);
+  }
+
+  return !(filter_1 && strides_1 && padding_0 && dilation_1);
+}
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/kernel/depthwise_conv_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..43ddfb25cd859a7e937577221215d8352b846bff
--- /dev/null
+++ b/src/operators/kernel/depthwise_conv_kernel.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DepthwiseConvKernel : public OpKernelBase<DeviceType, ConvParam> {
+ public:
+  void Compute(const ConvParam &param) const;
+};
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp
index 6eaeb6e256148598b460f1fe4e1f0cdf451f186c..224382eb2b78b1653da0cbbd9327cabb4fd9b3d1 100644
--- a/src/operators/math/softmax.cpp
+++ b/src/operators/math/softmax.cpp
@@ -136,9 +136,15 @@ class SoftmaxFuntor<CPU, T> {
 
  public:
   void operator()(const framework::Tensor *X, framework::Tensor *Y) {
+    const DDim dDim = X->dims();
+    for (int i = 0; i < dDim[0]; ++i) {
+      framework::Tensor sub_X = X->Slice(i, i + 1);
+      framework::Tensor sub_Y = Y->Slice(i, i + 1);
+
 #if __ARM_NEON
-    SoftmaxCacl(X, Y);
+      SoftmaxCacl(&sub_X, &sub_Y);
 #endif
+    }
   }
 };
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f464c3bd94f92e8cbec1509c4e82df18658a7b1f..c71306281e3354cd1856ecaa7278266b031b665c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -84,10 +84,33 @@ target_link_libraries(test-gemm paddle-mobile)
 ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
 target_link_libraries(test-enforce paddle-mobile)
 
+# gen test
+ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h  test_include.h executor_for_test.h)
+target_link_libraries(test-yolo paddle-mobile)
+
 # gen test
 ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h  test_include.h executor_for_test.h)
 target_link_libraries(test-googlenet paddle-mobile)
 
+# gen test
+ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h  test_include.h executor_for_test.h)
+target_link_libraries(test-mobilenet paddle-mobile)
+
+# gen test
+ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
+target_link_libraries(test-resnet paddle-mobile)
+# gen test
+ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h  test_include.h executor_for_test.h)
+target_link_libraries(test-mobilenetssd paddle-mobile)
+
+# gen test
+ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h  test_include.h executor_for_test.h)
+target_link_libraries(test-squeezenet paddle-mobile)
+
 # gen test
 ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp  test_include.h)
 target_link_libraries(test-sigmoid paddle-mobile)
+
+# gen test
+ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
+target_link_libraries(test-depthwise-conv-op paddle-mobile)
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index 139579e9116651c15764997d962b7d2622532146..0640af890cf9857e1dda7eeaaec593b7c613fe8a 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -18,20 +18,17 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
-  //  ../../../test/models/googlenet
-  //  ../../../test/models/mobilenet
   auto time1 = time();
   auto program = loader.Load(g_googlenet, false);
   auto time2 = time();
-  DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+  DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
   paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
-
   std::vector<float> input;
   std::vector<int64_t> dims{1, 3, 224, 224};
   GetInput<float>(g_test_image_1x3x224x224, &input, dims);
   auto time3 = time();
   executor.Predict(input, dims);
   auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
   return 0;
 }
diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e9d92e7a51b9f7abe2c451df4073428bd2bd6d5f
--- /dev/null
+++ b/test/net/test_mobilenet+ssd.cpp
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto time1 = time();
+  auto program = loader.Load(g_mobilenet_ssd, false);
+  auto time2 = time();
+  DLOG << "load cost :" << time_diff(time1, time2) << "ms";
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
+
+  std::vector<int64_t> dims{1, 3, 300, 300};
+  Tensor input_tensor;
+  SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0),
+                     static_cast<float>(1));
+
+  std::vector<float> input(input_tensor.data<float>(),
+                           input_tensor.data<float>() + input_tensor.numel());
+  auto time3 = time();
+  executor.Predict(input, dims);
+  auto time4 = time();
+  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  return 0;
+}
diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ed9a3566e3be8d5baa7e47611fc713772e94327
--- /dev/null
+++ b/test/net/test_mobilenet.cpp
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+int main() {
+  // Load mobilenet, time one Predict call on a 2x3x224x224 input, log a sum.
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto time1 = time();
+  auto program = loader.Load(g_mobilenet, false);
+  auto time2 = time();
+  DLOG << "load cost :" << time_diff(time1, time2) << "ms";
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 2, false);
+
+  std::vector<int64_t> dims{2, 3, 224, 224};
+  Tensor input_tensor;
+  SetupTensor<float>(&input_tensor, {2, 3, 224, 224}, 0.0f, 1.0f);
+  std::vector<float> input(input_tensor.data<float>(),
+                           input_tensor.data<float>() + input_tensor.numel());
+  auto time3 = time();
+  auto vec_result = executor.Predict(input, dims);
+  // Stop the clock right after Predict so the sum loop below is not timed.
+  auto time4 = time();
+  float sum = 0;
+  for (const auto item : vec_result) {
+    sum += item;
+  }
+  DLOG << "mobilenet output sum =" << sum;
+  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  return 0;
+}
diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..55f4c5efef209c421fc550c1f17422acd64b11b9
--- /dev/null
+++ b/test/net/test_resnet.cpp
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+int main() {
+  // Load the resnet model, run one Predict call, log both durations.
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto time1 = time();
+  auto program = loader.Load(g_resnet, false);
+  auto time2 = time();
+  DLOG << "load cost :" << time_diff(time1, time2) << "ms";
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
+
+  std::vector<int64_t> dims{1, 3, 32, 32};
+  Tensor input_tensor;
+  SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, 0.0f, 1.0f);
+  // Predict takes the input as a flat float vector plus dims.
+  std::vector<float> input(input_tensor.data<float>(),
+                           input_tensor.data<float>() + input_tensor.numel());
+  auto time3 = time();
+  executor.Predict(input, dims);
+  auto time4 = time();
+  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  return 0;
+}
diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..30460018fe8cc008e0031c1c713150745767fa28
--- /dev/null
+++ b/test/net/test_squeezenet.cpp
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+int main() {
+  // Load the squeezenet model, run one Predict call, log both durations.
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto time1 = time();
+  auto program = loader.Load(g_squeezenet, false);
+  auto time2 = time();
+  DLOG << "load cost :" << time_diff(time1, time2) << "ms";
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
+
+  // dims repeats the shape handed to SetupTensor; keep the two in sync.
+  std::vector<int64_t> dims{1, 3, 227, 227};
+  Tensor input_tensor;
+  SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, 0.0f, 1.0f);
+
+  // Predict takes the input as a flat float vector plus dims.
+  std::vector<float> input(input_tensor.data<float>(),
+                           input_tensor.data<float>() + input_tensor.numel());
+  auto time3 = time();
+  executor.Predict(input, dims);
+  auto time4 = time();
+  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  return 0;
+}
diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c82443e23953def917826fe4ec3b2c484b588f59
--- /dev/null
+++ b/test/net/test_yolo.cpp
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+int main() {
+  // Load the yolo model, run one Predict call, log both durations.
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto time1 = time();
+  auto program = loader.Load(g_yolo, false);
+  auto time2 = time();
+  DLOG << "load cost :" << time_diff(time1, time2) << "ms";
+  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
+
+  // NOTE(review): same 1x3x227x227 shape as the squeezenet test - confirm.
+  std::vector<int64_t> dims{1, 3, 227, 227};
+  Tensor input_tensor;
+  SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, 0.0f, 1.0f);
+
+  // Predict takes the input as a flat float vector plus dims.
+  std::vector<float> input(input_tensor.data<float>(),
+                           input_tensor.data<float>() + input_tensor.numel());
+  auto time3 = time();
+  executor.Predict(input, dims);
+  auto time4 = time();
+  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  return 0;
+}
diff --git a/test/operators/test_depthwise_conv_op.cpp b/test/operators/test_depthwise_conv_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..648b4c5db9970804a2ca140eef13e2560e36f935
--- /dev/null
+++ b/test/operators/test_depthwise_conv_op.cpp
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../executor_for_test.h"
+#include "../test_include.h"
+#include "operators/depthwise_conv_op.h"
+
+int main() {
+  // Exercise DepthwiseConvOp through Executor4Test on the mobilenet+ssd
+  // program and log every value of the resulting output tensor.
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(g_mobilenet_ssd);
+
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::DepthwiseConvOp<
+                                        paddle_mobile::CPU, float>>
+      executor(program, "depthwise_conv2d");
+
+  // No local input image is needed: SetupTensor fills the tensor instead.
+  paddle_mobile::framework::LoDTensor input;
+  SetupTensor<float>(&input, {1, 32, 150, 150}, 0.0f, 1.0f);
+  // The requested output shape is the same as the input shape.
+  auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 150, 150});
+  // Presumably (input var name, output var name) - confirm in Executor4Test.
+  auto output = executor.Predict(input, "batch_norm_0.tmp_3",
+                                 "depthwise_conv2d_0.tmp_0", out_ddim);
+
+  auto output_ptr = output->data<float>();
+  for (int j = 0; j < output->numel(); ++j) {
+    DLOG << " value of output: " << output_ptr[j];
+  }
+  return 0;
+}
diff --git a/test/test_helper.h b/test/test_helper.h
index dba4dec9bbc0a8066eef6c6dea9828dfb9954200..fc4ed6c91dc9b03c1f4dadfd8a4bc94efe3a724e 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -28,6 +28,7 @@ static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
 static const std::string g_squeezenet = "../models/squeezenet";
 static const std::string g_resnet =
     "../models/image_classification_resnet.inference.model";
+static const std::string g_yolo = "../models/yolo";  // yolo model path
 static const std::string g_test_image_1x3x224x224 =
     "../images/test_image_1x3x224x224_float";
 using paddle_mobile::framework::DDim;