diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a33db73e109042276b686e8ab74261273df87390..f07b2eeb93daa827361acc97951483c21092135f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -183,6 +183,7 @@ upstream
 
 接下来等待 review，如果有需要修改的地方，参照上述步骤更新 origin 中的对应分支即可。
 
+
 ![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294877166787.jpg)
 之后就可以提交代码了
 
@@ -222,6 +223,7 @@ upstream
      - 原因：如果仅仅修改一个文件但提交了十几个commit，每个commit只做了少量的修改，这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改，且不排除commit之间的修改存在相互覆盖的情况。
      - 建议：每次提交时，保持尽量少的commit，可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit，可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。
    - 请注意每个commit的名称：应能反映当前commit的内容，不能太随意。
+
 3. 如果解决了某个Issue的问题，请在该Pull Request的**第一个**评论框中加上：`fix #issue_number`，这样当该Pull Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
 
 此外，在回复评审人意见时，请您遵守以下约定：
diff --git a/doc/design_doc.md b/doc/design_doc.md
index bf5f78e8d805465418cad8989945f2afa7ab5587..3407c78443de0f0c7d9ebab848122c2e089e9e41 100644
--- a/doc/design_doc.md
+++ b/doc/design_doc.md
@@ -3,6 +3,7 @@
 
 #### 以下是 paddle-mobile 代码的执行流程图:
 
+
 ![执行流程图](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
 
 
@@ -14,6 +15,7 @@
 先来看一下模型, 模型分为两种结构:
  一种为参数文件是散开的, 如下图, 红框为模型结构的 protobuf 文件, 其余为参数文件
 
+
 ![模型描述](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
 
 
@@ -21,7 +23,6 @@
 
 ![模型描述combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
 
-
 loader 模块的作用是将模型结构信息 load 进内存, 将红框内的 protobuf 文件 load 进内存, 并对模型结构进行优化(如将几个细粒度的 op 融合成 粗粒度的 op, 如将 conv、 add、 batchnorm、 relu 融合为 conv\_add\_batchnorm\_relu).
 方便进行算法优化.
 
diff --git a/doc/images/devices.png b/doc/images/devices.png
new file mode 100644
index 0000000000000000000000000000000000000000..413d32c249972ee96f678d50a5cd0b36a2a03e29
Binary files /dev/null and b/doc/images/devices.png differ
diff --git a/doc/images/flow_chart.png b/doc/images/flow_chart.png
new file mode 100644
index 0000000000000000000000000000000000000000..c747230da43e2e688d7460704268631758d34596
Binary files /dev/null and b/doc/images/flow_chart.png differ
diff --git a/doc/images/model_desc.png b/doc/images/model_desc.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c026b6192c8e1d84b3a82c3db91e022f35358c2
Binary files /dev/null and b/doc/images/model_desc.png differ
diff --git a/doc/images/model_desc_combined.png b/doc/images/model_desc_combined.png
new file mode 100644
index 0000000000000000000000000000000000000000..38e7388efcfdcad53f4e80ce0ac5d3b993eb986c
Binary files /dev/null and b/doc/images/model_desc_combined.png differ
diff --git a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
index e95bd8e76c5034f3897eff81e0ba67119d04a95b..1fd1c66d4dc92a9918243b23e400ef5309422050 100644
--- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
@@ -22,11 +22,11 @@ namespace operators {
 
 template <>
 bool ConvAddBNReluKernel<CPU, float>::Init(FusionConvAddBNReluParam *param) {
-  const Tensor *mean = (*param).InputMean();
-  const Tensor *variance = (*param).InputVariance();
-  const Tensor *scale = (*param).InputScale();
-  const Tensor *bias = (*param).InputBias();
-  const float epsilon = (*param).Epsilon();
+  const Tensor *mean = param->InputMean();
+  const Tensor *variance = param->InputVariance();
+  const Tensor *scale = param->InputScale();
+  const Tensor *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
 
   auto mean_ptr = mean->data<float>();
   auto variance_ptr = variance->data<float>();
@@ -47,8 +47,8 @@ bool ConvAddBNReluKernel<CPU, float>::Init(FusionConvAddBNReluParam *param) {
     new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
     new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
   }
-  (*param).SetNewScale(new_scale);
-  (*param).SetNewBias(new_bias);
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
   return true;
 }
 
diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h
index bf96a2d46fd96516743127b71db57496e35b8a77..13fe50bf74ee164c2cc663f5a6a9eeddbfa3804b 100644
--- a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h
@@ -17,11 +17,10 @@ limitations under the License. */
 #pragma once
 #include "operators/math/depthwise_conv_3x3.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
-
-template <typename P>
-void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
+// im2col/vol2col + GEMM fallback; inline because it is defined in a header.
+inline void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
   Tensor bias = *param.Bias();
@@ -30,105 +29,122 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
   auto new_bias_ptr = new_bias.data<float>();
   auto new_scale_ptr = new_scale.data<float>();
   int axis = param.Axis();
+  Tensor *output = param.Output();
+  math::expand_bias(bias, axis, output->dims());
+  output->ShareDataWith(bias);
+
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
   std::vector<int> dilations = param.Dilations();
-  Tensor *output = param.Output();
+
+  const int batch_size = static_cast<int>(input->dims()[0]);
+
   std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
 
-  if (filter_shape_vec[2] == 3 && strides[0] == 1 && groups > 1) {
-    math::DepthwiseConvAddBNRelu3x3s1p1(input, filter, output, &bias, 1,
-                                        &new_scale, &new_bias, 1, 1);
-  } else {
-    const int batch_size = static_cast<int>(input->dims()[0]);
-
-    math::expand_bias(bias, axis, output->dims());
-    output->ShareDataWith(bias);
-
-    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-    size_t data_dim = filter_shape_vec.size() - 2;
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = input->dims()[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-    }
-    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-    framework::DDim col_matrix_shape =
-        framework::flatten_to_2d(col_shape, data_dim + 1);
-
-    bool is_expand =
-        math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-    Tensor col;
-    Tensor col_matrix;
-    if (is_expand) {
-      col.mutable_data<float>(col_shape);
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-    }
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = input->dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+  framework::DDim col_matrix_shape =
+      framework::flatten_to_2d(col_shape, data_dim + 1);
+
+  bool is_expand =
+      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
+  Tensor col;
+  Tensor col_matrix;
+  if (is_expand) {
+    col.mutable_data<float>(col_shape);
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
 
-    framework::DDim input_shape = framework::slice_ddim(
-        input->dims(), 1, static_cast<int>(input->dims().size()));
-
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
-    framework::DDim output_matrix_shape = {
-        output->dims()[1],
-        output->numel() / (output->dims()[0] * output->dims()[1])};
-
-    // convolution operator: im2col(or vol2col) + gemm
-    int in_step = static_cast<int>(input->dims()[1]) / groups;
-    int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-    math::Vol2ColFunctor<CPU, float> vol2col;
-    math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-    for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-      for (int g = 0; g < groups; g++) {
-        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-        if (!is_expand) {
-          col.ShareDataWith(in_slice);
-          col_matrix.ShareDataWith(col);
-          col_matrix.Resize(col_matrix_shape);
-        } else if (data_dim == 2U) {
-          // im2col
-          im2col(in_slice, dilations, strides,
-                 std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                  paddings[1]},
-                 &col);
-        } else if (data_dim == 3U) {
-          // vol2col
-          vol2col(in_slice, dilations, strides, paddings, &col);
-        }
-
-        // gemm
-        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        math::matmul<float>(filter_slice, false, col_matrix, false,
-                            static_cast<float>(1), &out_slice,
-                            static_cast<float>(1), false);
+  framework::DDim input_shape = framework::slice_ddim(
+      input->dims(), 1, static_cast<int>(input->dims().size()));
+
+  framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                         filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+  framework::DDim output_matrix_shape = {
+      output->dims()[1],
+      output->numel() / (output->dims()[0] * output->dims()[1])};
+
+  // convolution operator: im2col(or vol2col) + gemm
+  int in_step = static_cast<int>(input->dims()[1]) / groups;
+  int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+    for (int g = 0; g < groups; g++) {
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+      if (!is_expand) {
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        // im2col
+        im2col(in_slice, dilations, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &col);
+      } else if (data_dim == 3U) {
+        // vol2col
+        vol2col(in_slice, dilations, strides, paddings, &col);
       }
+      // gemm
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      math::matmul<float>(filter_slice, false, col_matrix, false,
+                          static_cast<float>(1), &out_slice,
+                          static_cast<float>(1));
     }
-
-    auto output_ptr = output->data<float>();
-    for (int c = 0; c < output_matrix_shape[0]; c++) {
-      int start = c * output_matrix_shape[1];
-      for (int j = 0; j < output_matrix_shape[1]; j++) {
-        output_ptr[start + j] =
-            output_ptr[start + j] * new_scale_ptr[c] + new_bias_ptr[c];
-        output_ptr[start + j] =
-            output_ptr[start + j] < 0 ? 0 : output_ptr[start + j];
-      }
+  }
+  // TODO: use NEON in special cases instead of this 2-level for loop (300ms)
+  auto output_ptr = output->data<float>();
+  for (int c = 0; c < output_matrix_shape[0]; c++) {
+    int start = c * output_matrix_shape[1];
+    for (int j = 0; j < output_matrix_shape[1]; j++) {
+      output_ptr[start + j] =
+          output_ptr[start + j] * new_scale_ptr[c] + new_bias_ptr[c];
+      output_ptr[start + j] =
+          output_ptr[start + j] < 0 ? 0 : output_ptr[start + j];
     }
   }
 }
+template <typename P>
+void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
+  // The s1 kernel adds Bias (if_bias = 1), so it must be the op's real bias.
+  Tensor Bias = *param.Bias();
+  const bool depthwise3x3 =
+      param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3;
+  if (depthwise3x3 && param.Strides()[0] == 1) {
+    math::DepthwiseConvAddBNRelu3x3s1p1(
+        param.Input(), param.Filter(), param.Output(), &Bias, 1,
+        param.NewScale(), param.NewBias(), 1, 1);
+  } else if (0 && depthwise3x3 && param.Strides()[0] == 2) {
+    // Dead branch: the constant 0 keeps this stride-2 path switched off.
+    math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
+                           param.Filter(), &Bias, param.Output(), false);
+  } else {
+    ConvAddBNReluBasic(param);
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index d08eebe5493bd9026073c3349631a42024579b95..6accf1937da5343a33d9dd739c125836f080f181 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -15,19 +15,19 @@ limitations under the License. */
 #ifdef CONV_OP
 
 #pragma once
+#include <operators/math/depthwise_conv_3x3.h>
 #include <vector>
-#include "operators/math/conv_func.h"
+
 #include "operators/op_param.h"
 
 namespace paddle_mobile {
 namespace operators {
 
-template <typename P>
-void ConvCompute(const ConvParam &param) {
+inline void ConvBasic(const ConvParam &param) {
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
   Tensor *output = param.Output();
-  output->mutable_data<float>();
+
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
@@ -109,6 +109,27 @@ void ConvCompute(const ConvParam &param) {
   }
 }
 
+// Routes depthwise 3x3 convolutions (stride 1 or 2) to dedicated kernels.
+template <typename P>
+void ConvCompute(const ConvParam &param) {
+  Tensor Bias;
+  Bias.mutable_data<float>({param.Groups()});
+  const bool depthwise3x3 =
+      param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3;
+  if (depthwise3x3 && param.Strides()[0] == 1) {
+    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
+                               &Bias, false);
+  } else if (depthwise3x3 && param.Strides()[0] == 2) {
+    math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
+                           param.Filter(), &Bias, param.Output(), false);
+  } else {
+    ConvBasic(param);
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
index e43e3664cb005bab4d3c5ec8b5b35bd6925c982d..885f2051f645546c2585caa72aa9c80f8d352e6c 100644
--- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
@@ -15,8 +15,10 @@ limitations under the License. */
 #ifdef DEPTHWISECONV_OP
 
 #pragma once
+#include <operators/math/depthwise_conv_3x3.h>
 #include <vector>
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
+
 #include "operators/op_param.h"
 
 namespace paddle_mobile {
@@ -24,89 +26,21 @@ namespace operators {
 
 template <typename P>
 void DepthwiseConvCompute(const ConvParam &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  //  DLOG << " compute end get Attrs " << strides[0];
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(0));
-    }
+  Tensor Bias;  // placeholder: both kernels are called with if_bias = false
+  Bias.mutable_data<float>({param.Groups()});
+  const bool depthwise3x3 =  // shared, so neither stride skips a check
+      param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3;
+  if (depthwise3x3 && param.Strides()[0] == 1) {
+    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
+                               &Bias, false);
+  } else if (depthwise3x3 && param.Strides()[0] == 2) {
+    math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
+                           param.Filter(), &Bias, param.Output(), false);
+  } else {
+    ConvBasic(param);
   }
 }
 
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
index f74e365c7e087551e55363566d3dbd6ba530bfea..984678e8730ea58d7dc647450dd098d265f0eb39 100644
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -275,33 +275,40 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
       float w22 = filter_data_tmp[8];
 
       output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
-                       w21 * input_data[l] + w22 * input_data[l + 1] +
-                       bias_data[j];
+                       w21 * input_data[l] + w22 * input_data[l + 1];
       output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
                            w20 * input_data[2 * l - 2] +
-                           w21 * input_data[2 * l - 1] + bias_data[j];
+                           w21 * input_data[2 * l - 1];
       output_data[(l - 1) * l] =
           w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
-          w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1] +
-          bias_data[j];
+          w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
       output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
                                w01 * input_data[(l - 2) * (l + 1) + 1] +
                                w10 * input_data[l * l - 2] +
-                               w11 * input_data[l * l - 1] + bias_data[j];
+                               w11 * input_data[l * l - 1];
+      if (if_bias) {
+        output_data[0] += bias_data[j];
+        output_data[l - 1] += bias_data[j];
+        output_data[(l - 1) * l] += bias_data[j];
+        output_data[l * l - 1] += bias_data[j];
+      }
 
       for (int i = 1; i < l - 1; ++i) {
         output_data[i * l] =
             w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
             w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
-            w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] +
-            bias_data[j];
+            w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1];
+
         output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
                                      w01 * input_data[i * l + l - 1 - l] +
                                      w10 * input_data[i * l + l - 1 - 1] +
                                      w11 * input_data[i * l + l - 1] +
                                      w20 * input_data[i * l + l - 1 + l - 1] +
-                                     w21 * input_data[i * l + l - 1 + l] +
-                                     bias_data[j];
+                                     w21 * input_data[i * l + l - 1 + l];
+        if (if_bias) {
+          output_data[i * l] += bias_data[j];
+          output_data[i * l + l - 1] += bias_data[j];
+        }
       }
 
       // top 1 row and bottom 1 row
@@ -502,12 +509,14 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
     }
   }
 }
-void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, Tensor filter,
+
+void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, Tensor *bias, bool if_bias,
-                                   Tensor *new_scale, Tensor *new_bias,
-                                   bool if_bn, bool if_relu) {
+                                   const Tensor *new_scale,
+                                   const Tensor *new_bias, bool if_bn,
+                                   bool if_relu) {
   const float *input_data = input->data<float>();
-  const float *filter_data = filter.data<float>();
+  const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
   const float *bias_data = bias->data<float>();
   const float *newscale_data = new_scale->data<float>();
@@ -547,29 +556,35 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, Tensor filter,
       float w21 = filter_data_tmp[7];
       float w22 = filter_data_tmp[8];
 
-      output_data[0] =
-          (w11 * input_data[0] + w12 * input_data[1] + w21 * input_data[l] +
-           w22 * input_data[l + 1] + bias_data[j]) *
-              newscale_data[j] +
-          newbias_data[j];
-      output_data[l - 1] = (w10 * input_data[l - 2] + w11 * input_data[l - 1] +
-                            w20 * input_data[2 * l - 2] +
-                            w21 * input_data[2 * l - 1] + bias_data[j]) *
-                               newscale_data[j] +
-                           newbias_data[j];
+      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
+                       w21 * input_data[l] + w22 * input_data[l + 1];
+
+      output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
+                           w20 * input_data[2 * l - 2] +
+                           w21 * input_data[2 * l - 1];
 
       output_data[(l - 1) * l] =
-          (w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
-           w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1] +
-           bias_data[j]) *
-              newscale_data[j] +
-          newbias_data[j];
-      output_data[l * l - 1] = (w00 * input_data[(l - 2) * (l + 1)] +
-                                w01 * input_data[(l - 2) * (l + 1) + 1] +
-                                w10 * input_data[l * l - 2] +
-                                w11 * input_data[l * l - 1] + bias_data[j]) *
-                                   newscale_data[j] +
-                               newbias_data[j];
+          w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
+          w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
+      output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
+                               w01 * input_data[(l - 2) * (l + 1) + 1] +
+                               w10 * input_data[l * l - 2] +
+                               w11 * input_data[l * l - 1];
+      if (if_bias) {
+        output_data[0] += bias_data[j];
+        output_data[l - 1] += bias_data[j];
+        output_data[(l - 1) * l] += bias_data[j];
+        output_data[l * l - 1] += bias_data[j];
+      }
+      if (if_bn) {
+        output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j];
+        output_data[l - 1] =
+            output_data[l - 1] * newscale_data[j] + newbias_data[j];
+        output_data[(l - 1) * l] =
+            output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j];
+        output_data[l * l - 1] =
+            output_data[l * l - 1] * newscale_data[j] + newbias_data[j];
+      }
       if (if_relu) {
         output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
         output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1];
@@ -580,21 +595,25 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, Tensor filter,
       }
       for (int i = 1; i < l - 1; ++i) {
         output_data[i * l] =
-            (w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
-             w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
-             w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] +
-             bias_data[j]) *
-                newscale_data[j] +
-            newbias_data[j];
-        output_data[i * l + l - 1] =
-            (w00 * input_data[i * l + l - 1 - l - 1] +
-             w01 * input_data[i * l + l - 1 - l] +
-             w10 * input_data[i * l + l - 1 - 1] +
-             w11 * input_data[i * l + l - 1] +
-             w20 * input_data[i * l + l - 1 + l - 1] +
-             w21 * input_data[i * l + l - 1 + l] + bias_data[j]) *
-                newscale_data[j] +
-            newbias_data[j];
+            w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
+            w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
+            w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1];
+        output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
+                                     w01 * input_data[i * l + l - 1 - l] +
+                                     w10 * input_data[i * l + l - 1 - 1] +
+                                     w11 * input_data[i * l + l - 1] +
+                                     w20 * input_data[i * l + l - 1 + l - 1] +
+                                     w21 * input_data[i * l + l - 1 + l];
+        if (if_bias) {
+          output_data[i * l] += bias_data[j];
+          output_data[i * l + l - 1] += bias_data[j];
+        }
+        if (if_bn) {
+          output_data[i * l] =
+              output_data[i * l] * newscale_data[j] + newbias_data[j];
+          output_data[i * l + l - 1] =
+              output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j];
+        }
         if (if_relu) {
           output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l];
           output_data[i * l + l - 1] =
diff --git a/src/operators/math/depthwise_conv_3x3.h b/src/operators/math/depthwise_conv_3x3.h
index 44299295eebad6a90fd994cf74589c09a3573aee..a0beb479926902a71b7e06128aa8cecdd5443196 100644
--- a/src/operators/math/depthwise_conv_3x3.h
+++ b/src/operators/math/depthwise_conv_3x3.h
@@ -32,10 +32,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
                       Tensor *output, bool if_bias);
 void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
                           Tensor *output, Tensor *bias, bool if_bias);
-void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, Tensor filter,
+void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, Tensor *bias, bool if_bias,
-                                   Tensor *new_scale, Tensor *new_bias,
-                                   bool if_bn, bool if_relu);
+                                   const Tensor *new_scale,
+                                   const Tensor *new_bias, bool if_bn,
+                                   bool if_relu);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile