diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h
index 98af2ca6053fe544b49df4510b74ad0ac505b009..e72c55f5f736b81362f461952a706127998f9ade 100644
--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -125,10 +125,6 @@ LOAD_OP1(prior_box, CPU);
 LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_add_relu);
 #endif
-#ifdef FUSION_CONVADDADDPRELU_OP
-LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA);
-LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu);
-#endif
 #ifdef FUSION_CONVADD_OP
 LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
 LOAD_FUSION_MATCHER(fusion_conv_add);
@@ -178,10 +174,6 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn);
 #ifdef DROPOUT_OP
 LOAD_OP2(dropout, CPU, FPGA);
 #endif
-#ifdef FUSION_CONVADDPRELU_OP
-LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA);
-LOAD_FUSION_MATCHER(fusion_conv_add_prelu);
-#endif
 #ifdef FUSION_DWCONVBNRELU_OP
 LOAD_OP1(fusion_dwconv_bn_relu, CPU);
 LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);
diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp
index 2c70f42f56530c2d21252d6b51c228e7c49ca8bf..ad778b1fef7fe400e1df645703cf3ebfb1b22727 100644
--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -39,9 +39,9 @@ void ConvOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
 
   framework::DDim ddim = framework::make_ddim(output_shape);
diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp
index 2e7f193c5c9f66668411bb115da9d3cd980f8a6b..0e74654e1f661d55a263f9f9b57a1ba2a32dfd74 100644
--- a/src/operators/depthwise_conv_op.cpp
+++ b/src/operators/depthwise_conv_op.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
 #include "operators/conv_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -40,9 +40,9 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
 
   framework::DDim ddim = framework::make_ddim(output_shape);
diff --git a/src/operators/fusion_conv_add_add_prelu_op.cpp b/src/operators/fusion_conv_add_add_prelu_op.cpp
deleted file mode 100644
index 2f3d29dc74ed3a852b5c41a64d46b8710ebec599..0000000000000000000000000000000000000000
--- a/src/operators/fusion_conv_add_add_prelu_op.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDADDPRELU_OP
-
-#include "operators/fusion_conv_add_add_prelu_op.h"
-#include "operators/math/conv_func.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename Dtype, typename T>
-void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const {
-  auto in_dims = this->param_.Input()->dims();
-  auto filter_dims = this->param_.Filter()->dims();
-  const std::vector<int> &strides = this->param_.Strides();
-  std::vector<int> paddings = this->param_.Paddings();
-  int groups = this->param_.Groups();
-  std::vector<int> dilations = this->param_.Dilations();
-
-  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
-                         dilations.size() == paddings.size() &&
-                         paddings.size() == strides.size()),
-                        "ConvParam is not suitable");
-
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
-  }
-  framework::DDim ddim = framework::make_ddim(output_shape);
-  this->param_.Output()->Resize(ddim);
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-namespace ops = paddle_mobile::operators;
-REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu,
-                        ops::FusionConvAddAddPReluOpMatcher);
-
-#ifdef PADDLE_MOBILE_CPU
-REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
-#endif
-#ifdef PADDLE_MOBILE_FPGA
-REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
-#endif
-
-#endif  // FUSION_CONVADDADDPRELU_OP
diff --git a/src/operators/fusion_conv_add_add_prelu_op.h b/src/operators/fusion_conv_add_add_prelu_op.h
deleted file mode 100644
index 8ecb4aa715e34a2a9e67abf097ab5622a6dacf19..0000000000000000000000000000000000000000
--- a/src/operators/fusion_conv_add_add_prelu_op.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDADDPRELU_OP
-
-#pragma once
-
-#include <string>
-#include <utility>
-#include <vector>
-#include "framework/operator.h"
-#include "framework/program/program-optimize/fusion_op_register.h"
-#include "operators/kernel/conv_add_add_prelu_kernel.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher {
- public:
-  FusionConvAddAddPReluOpMatcher() {
-    node_ = framework::Node(G_OP_TYPE_CONV);
-    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
-  }
-
-  void FolderNodes(
-      framework::Node *node,
-      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    node->Folder(node_.Depth(), Type(),
-                 {{G_OP_TYPE_ELEMENTWISE_ADD,
-                   {{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}},
-                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
-
-                 removed_nodes);
-  }
-  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; }
-
-  std::vector<std::pair<int, std::string>> NeedCheck() {
-    DLOG << " conv add add prelu check add X ";
-    return {{2, "Y"}, {2, "X"}};
-  }
-};
-
-template <typename DeviceType, typename T>
-class FusionConvAddAddPReluOp
-    : public framework::OperatorWithKernel<
-          DeviceType, FusionConvAddAddPReluParam<DeviceType>,
-          operators::ConvAddAddPReluKernel<DeviceType, T>> {
- public:
-  FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs,
-                          const VariableNameMap &outputs,
-                          const framework::AttributeMap &attrs,
-                          framework::Scope *scope)
-      : framework::OperatorWithKernel<
-            DeviceType, FusionConvAddAddPReluParam<DeviceType>,
-            operators::ConvAddAddPReluKernel<DeviceType, T>>(
-            type, inputs, outputs, attrs, scope) {}
-  void InferShape() const override;
-
- protected:
-};
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/fusion_conv_add_bn_op.cpp b/src/operators/fusion_conv_add_bn_op.cpp
index e8daba7e9ba209cf078323ea79dd6f6a9b6e8200..27e3c04d62c29abe69adef7457bc633d294e2cdc 100644
--- a/src/operators/fusion_conv_add_bn_op.cpp
+++ b/src/operators/fusion_conv_add_bn_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBN_OP
 
 #include "operators/fusion_conv_add_bn_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
 
   framework::DDim ddim = framework::make_ddim(output_shape);
diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp
index b9bc948fe0e77741a36f959e29eb2a4c82e82b72..8f162a2d29de32340b8f7f3fe3094a230212929d 100644
--- a/src/operators/fusion_conv_add_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_add_bn_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 
 #include "operators/fusion_conv_add_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
 
   framework::DDim ddim = framework::make_ddim(output_shape);
diff --git a/src/operators/fusion_conv_add_op.cpp b/src/operators/fusion_conv_add_op.cpp
index 731bb631bb98490d580e0c6fe28c24312f6ccb57..49cf29c38e40f5a55fa0546e988d2860a6842f6b 100644
--- a/src/operators/fusion_conv_add_op.cpp
+++ b/src/operators/fusion_conv_add_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP
 
 #include "operators/fusion_conv_add_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
 
   framework::DDim ddim = framework::make_ddim(output_shape);
diff --git a/src/operators/fusion_conv_add_prelu_op.cpp b/src/operators/fusion_conv_add_prelu_op.cpp
deleted file mode 100644
index 9273af388c2c0a8644b29e1f40a5238b0e092523..0000000000000000000000000000000000000000
--- a/src/operators/fusion_conv_add_prelu_op.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDPRELU_OP
-
-#include "operators/fusion_conv_add_prelu_op.h"
-#include "operators/math/conv_func.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename Dtype, typename T>
-void FusionConvAddPReluOp<Dtype, T>::InferShape() const {
-  auto in_dims = this->param_.Input()->dims();
-  auto filter_dims = this->param_.Filter()->dims();
-  const std::vector<int> &strides = this->param_.Strides();
-  std::vector<int> paddings = this->param_.Paddings();
-  int groups = this->param_.Groups();
-  std::vector<int> dilations = this->param_.Dilations();
-
-  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
-                         dilations.size() == paddings.size() &&
-                         paddings.size() == strides.size()),
-                        "ConvParam is not suitable");
-
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
-  }
-  framework::DDim ddim = framework::make_ddim(output_shape);
-  this->param_.Output()->Resize(ddim);
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-namespace ops = paddle_mobile::operators;
-REGISTER_FUSION_MATCHER(fusion_conv_add_prelu,
-                        ops::FusionConvAddPReluOpMatcher);
-
-#ifdef PADDLE_MOBILE_CPU
-REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
-#endif
-#ifdef PADDLE_MOBILE_FPGA
-REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
-#endif
-
-#endif
diff --git a/src/operators/fusion_conv_add_prelu_op.h b/src/operators/fusion_conv_add_prelu_op.h
deleted file mode 100644
index 25aab72634da52c1a344d39b3f041531065efc5b..0000000000000000000000000000000000000000
--- a/src/operators/fusion_conv_add_prelu_op.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDPRELU_OP
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "framework/operator.h"
-#include "framework/program/program-optimize/fusion_op_register.h"
-#include "operators/kernel/conv_add_prelu_kernel.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher {
- public:
-  FusionConvAddPReluOpMatcher() {
-    node_ = framework::Node(G_OP_TYPE_CONV);
-    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
-  }
-
-  void FolderNodes(
-      framework::Node *node,
-      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    node->Folder(node_.Depth(), Type(),
-                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
-                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
-                 removed_nodes);
-  }
-  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; }
-};
-
-template <typename DeviceType, typename T>
-class FusionConvAddPReluOp
-    : public framework::OperatorWithKernel<
-          DeviceType, FusionConvAddPReluParam<DeviceType>,
-          operators::ConvAddPReluKernel<DeviceType, T>> {
- public:
-  FusionConvAddPReluOp(const string &type, const VariableNameMap &inputs,
-                       const VariableNameMap &outputs,
-                       const framework::AttributeMap &attrs,
-                       framework::Scope *scope)
-      : framework::OperatorWithKernel<
-            DeviceType, FusionConvAddPReluParam<DeviceType>,
-            operators::ConvAddPReluKernel<DeviceType, T>>(type, inputs, outputs,
-                                                          attrs, scope) {}
-
-  void InferShape() const override;
-
- protected:
-};
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp
index bb4b6666a881de0989d43840806b9d5d720b3b66..163dfba3cc8706dac96697974ef7224b3f625ae1 100644
--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDRELU_OP
 
 #include "operators/fusion_conv_add_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
   this->param_.Output()->Resize(ddim);
diff --git a/src/operators/fusion_conv_bn_add_relu_op.cpp b/src/operators/fusion_conv_bn_add_relu_op.cpp
index 9a3926353319aa267814097d93a6d9b1fa20bd2d..c2bb2c744d5599558f14e2f1d169b00a1492e135 100644
--- a/src/operators/fusion_conv_bn_add_relu_op.cpp
+++ b/src/operators/fusion_conv_bn_add_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNADDRELU_OP
 
 #include "operators/fusion_conv_bn_add_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
 
   framework::DDim ddim = framework::make_ddim(output_shape);
diff --git a/src/operators/fusion_conv_bn_op.cpp b/src/operators/fusion_conv_bn_op.cpp
index 7786cd713b5f838e22aa3080697d551609d81036..4939123a77a072ea410bfa96547b8a0ed276c28d 100644
--- a/src/operators/fusion_conv_bn_op.cpp
+++ b/src/operators/fusion_conv_bn_op.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBN_OP
 
 #include "operators/fusion_conv_bn_op.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -35,9 +36,9 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
 
   framework::DDim ddim = framework::make_ddim(output_shape);
diff --git a/src/operators/fusion_conv_bn_relu_op.cpp b/src/operators/fusion_conv_bn_relu_op.cpp
index 54c9f85cbb7dc00bd0df5747caac8fd2ee9e2782..0e8eec65f2e46e1314c11b7f6bceade861445ef6 100644
--- a/src/operators/fusion_conv_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_bn_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNRELU_OP
 
 #include "operators/fusion_conv_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
 
   framework::DDim ddim = framework::make_ddim(output_shape);
diff --git a/src/operators/fusion_dwconv_bn_relu_op.cpp b/src/operators/fusion_dwconv_bn_relu_op.cpp
index f5040987e42f9c0b3068d730a9926b9fcff8c8c3..d4c04f67fc637266cf95af2e7fe518682e212d98 100644
--- a/src/operators/fusion_dwconv_bn_relu_op.cpp
+++ b/src/operators/fusion_dwconv_bn_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_DWCONVBNRELU_OP
 
 #include "operators/fusion_dwconv_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
   }
 
   framework::DDim ddim = framework::make_ddim(output_shape);
diff --git a/src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp
deleted file mode 100644
index 2f6f5f3ac719b3fd32aac54ce36eb534f7d99dd7..0000000000000000000000000000000000000000
--- a/src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDADDPRELU_OP
-
-#include "operators/kernel/conv_add_add_prelu_kernel.h"
-#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool ConvAddAddPReluKernel<CPU, float>::Init(
-    FusionConvAddAddPReluParam<CPU> *param) {
-  return true;
-}
-
-template <>
-void ConvAddAddPReluKernel<CPU, float>::Compute(
-    const FusionConvAddAddPReluParam<CPU> &param) {
-  ConvAddAddPReluCompute<float>(param);
-}
-template class ConvAddAddPReluKernel<CPU, float>;
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
index ae67147ffdc4418b263458be16ecb9d33a89d8a0..f9489330ca33a7b055ff91e9c8e259d1feb1e827 100644
--- a/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -62,34 +63,24 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
     const FusionConvAddBNReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionConvAddBNReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 
 template class ConvAddBNReluKernel<CPU, float>;
diff --git a/src/operators/kernel/arm/convolution/conv_add_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_kernel.cpp
index 76c2200df3b6607cad51d2430e7b5a2cda41c6cd..5a44b083a37b19637c053655e23196385d432971 100644
--- a/src/operators/kernel/arm/convolution/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_add_kernel.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include "operators/kernel/conv_add_kernel.h"
 #include "operators/kernel/arm/convolution/conv_common.h"
-#include "operators/kernel/central-arm-func/conv_add_arm_func.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -32,34 +32,25 @@ template <>
 void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
+      // S1 falls through: both 3x3 depthwise strides use the same entry point.
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvAddBasic(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  // Bias add shared by every execution mode handled in the switch above.
+  math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(), param.Output());
 }
 
 template class ConvAddKernel<CPU, float>;
diff --git a/src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp
deleted file mode 100644
index f04a9a7d746f2d970196945707bd05409c5fa340..0000000000000000000000000000000000000000
--- a/src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDPRELU_OP
-
-#include "operators/kernel/conv_add_prelu_kernel.h"
-#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool ConvAddPReluKernel<CPU, float>::Init(FusionConvAddPReluParam<CPU> *param) {
-  return true;
-}
-
-template <>
-void ConvAddPReluKernel<CPU, float>::Compute(
-    const FusionConvAddPReluParam<CPU> &param) {
-  ConvAddPReluCompute<float>(param);
-}
-template class ConvAddPReluKernel<CPU, float>;
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
index e0387f6444bceb3cee8d9be34530fe2d81d56af5..a9efae96e94afa24b48ed46214ff1fdd8ec50d83 100644
--- a/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "operators/kernel/conv_add_relu_kernel.h"
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -32,30 +33,23 @@ void ConvAddReluKernel<CPU, float>::Compute(
     const FusionConvAddReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvAddReluBasic<FusionConvAddReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
 }
 
 template class ConvAddReluKernel<CPU, float>;
diff --git a/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
index f591833887960adc37fe741d8a36946474ffcf8e..26e0e343675d8e90e23d38084537bac99793142b 100644
--- a/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -62,34 +63,24 @@ void ConvBNAddReluKernel<CPU, float>::Compute(
     const FusionConvBNAddReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionConvBNAddReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 template class ConvBNAddReluKernel<CPU, float>;
 
diff --git a/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
index 352df8a38996900dc53f0483c2ff28133444b066..15129d72895a89a4cba918d7a8da747a17962f58 100644
--- a/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -61,34 +62,24 @@ void ConvBNReluKernel<CPU, float>::Compute(
     const FusionConvBNReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionConvBNReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 template class ConvBNReluKernel<CPU, float>;
 
diff --git a/src/operators/kernel/arm/convolution/conv_kernel.cpp b/src/operators/kernel/arm/convolution/conv_kernel.cpp
index 6771b88d4b981881232c73e2821f97044a008148..1c6ac2015daeab4f7ffe8a3e178222a6f4c4c4e8 100644
--- a/src/operators/kernel/arm/convolution/conv_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_kernel.cpp
@@ -32,10 +32,10 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
 template <>
 void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
   switch (param.ExecMode()) {
+#ifndef __aarch64__
     case ConvParam<CPU>::EXEC_GEMM_INT8:
       GemmConv<int8_t, int32_t>(param);
       break;
-#ifndef __aarch64__
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3_INT8:
       DepthwiseConv3x3<int8_t, int32_t>(param);
       break;
@@ -44,12 +44,8 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
       break;
 #endif  // __aarch64__
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
diff --git a/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
index 9b5f87b1a776b741be17cbca8cae3b15259a7253..748845e23e9290f9e40f7d63137be3b811bafebd 100644
--- a/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -61,37 +62,28 @@ void DWConvBNReluKernel<CPU, float>::Compute(
     const FusionDWConvBNReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
 #ifndef __aarch64__
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
 #endif  // __aarch64__
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionDWConvBNReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
+
 template class DWConvBNReluKernel<CPU, float>;
 
 }  // namespace operators
diff --git a/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
deleted file mode 100644
index 4c9ca6e3e8ef995e9cce6f565aafece17ac51b10..0000000000000000000000000000000000000000
--- a/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDADDPRELU_OP
-#pragma once
-
-#include <string>
-#include <vector>
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename P>
-void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  Tensor bias1 = *param.Bias1();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  float *biase_data = bias.data<float>();
-
-  int axis = param.Axis();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  Tensor aa = *param.InputAlpha();
-  float *p = aa.data<float>();
-
-  std::string mode = param.Mode();
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    Tensor bias1_batch = bias1.Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step);
-      float *biase_data1 = bias1_slice.data<float>();
-      math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
-                            p, mode, biase_data, biase_data1);
-    }
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif  // FUSION_CONVADDADDPRELU_OP
diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
deleted file mode 100644
index 24b35229b31151348475a293611cb4402999b3b7..0000000000000000000000000000000000000000
--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADD_OP
-#pragma once
-
-#include <vector>
-#include "operators/math/conv_func.h"
-#include "operators/math/depthwise_conv3x3.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  float *biase_data = bias.data<float>();
-
-  int axis = param.Axis();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::MatMul<float, float>(filter_slice, false, col_matrix, false,
-                                 static_cast<float>(1), &out_slice,
-                                 static_cast<float>(1), false, biase_data);
-    }
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
deleted file mode 100644
index d11a8442acdd275c95aaa96b2c3e1855e44746e9..0000000000000000000000000000000000000000
--- a/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDPRELU_OP
-#pragma once
-
-#include <string>
-#include <vector>
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename P>
-void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  float *biase_data = bias.data<float>();
-
-  int axis = param.Axis();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  Tensor aa = *param.InputAlpha();
-  float *p = aa.data<float>();
-  std::string mode = param.Mode();
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
-                            p, mode, biase_data, nullptr);
-    }
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif  // FUSION_CONVADDPRELU_OP
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.cpp b/src/operators/kernel/central-arm-func/conv_arm_func.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..495963d470457513dd70489bb04f4de46ffdedcc
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.cpp
@@ -0,0 +1,242 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include <vector>
+#include "operators/math/depthwise_conv3x3.h"
+#include "operators/math/depthwise_conv5x5.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/pad.h"
+#include "operators/math/vol2col.h"
+#include "operators/math/winograd/winograd_transform.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
+                   int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  return output_size;
+}
+
+bool IsExpand(const std::vector<int64_t> &filter_dim,
+              const std::vector<int> &strides, const std::vector<int> &paddings,
+              const std::vector<int> &dilations) {
+  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
+  for (size_t j = 0; j < strides.size(); ++j) {
+    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
+    strides_1 = strides_1 && (strides[j] == 1);
+    padding_0 = padding_0 && (paddings[j] == 0);
+    dilation_1 = dilation_1 && (dilations[j] == 1);
+  }
+
+  return !(filter_1 && strides_1 && padding_0 && dilation_1);
+}
+
+template <typename Itype, typename Otype>
+void GemmConv(const ConvParam<CPU> &param) {
+  const Tensor *input = param.Input();
+  Tensor filter = *param.Filter();
+  Tensor *output = param.Output();
+  output->mutable_data<Otype>();
+
+  int groups = param.Groups();
+  const std::vector<int> strides = param.Strides();
+  const std::vector<int> paddings = param.Paddings();
+  const std::vector<int> dilations = param.Dilations();
+
+  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = input->dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+  framework::DDim col_matrix_shape =
+      framework::flatten_to_2d(col_shape, data_dim + 1);
+
+  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+  Tensor col;
+  Tensor col_matrix;
+  if (is_expand) {
+    col.mutable_data<Itype>(col_shape);
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+
+  framework::DDim input_shape = framework::slice_ddim(
+      input->dims(), 1, static_cast<int>(input->dims().size()));
+
+  framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                         filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+  framework::DDim output_matrix_shape = {
+      output->dims()[1],
+      output->numel() / (output->dims()[0] * output->dims()[1])};
+
+  // convolution operator: im2col(or vol2col) + gemm
+  int in_step = static_cast<int>(input->dims()[1]) / groups;
+  int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+  math::Vol2ColFunctor<CPU, Itype> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
+
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+    for (int g = 0; g < groups; g++) {
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+      if (!is_expand) {
+        // col_matrix.ShareDataWith(in_slice);
+        col_matrix = in_slice;
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        // im2col
+        im2col(in_slice, dilations, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &col);
+      } else if (data_dim == 3U) {
+        // vol2col
+        vol2col(in_slice, dilations, strides, paddings, &col);
+      }
+
+      // gemm
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+
+      math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
+                                 static_cast<float>(1), &out_slice,
+                                 static_cast<float>(0), false,
+                                 static_cast<Otype *>(nullptr));
+    }
+  }
+}
+
+// Winograd convolution (instantiated below with tile = 8, kernel = 3).
+// Reads the pre-transformed weights from param.transformed_filter_ and
+// allocates param.Output() as float. Each image of the batch is padded if
+// requested and then run through math::winograd_transform_input and
+// math::winograd_transform_output.
+template <int tile, int kernel>
+void WinogradConv3x3(const ConvParam<CPU> &param) {
+  const Tensor *input = param.Input();
+  const Tensor *filter = param.transformed_filter_;
+  Tensor *output = param.Output();
+  output->mutable_data<float>();
+  const int batch_size = input->dims()[0];
+
+  // The same padding is used on both sides of a spatial dimension:
+  // paddings[0] for dim 2 and paddings[1] for dim 3.
+  const std::vector<int> &paddings = param.Paddings();
+  const int pad_h = paddings[0];
+  const int pad_w = paddings[1];
+  const bool need_pad = pad_h || pad_w;
+
+  math::PadFunctor<CPU, float> pad;
+  Tensor input_pad;
+  framework::Tensor transformed_input;
+  for (int i = 0; i < batch_size; ++i) {
+    Tensor in_batch = input->Slice(i, i + 1);
+    if (need_pad) {
+      // Enlarge dims 2 and 3 by the padding applied on both sides.
+      framework::DDim pad_shape = in_batch.dims();
+      pad_shape[2] += 2 * pad_h;
+      pad_shape[3] += 2 * pad_w;
+      input_pad.mutable_data<float>(pad_shape);
+      pad(in_batch, pad_h, pad_h, pad_w, pad_w, &input_pad);
+    } else {
+      // No padding requested: feed the image slice through unchanged.
+      input_pad = in_batch;
+    }
+    // tile input and transform
+    math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
+    // calculate output
+    // NOTE(review): `output` (all images) rather than a per-image slice is
+    // passed on every iteration; confirm this is intended for batch_size > 1.
+    math::winograd_transform_output<tile, kernel>(transformed_input, *filter,
+                                                  output);
+  }
+}
+
+// Depthwise 3x3 conv: stride 1/2 use per-image kernels, others use GemmConv.
+template <typename Itype, typename Otype>
+void DepthwiseConv3x3(const ConvParam<CPU> &param) {
+  const std::vector<int> &strides = param.Strides();
+  // GemmConv already walks the whole batch: run it once, not once per image.
+  if (strides[0] != 1 && strides[0] != 2) return GemmConv<Itype, Otype>(param);
+  const Tensor *input = param.Input();
+  const Tensor *filter = param.Filter();
+  const std::vector<int> &paddings = param.Paddings();
+  const int batch_size = input->dims()[0];
+  Tensor *output = param.Output();
+  output->mutable_data<Otype>();
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1);
+    Tensor out_batch = output->Slice(i, i + 1);
+    if (strides[0] == 1) {
+      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
+                                             &out_batch);
+    } else {
+      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
+                                             &out_batch);
+    }
+  }
+}
+
+// Depthwise 5x5 convolution. The dedicated stride-1 kernel is disabled (see
+// the commented-out code below), so every call currently goes through the
+// generic GemmConv path.
+template <typename Itype, typename Otype>
+void DepthwiseConv5x5(const ConvParam<CPU> &param) {
+  // Request the output buffer as Otype before delegating.
+  param.Output()->mutable_data<Otype>();
+
+  // Disabled specialised path, kept for reference (it would need `input`,
+  // `filter`, `output`, `paddings`, `strides`, `batch_size` read from `param`):
+  //  if (strides[0] == 1) {
+  //    for (int i = 0; i < batch_size; i++) {
+  //      Tensor in_batch = input->Slice(i, i + 1);
+  //      Tensor out_batch = output->Slice(i, i + 1);
+  //      math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
+  //                                             &out_batch);
+  //    }
+  //  } else {
+  GemmConv<Itype, Otype>(param);
+  //  }
+}
+
+template void GemmConv<float, float>(const ConvParam<CPU> &param);
+template void WinogradConv3x3<8, 3>(const ConvParam<CPU> &param);
+template void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param);
+template void DepthwiseConv5x5<float, float>(const ConvParam<CPU> &param);
+
+#ifndef __aarch64__  // int8 -> int32 variants are not instantiated on aarch64
+template void GemmConv<int8_t, int32_t>(const ConvParam<CPU> &param);
+template void DepthwiseConv3x3<int8_t, int32_t>(const ConvParam<CPU> &param);
+template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param);
+#endif  // __aarch64__
+
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index b527c0bad9ad295d76d11be683a34492d4b0d5d9..52bcbbb7c6f76e7e68da4c8a10271bb1bac35adf 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -15,386 +15,31 @@ limitations under the License. */
 #ifdef CONV_OP
 
 #pragma once
+
 #include <vector>
-#include "operators/math/conv_func.h"
-#include "operators/math/depthwise_conv3x3.h"
-#include "operators/math/depthwise_conv5x5.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/pad.h"
-#include "operators/math/vol2col.h"
-#include "operators/math/winograd/winograd_transform.h"
 #include "operators/op_param.h"
 
 namespace paddle_mobile {
 namespace operators {
 
-template <typename Itype, typename Otype>
-inline void GemmConv(const ConvParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor *output = param.Output();
-  output->mutable_data<Otype>();
-
-  int groups = param.Groups();
-  const std::vector<int> strides = param.Strides();
-  const std::vector<int> paddings = param.Paddings();
-  const std::vector<int> dilations = param.Dilations();
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<Itype>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, Itype> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        // col_matrix.ShareDataWith(in_slice);
-        col_matrix = in_slice;
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
+int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
+                   int stride);
 
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+bool IsExpand(const std::vector<int64_t> &filter_dim,
+              const std::vector<int> &strides, const std::vector<int> &paddings,
+              const std::vector<int> &dilations);
 
-      math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
-                                 static_cast<float>(1), &out_slice,
-                                 static_cast<float>(0), false,
-                                 static_cast<Otype *>(nullptr));
-    }
-  }
-}
+template <typename Itype, typename Otype>
+void GemmConv(const ConvParam<CPU> &param);
 
 template <int tile, int kernel>
-inline void WinogradConv3x3(const ConvParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  const Tensor *filter = param.transformed_filter_;
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  int batch_size = input->dims()[0];
-  int groups = param.Groups();
-  const std::vector<int> &paddings = param.Paddings();
-
-  auto winograd_pad = [&](int width, int pad) {
-    int output_tile = tile - kernel + 1;
-    // int tiles = (width + pad - kernel) / output_tile + 1;
-    // return (tiles - 1) * output_tile + tile - width;
-    int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile;
-    return pad_width + tile - width;
-  };
+void WinogradConv3x3(const ConvParam<CPU> &param);
 
-  math::PadFunctor<CPU, float> pad;
-  Tensor input_pad;
-  framework::Tensor transformed_input;
-  for (int i = 0; i < batch_size; ++i) {
-    Tensor in_batch = input->Slice(i, i + 1);
-    Tensor out_batch = output->Slice(i, i + 1);
-    // int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
-    // int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
-    int pad_bottom = paddings[0];
-    int pad_right = paddings[1];
-    if (paddings[0] || paddings[1] || pad_bottom || pad_right) {
-      framework::DDim pad_shape = in_batch.dims();
-      pad_shape[2] += paddings[0] + pad_bottom;
-      pad_shape[3] += paddings[1] + pad_right;
-      input_pad.mutable_data<float>(pad_shape);
-      pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right,
-          &input_pad);
-    } else {
-      input_pad = in_batch;
-    }
-    // tile input and transform
-    math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
-    // caculate output
-    math::winograd_transform_output<tile, kernel>(transformed_input, *filter,
-                                                  output);
-  }
-}
-
-#ifndef __aarch64__
-// int8 DepthwiseConv3x3
 template <typename Itype, typename Otype>
-inline void DepthwiseConv3x3(const ConvParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  const Tensor *filter = param.Filter();
-  const std::vector<int> &paddings = param.Paddings();
-  const std::vector<int> &strides = param.Strides();
-  const int batch_size = input->dims()[0];
-  Tensor *output = param.Output();
-  output->mutable_data<Otype>();
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1);
-    Tensor out_batch = output->Slice(i, i + 1);
-    if (strides[0] == 1) {
-      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
-                                             &out_batch);
-    } else if (strides[0] == 2) {
-      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
-                                             &out_batch);
-    } else {
-      GemmConv<Itype, Otype>(param);
-    }
-  }
-}
-#endif  // __aarch64__
+void DepthwiseConv3x3(const ConvParam<CPU> &param);
 
 template <typename Itype, typename Otype>
-inline void DepthwiseConv5x5(const ConvParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  const Tensor *filter = param.Filter();
-  const std::vector<int> &paddings = param.Paddings();
-  const std::vector<int> &strides = param.Strides();
-  const int batch_size = input->dims()[0];
-  Tensor *output = param.Output();
-  output->mutable_data<Otype>();
-
-  //  if (strides[0] == 1) {
-  //    for (int i = 0; i < batch_size; i++) {
-  //      Tensor in_batch = input->Slice(i, i + 1);
-  //      Tensor out_batch = output->Slice(i, i + 1);
-  //      math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
-  //                                             &out_batch);
-  //    }
-  //  } else {
-  GemmConv<Itype, Otype>(param);
-  //  }
-}
-
-template <typename ParamType>
-void ConvAddReluBasic(const ParamType &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-
-  float alpha = 1.0f;
-  float beta = 1.0f;
-  int32_t groups = param.Groups();
-  int32_t axis = param.Axis();
-  std::vector<int32_t> strides = param.Strides();
-  std::vector<int32_t> paddings = param.Paddings();
-  std::vector<int32_t> dilations = param.Dilations();
-
-  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int32_t>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
-  int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
-
-  float *bias_data = bias.data<float>();
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int32_t i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int32_t g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col_matrix = in_slice;
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
-                                    paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-
-      math::MatMul<float, float>(filter_slice, false, col_matrix, false, alpha,
-                                 &out_slice, beta, true, bias_data);
-    }
-  }
-}
-
-template <typename ParamType>
-void ConvBNReluBasic(const ParamType &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor new_bias = *param.NewBias();
-  Tensor new_scale = *param.NewScale();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col_matrix = in_slice;
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-
-      math::MatMulWithBn(filter_slice, false, col_matrix, false,
-                         static_cast<float>(1), &out_slice,
-                         static_cast<float>(0), true, &new_scale, &new_bias, g);
-    }
-  }
-}
+void DepthwiseConv5x5(const ConvParam<CPU> &param);
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
deleted file mode 100644
index 9e32d20291a7aa364eb003225de7a6d9ff45d03e..0000000000000000000000000000000000000000
--- a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVBNADDRELU_OP
-
-#pragma once
-
-#include <vector>
-#include "operators/math/depthwise_conv3x3.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor new_bias = *param.NewBias();
-  Tensor new_scale = *param.NewScale();
-  Tensor *bias1 = param.Bias();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
-      math::MatMulWithBn(filter_slice, false, col_matrix, false,
-                         static_cast<float>(1), &out_slice,
-                         static_cast<float>(1), true, &new_scale, &new_bias, g,
-                         bias_data.data<float>());
-    }
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/kernel/conv_add_add_prelu_kernel.h b/src/operators/kernel/conv_add_add_prelu_kernel.h
deleted file mode 100644
index fadaf7564ceeb7a52215dc335135016be02bc1ab..0000000000000000000000000000000000000000
--- a/src/operators/kernel/conv_add_add_prelu_kernel.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef FUSION_CONVADDADDPRELU_OP
-
-#include <vector>
-#include "framework/ddim.h"
-#include "framework/operator.h"
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-using framework::DDim;
-using framework::OpKernelBase;
-
-template <typename DeviceType, typename T>
-class ConvAddAddPReluKernel
-    : public OpKernelBase<DeviceType, FusionConvAddAddPReluParam<DeviceType>> {
- public:
-  void Compute(const FusionConvAddAddPReluParam<DeviceType> &param);
-  bool Init(FusionConvAddAddPReluParam<DeviceType> *param);
-};
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/kernel/conv_add_bn_kernel.h b/src/operators/kernel/conv_add_bn_kernel.h
index 7a921ecc7d0f4498cae80fbb9cea1b13e4c94101..757664eb536f871811964608c8ad709c416d126c 100644
--- a/src/operators/kernel/conv_add_bn_kernel.h
+++ b/src/operators/kernel/conv_add_bn_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h
index 3f088528fc901987873038c7e1dd779dcc2019e7..919c66106eda1159f14c40e768325f1f5dcf5ff6 100644
--- a/src/operators/kernel/conv_add_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_add_bn_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h
index 140d0475a8ee2f017a7c587c38429ccbb2edd387..fd3f279a7829a5803da6e08c0280435443425ad0 100644
--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -23,7 +23,6 @@ limitations under the License. */
 #include "common/common.h"
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv3x3.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
diff --git a/src/operators/kernel/conv_add_prelu_kernel.h b/src/operators/kernel/conv_add_prelu_kernel.h
deleted file mode 100644
index 631982789b09c57d0d21186d0a30df7368d2955f..0000000000000000000000000000000000000000
--- a/src/operators/kernel/conv_add_prelu_kernel.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef FUSION_CONVADDPRELU_OP
-
-#include <vector>
-#include "framework/ddim.h"
-#include "framework/operator.h"
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-using framework::DDim;
-using framework::OpKernelBase;
-
-template <typename DeviceType, typename T>
-class ConvAddPReluKernel
-    : public OpKernelBase<DeviceType, FusionConvAddPReluParam<DeviceType>> {
- public:
-  void Compute(const FusionConvAddPReluParam<DeviceType> &param);
-  bool Init(FusionConvAddPReluParam<DeviceType> *param);
-};
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/kernel/conv_add_relu_kernel.h b/src/operators/kernel/conv_add_relu_kernel.h
index e001926b361da96ec3ff76e120bc3d1ad13714fa..8cfc92ef19937650f1835e16eb26c1bf59f2d345 100644
--- a/src/operators/kernel/conv_add_relu_kernel.h
+++ b/src/operators/kernel/conv_add_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
diff --git a/src/operators/kernel/conv_bn_add_relu_kernel.h b/src/operators/kernel/conv_bn_add_relu_kernel.h
index dcd8fecf07fbb4ea75b382f5315e24e64e26e939..63a86b56538a259b783a6a99536b6c5be15d915a 100644
--- a/src/operators/kernel/conv_bn_add_relu_kernel.h
+++ b/src/operators/kernel/conv_bn_add_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
diff --git a/src/operators/kernel/conv_bn_kernel.h b/src/operators/kernel/conv_bn_kernel.h
index e669f3bdd85dbd89e3a48d417dcd0cd6b9706062..1fb0d680cf4584e2433af254cca25bc52a3b9e03 100644
--- a/src/operators/kernel/conv_bn_kernel.h
+++ b/src/operators/kernel/conv_bn_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
diff --git a/src/operators/kernel/conv_bn_relu_kernel.h b/src/operators/kernel/conv_bn_relu_kernel.h
index 91b3413116ae22a8e212cf149c4e0c2a8924664a..f63b61ab09f90c8c40738cbe94ec6ebcff9420ff 100644
--- a/src/operators/kernel/conv_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_bn_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
diff --git a/src/operators/kernel/dwconv_bn_relu_kernel.h b/src/operators/kernel/dwconv_bn_relu_kernel.h
index f2e4c0afbd0aaafff5339816764f9e30592f122c..3bd8093adb539d8fc0f6ea4b400b9ff864e1b664 100644
--- a/src/operators/kernel/dwconv_bn_relu_kernel.h
+++ b/src/operators/kernel/dwconv_bn_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
diff --git a/src/operators/kernel/lrn_kernel.h b/src/operators/kernel/lrn_kernel.h
index 99dbfe2d658cde17e6399f8ea4bc5b945092cde5..486c828acab6d24741baae5804f09bc3e850b02f 100644
--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -15,24 +15,21 @@ limitations under the License. */
 #pragma once
 
 #ifdef LRN_OP
+
+#include <cmath>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
-#include "framework/operator.h"
-#include "operators/op_param.h"
-
-#include <cmath>
-
 #ifdef __ARM_NEON
-#include "arm_neon.h"
-#include "operators/math/math_func_neon.h"
+#include <arm_neon.h>
+#include "operators/math/math.h"
 #endif
+#include "framework/operator.h"
+#include "operators/op_param.h"
 
 namespace paddle_mobile {
 namespace operators {
 
-using namespace framework;
-
 template <typename T>
 struct LRNFunctor {
   void operator()(const framework::Tensor &input, framework::Tensor *out, int N,
diff --git a/src/operators/math/activation.h b/src/operators/math/activation.h
index 90b9ab4c3a558a994370ea80693e1d31687bb44e..fb90a35516d8c461a05328f65bce24a2b8aa519f 100644
--- a/src/operators/math/activation.h
+++ b/src/operators/math/activation.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "common/types.h"
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
-#include "operators/math/math_func_neon.h"
+#include "operators/math/math.h"
 #endif
 
 namespace paddle_mobile {
diff --git a/src/operators/math/conv_func.h b/src/operators/math/channel_wise.h
similarity index 65%
rename from src/operators/math/conv_func.h
rename to src/operators/math/channel_wise.h
index 4debd2e105856905b13bee55c455e9d263d26fe9..796ea6d2b97d31d3091b225601065ee4670316e8 100644
--- a/src/operators/math/conv_func.h
+++ b/src/operators/math/channel_wise.h
@@ -14,91 +14,16 @@ limitations under the License. */
 
 #pragma once
 
-#include <vector>
+#include "framework/tensor.h"
+#include "operators/math/activation.h"
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
-#include "framework/ddim.h"
-#include "framework/tensor.h"
-#include "operators/math/activation.h"
 
 namespace paddle_mobile {
 namespace operators {
 namespace math {
 
-using framework::DDim;
-using framework::Tensor;
-
-inline int ConvOutputSize(int input_size, int filter_size, int dilation,
-                          int padding, int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
-  return output_size;
-}
-
-inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {  // NOLINT
-  const auto bias_ptr = bias.data<float>();
-  const DDim bias_ddim = bias.dims();
-  PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
-                        "the bias tensor's dims size != 1")
-  DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
-  DDim inner_ddim =
-      paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
-  int outer_size = paddle_mobile::framework::product(outer_ddim);
-  int inner_size = paddle_mobile::framework::product(inner_ddim);
-  bias.Resize(dDim);
-  auto new_ptr = bias.mutable_data<float>();
-  int axis_size = dDim[axis];
-
-#ifdef __ARM_NEON
-  for (int i = 0; i < outer_size; ++i) {
-    int inner_num = inner_size >> 4;
-    int remain = inner_size - (inner_num << 4);
-    float v_bias = bias_ptr[i * axis_size / outer_size];
-    for (; inner_num > 0; inner_num--) {
-      float32x4_t v_newptr1 = vdupq_n_f32(v_bias);
-      float32x4_t v_newptr2 = vdupq_n_f32(v_bias);
-      float32x4_t v_newptr3 = vdupq_n_f32(v_bias);
-      float32x4_t v_newptr4 = vdupq_n_f32(v_bias);
-      vst1q_f32(new_ptr, v_newptr1);
-      new_ptr += 4;
-      vst1q_f32(new_ptr, v_newptr2);
-      new_ptr += 4;
-      vst1q_f32(new_ptr, v_newptr3);
-      new_ptr += 4;
-      vst1q_f32(new_ptr, v_newptr4);
-      new_ptr += 4;
-    }
-    for (; remain > 0; remain--) {
-      *new_ptr = v_bias;
-      new_ptr++;
-    }
-  }
-#else
-  for (int i = 0; i < outer_size; ++i) {
-    float v_bias = bias_ptr[i * axis_size / outer_size];
-    for (int j = 0; j < inner_size; ++j) {
-      new_ptr[i * inner_size + j] = v_bias;
-    }
-  }
-#endif
-}
-
-inline bool IsExpand(const std::vector<int64_t> &filter_dim,
-                     const std::vector<int> &strides,
-                     const std::vector<int> &paddings,
-                     const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
-
 template <ActivationType Act>
 void AddChannelWise(const framework::Tensor *input,
                     const framework::Tensor *bias, framework::Tensor *output) {
diff --git a/src/operators/math/depthwise_conv3x3.h b/src/operators/math/depthwise_conv3x3.h
index 9b9c5c0a6d13dece6b3d8462de6a0c2630688cdf..1f145c4f94bf2061fb9db74aec84684387809854 100644
--- a/src/operators/math/depthwise_conv3x3.h
+++ b/src/operators/math/depthwise_conv3x3.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "framework/tensor.h"
-#include "operators/math/conv_func.h"
 
 namespace paddle_mobile {
 namespace operators {
diff --git a/src/operators/math/depthwise_conv3x3_int8.cpp b/src/operators/math/depthwise_conv3x3_int8.cpp
index 91e682c14590a10fc393aaefb5d37c015065fc0a..76262c76fb4e6687340985c75d08b688dded1cff 100644
--- a/src/operators/math/depthwise_conv3x3_int8.cpp
+++ b/src/operators/math/depthwise_conv3x3_int8.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#if defined(__ARM_NEON__) && !defined(__aarch64__)
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
 
 #include <arm_neon.h>
 #include "operators/math/depthwise_conv3x3.h"
@@ -70,7 +70,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
   DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start)
   // middle
   int remain_start = valid_w_start;
-#ifdef __ARM_NEON__
   int output_tiles = (valid_w_end - valid_w_start) / 6;
   remain_start = valid_w_start + output_tiles * 6;
   int32x4_t _sum0, _sum1;
@@ -94,7 +93,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
     vst1q_s32(output_ptr + output_offset, _sum0);
     vst1_s32(output_ptr + output_offset + 4, vget_low_s32(_sum1));
   }
-#endif  // __ARM_NEON__
   for (int w = remain_start; w < valid_w_end; ++w) {
     int32_t value = 0;
     int input_start = -padding_w + w * Stride_w;
@@ -215,6 +213,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
         output_ptr2 += valid_w_start;
         output_ptr3 += valid_w_start;
       }
+#if __aarch64__
+#else
       // valid
       int loop = output_w_tiles;
       asm volatile(
@@ -525,6 +525,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
           : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
           : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
             "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
       // pad right
       if (padding_w) {
         int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
@@ -618,7 +619,9 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
         output_ptr0 += valid_w_start;
         output_ptr1 += valid_w_start;
       }
-      // valid
+        // valid
+#if __aarch64__
+#else
       int loop = output_w_tiles;
       asm volatile(
           "cmp        %[loop], #0                  \n"
@@ -804,6 +807,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
           : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
           : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
             "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
       // pad right
       if (padding_w) {
         int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
@@ -869,7 +873,9 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
         }
         output_ptr0 += valid_w_start;
       }
-      // valid
+        // valid
+#if __aarch64__
+#else
       int loop = output_w_tiles;
       asm volatile(
           "cmp        %[loop], #0                  \n"
@@ -993,6 +999,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
           : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
           : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
             "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
       // pad right
       if (padding_w) {
         int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
@@ -1152,7 +1159,9 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
         output_ptr1 += valid_w_start;
         output_ptr2 += valid_w_start;
       }
-      // valid
+        // valid
+#if __aarch64__
+#else
       int loop = output_w_tiles;
       asm volatile(
           "cmp        %[loop], #0                     \n"
@@ -1411,6 +1420,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
           : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
           : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
             "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
       // pad right
       if (padding_w > 0) {
         int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0)));
@@ -1490,7 +1500,9 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
         input_ptr2 += valid_input_w_start;
         output_ptr0 += valid_w_start;
       }
-      // valid
+        // valid
+#if __aarch64__
+#else
       int loop = output_w_tiles;
       asm volatile(
           "cmp        %[loop], #0                      \n"
@@ -1608,6 +1620,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
           : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
           : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
             "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
       // pad right
       if (padding_w > 0) {
         int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0)));
@@ -1645,4 +1658,4 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
 }  // namespace operators
 }  // namespace paddle_mobile
 
-#endif
+#endif  // __ARM_NEON__ || __ARM_NEON
diff --git a/src/operators/math/depthwise_conv3x3_int8_arm64.cpp b/src/operators/math/depthwise_conv3x3_int8_arm64.cpp
deleted file mode 100644
index e2c01838442b01dee10cd8d85126429277d8c672..0000000000000000000000000000000000000000
--- a/src/operators/math/depthwise_conv3x3_int8_arm64.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if defined(__ARM_NEON__) && defined(__aarch64__)
-
-#include "operators/math/depthwise_conv3x3.h"
-#ifdef __ARM_NEON__
-#include <arm_neon.h>
-#endif
-
-namespace paddle_mobile {
-namespace operators {
-namespace math {
-
-// template<>
-// void DepthwiseConv3x3<int8_t, int32_t>(
-//     const framework::Tensor *input, const framework::Tensor *filter,
-//     const std::vector<int> &strides, framework::Tensor *output) {
-//   PADDLE_MOBILE_THROW_EXCEPTION(
-//       "Depthwise conv with generic strides has not been implemented.");
-// }
-
-template <>
-void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
-                                         const framework::Tensor &filter,
-                                         const std::vector<int> &paddings,
-                                         framework::Tensor *output) {
-  PADDLE_MOBILE_THROW_EXCEPTION(
-      "Depthwise conv3x3 with stride 1 for arm v8 has not been implemented.");
-}
-
-template <>
-void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
-                                         const framework::Tensor &filter,
-                                         const std::vector<int> &paddings,
-                                         framework::Tensor *output) {
-  PADDLE_MOBILE_THROW_EXCEPTION(
-      "Depthwise conv3x3 with stride 2 for arm v8 has not been implemented.");
-}
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/math/depthwise_conv5x5.h b/src/operators/math/depthwise_conv5x5.h
index d047bbfa1ac179e0ef0b1b6705e349890b25e800..11d96b078ac7314ef0f3de98614c1e4ebd4dbc95 100644
--- a/src/operators/math/depthwise_conv5x5.h
+++ b/src/operators/math/depthwise_conv5x5.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "framework/tensor.h"
-#include "operators/math/conv_func.h"
 
 namespace paddle_mobile {
 namespace operators {
diff --git a/src/operators/math/gemm/pack_kernel.h b/src/operators/math/gemm/pack_kernel.h
index 31342ec1b7d504869c16a6a2a9a0f1491be4a3c3..598bf3248d2cb56c3324efa6858b3d045e7a2a3c 100644
--- a/src/operators/math/gemm/pack_kernel.h
+++ b/src/operators/math/gemm/pack_kernel.h
@@ -31,345 +31,239 @@ inline float32x4_t vandq_f32_u32(float32x4_t x, uint32x4_t mask) {
 
 void pack_lhs_6r(const int m, const int k, const float *A, const int lda,
                  float *output, const bool unroll) {
-  float *zero = new float[k];
-  memset(zero, 0, k * sizeof(float));
+  uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5};
+  int remain_k = k & 0x3;
+  uint32x4_t vzero = vdupq_n_u32(0);
+  uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k));
 
-  const int m_tail = m % 6;
-  const int i_length = m - m_tail;
-  for (int i = 0; i < i_length; i += 6) {
+  #pragma omp parallel for if (unroll)
+  for (int i = 0; i < m - 5; i += 6) {
     const float *a0 = A + i * lda;
     const float *a1 = A + (i + 1) * lda;
     const float *a2 = A + (i + 2) * lda;
     const float *a3 = A + (i + 3) * lda;
     const float *a4 = A + (i + 4) * lda;
     const float *a5 = A + (i + 5) * lda;
-    float *local_buffer = output + i * k;
-    for (int j = 0; j < k; ++j) {
-      *local_buffer++ = *a0++;
-      *local_buffer++ = *a1++;
-      *local_buffer++ = *a2++;
-      *local_buffer++ = *a3++;
-      *local_buffer++ = *a4++;
-      *local_buffer++ = *a5++;
+    float *out_ptr = output + i * k;
+
+    int loops = k >> 2;
+    if (loops > 0) {
+#if __aarch64__
+      for (int l = 0; l < loops; ++l) {
+        float32x4_t _d0 = vld1q_f32(a0);
+        float32x4_t _d1 = vld1q_f32(a1);
+        float32x4_t _d2 = vld1q_f32(a2);
+        float32x4_t _d3 = vld1q_f32(a3);
+        float32x4_t _d4 = vld1q_f32(a4);
+        float32x4_t _d5 = vld1q_f32(a5);
+
+        float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
+        float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
+        float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
+        _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0]));
+        _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
+        _d2 =
+            vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
+        _d3 =
+            vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1]));
+
+        vst1q_f32(out_ptr, _d0);
+        vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
+        vst1q_f32(out_ptr + 6, _d1);
+        vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
+        vst1q_f32(out_ptr + 12, _d2);
+        vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
+        vst1q_f32(out_ptr + 18, _d3);
+        vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1]));
+
+        a0 += 4;
+        a1 += 4;
+        a2 += 4;
+        a3 += 4;
+        a4 += 4;
+        a5 += 4;
+        out_ptr += 24;
+      }
+#else
+      asm volatile(
+          "loop_4k_%=:                        \n"
+          "vld1.32    {d0-d1}, [%[a0]]!       \n"
+          "vld1.32    {d2-d3}, [%[a1]]!       \n"
+          "vld1.32    {d4-d5}, [%[a2]]!       \n"
+          "vld1.32    {d6-d7}, [%[a3]]!       \n"
+          "vld1.32    {d8-d9}, [%[a4]]!       \n"
+          "vld1.32    {d10-d11}, [%[a5]]!     \n"
+          "vtrn.32    q0, q1                  \n"
+          "vtrn.32    q2, q3                  \n"
+          "vtrn.32    q4, q5                  \n"
+          "vswp.32    d1, d4                  \n"
+          "vswp.32    d3, d6                  \n"
+
+          "vst1.32    {q0}, [%[out]]!         \n"
+          "vst1.32    {d8}, [%[out]]!         \n"
+          "vst1.32    {q1}, [%[out]]!         \n"
+          "vst1.32    {d10}, [%[out]]!        \n"
+          "vst1.32    {q2}, [%[out]]!         \n"
+          "vst1.32    {d9}, [%[out]]!         \n"
+          "vst1.32    {q3}, [%[out]]!         \n"
+          "vst1.32    {d11}, [%[out]]!        \n"
+
+          "subs       %[loops], #1            \n"
+          "bne        loop_4k_%=              \n"
+          : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2),
+            [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
+          :
+          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
+#endif
+    }
+
+    if (remain_k > 0) {
+      float32x4_t _d0 = vld1q_f32(a0);
+      float32x4_t _d1 = vld1q_f32(a1);
+      float32x4_t _d2 = vld1q_f32(a2);
+      float32x4_t _d3 = vld1q_f32(a3);
+      float32x4_t _d4 = vld1q_f32(a4);
+      float32x4_t _d5 = vld1q_f32(a5);
+
+      _d0 = vandq_f32_u32(_d0, vmask1);
+      _d1 = vandq_f32_u32(_d1, vmask1);
+      _d2 = vandq_f32_u32(_d2, vmask1);
+      _d3 = vandq_f32_u32(_d3, vmask1);
+      _d4 = vandq_f32_u32(_d4, vmask1);
+      _d5 = vandq_f32_u32(_d5, vmask1);
+
+      float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
+      float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
+      float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
+      _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0]));
+      _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
+      _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
+
+      switch (remain_k) {
+        case 3:
+          vst1q_f32(out_ptr + 12, _d2);
+          vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
+        case 2:
+          vst1q_f32(out_ptr + 6, _d1);
+          vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
+        case 1:
+          vst1q_f32(out_ptr, _d0);
+          vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
+        default:
+          break;
+      }
     }
   }
-  if (m_tail != 0) {
-    const float *a0 = A + i_length * lda;
+
+  int remain_m = m % 6;
+  if (remain_m) {
+    int remain_m_start = m - remain_m;
+    const float *a0 = A + remain_m_start * lda;
     const float *a1 = a0 + lda;
     const float *a2 = a0 + 2 * lda;
     const float *a3 = a0 + 3 * lda;
     const float *a4 = a0 + 4 * lda;
     const float *a5 = a0 + 5 * lda;
-    float *local_buffer = output + i_length * k;
-    switch (m_tail) {
-      case 1:
-        a1 = zero;
-      case 2:
-        a2 = zero;
-      case 3:
-        a3 = zero;
-      case 4:
-        a4 = zero;
-      case 5:
-        a5 = zero;
-        break;
-      default:
-        break;
+    float *out_ptr = output + remain_m_start * k;
+
+    uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m));
+    uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(remain_m));
+    const float zerobuff[4] = {0.f, 0.f, 0.f, 0.f};
+
+    int lk = 0;
+    for (; lk < k - 3; lk += 4) {
+      switch (remain_m) {
+        case 1:
+          a1 = zerobuff;
+        case 2:
+          a2 = zerobuff;
+        case 3:
+          a3 = zerobuff;
+        case 4:
+          a4 = zerobuff;
+        case 5:
+          a5 = zerobuff;
+        default:
+          break;
+      }
+#if __aarch64__
+      float32x4_t _d0 = vld1q_f32(a0);
+      float32x4_t _d1 = vld1q_f32(a1);
+      float32x4_t _d2 = vld1q_f32(a2);
+      float32x4_t _d3 = vld1q_f32(a3);
+      float32x4_t _d4 = vld1q_f32(a4);
+      float32x4_t _d5 = vld1q_f32(a5);
+
+      float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
+      float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
+      float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
+      _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0]));
+      _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
+      _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
+      _d3 = vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1]));
+
+      _d0 = vandq_f32_u32(_d0, vmask2);
+      _d1 = vandq_f32_u32(_d1, vmask2);
+      _d2 = vandq_f32_u32(_d2, vmask2);
+      _d3 = vandq_f32_u32(_d3, vmask2);
+      _d4 = vandq_f32_u32(_q3.val[0], vmask3);
+      _d5 = vandq_f32_u32(_q3.val[1], vmask3);
+
+      vst1q_f32(out_ptr, _d0);
+      vst1_f32(out_ptr + 4, vget_low_f32(_d4));
+      vst1q_f32(out_ptr + 6, _d1);
+      vst1_f32(out_ptr + 10, vget_low_f32(_d5));
+      vst1q_f32(out_ptr + 12, _d2);
+      vst1_f32(out_ptr + 16, vget_high_f32(_d4));
+      vst1q_f32(out_ptr + 18, _d3);
+      vst1_f32(out_ptr + 22, vget_high_f32(_d5));
+
+      out_ptr += 24;
+#else
+      asm volatile(
+          "vld1.32    {d0-d1}, [%[a0]]        \n"
+          "vld1.32    {d2-d3}, [%[a1]]        \n"
+          "vld1.32    {d4-d5}, [%[a2]]        \n"
+          "vld1.32    {d6-d7}, [%[a3]]        \n"
+          "vld1.32    {d8-d9}, [%[a4]]        \n"
+          "vld1.32    {d10-d11}, [%[a5]]      \n"
+          "vtrn.32    q0, q1                  \n"
+          "vtrn.32    q2, q3                  \n"
+          "vtrn.32    q4, q5                  \n"
+          "vswp.32    d1, d4                  \n"
+          "vswp.32    d3, d6                  \n"
+
+          "vbif       q0, %q[vzero], %q[vmask2] \n"
+          "vbif       q1, %q[vzero], %q[vmask2] \n"
+          "vbif       q2, %q[vzero], %q[vmask2] \n"
+          "vbif       q3, %q[vzero], %q[vmask2] \n"
+          "vbif       q4, %q[vzero], %q[vmask3] \n"
+          "vbif       q5, %q[vzero], %q[vmask3] \n"
+
+          "vst1.32    {q0}, [%[out]]!         \n"
+          "vst1.32    {d8}, [%[out]]!         \n"
+          "vst1.32    {q1}, [%[out]]!         \n"
+          "vst1.32    {d10}, [%[out]]!        \n"
+          "vst1.32    {q2}, [%[out]]!         \n"
+          "vst1.32    {d9}, [%[out]]!         \n"
+          "vst1.32    {q3}, [%[out]]!         \n"
+          "vst1.32    {d11}, [%[out]]!        \n"
+          : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2),
+            [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5)
+          : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero)
+          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
+#endif
     }
-    for (int j = 0; j < k; ++j) {
-      *local_buffer++ = *a0++;
-      *local_buffer++ = *a1++;
-      *local_buffer++ = *a2++;
-      *local_buffer++ = *a3++;
-      *local_buffer++ = *a4++;
-      *local_buffer++ = *a5++;
+    // remain k; NOTE(review): the loop above never advances a0..a5 - confirm
+    for (; lk < k; ++lk) {
+      *out_ptr++ = *a0++;
+      *out_ptr++ = *a1++;
+      *out_ptr++ = *a2++;
+      *out_ptr++ = *a3++;
+      *out_ptr++ = *a4++;
+      *out_ptr++ = *a5++;
     }
-    delete[] zero;
   }
-
-  //  uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5};
-  //  int remain_k = k & 0x3;
-  //  uint32x4_t vzero = vdupq_n_u32(0);
-  //  uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k));
-  //
-  //  std::cout << "m: " << m << ", k: " << k << std::endl;
-  //  #pragma omp parallel for if (unroll)
-  //  for (int i = 0; i < m - 5; i += 6) {
-  //    std::cout << "i: " << i << std::endl;
-  //    const float *a0 = A + i * lda;
-  //    const float *a1 = A + (i + 1) * lda;
-  //    const float *a2 = A + (i + 2) * lda;
-  //    const float *a3 = A + (i + 3) * lda;
-  //    const float *a4 = A + (i + 4) * lda;
-  //    const float *a5 = A + (i + 5) * lda;
-  //    float *out_ptr = output + i * k;
-  //
-  //    int loops = k >> 2;
-  //    if (loops > 0) {
-  // #if __aarch64__
-  //      for (int l = 0; l < loops; ++l) {
-  //        float32x4_t _d0 = vld1q_f32(a0);
-  //        float32x4_t _d1 = vld1q_f32(a1);
-  //        float32x4_t _d2 = vld1q_f32(a2);
-  //        float32x4_t _d3 = vld1q_f32(a3);
-  //        float32x4_t _d4 = vld1q_f32(a4);
-  //        float32x4_t _d5 = vld1q_f32(a5);
-  //
-  //        float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
-  //        float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
-  //        float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
-  //        _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
-  //        vget_low_f32(_q1.val[0])); _d1 =
-  //        vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
-  //        _d2 =
-  //            vcombine_f32(vget_high_f32(_q0.val[0]),
-  //            vget_high_f32(_q1.val[0]));
-  //        _d3 =
-  //            vcombine_f32(vget_high_f32(_q0.val[1]),
-  //            vget_high_f32(_q1.val[1]));
-  //
-  //        vst1q_f32(out_ptr, _d0);
-  //        vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
-  //        vst1q_f32(out_ptr + 6, _d1);
-  //        vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
-  //        vst1q_f32(out_ptr + 12, _d2);
-  //        vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
-  //        vst1q_f32(out_ptr + 18, _d3);
-  //        vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1]));
-  //
-  //        a0 += 4;
-  //        a1 += 4;
-  //        a2 += 4;
-  //        a3 += 4;
-  //        a4 += 4;
-  //        a5 += 4;
-  //        out_ptr += 24;
-  //      }
-  // #else
-  //      asm volatile(
-  //          "loop_4k_%=:                        \n"
-  //          "vld1.32    {d0-d1}, [%[a0]]!       \n"
-  //          "vld1.32    {d2-d3}, [%[a1]]!       \n"
-  //          "vld1.32    {d4-d5}, [%[a2]]!       \n"
-  //          "vld1.32    {d6-d7}, [%[a3]]!       \n"
-  //          "vld1.32    {d8-d9}, [%[a4]]!       \n"
-  //          "vld1.32    {d10-d11}, [%[a5]]!     \n"
-  //          "vtrn.32    q0, q1                  \n"
-  //          "vtrn.32    q2, q3                  \n"
-  //          "vtrn.32    q4, q5                  \n"
-  //          "vswp.32    d1, d4                  \n"
-  //          "vswp.32    d3, d6                  \n"
-  //
-  //          "vst1.32    {q0}, [%[out]]!         \n"
-  //          "vst1.32    {d8}, [%[out]]!         \n"
-  //          "vst1.32    {q1}, [%[out]]!         \n"
-  //          "vst1.32    {d10}, [%[out]]!        \n"
-  //          "vst1.32    {q2}, [%[out]]!         \n"
-  //          "vst1.32    {d9}, [%[out]]!         \n"
-  //          "vst1.32    {q3}, [%[out]]!         \n"
-  //          "vst1.32    {d11}, [%[out]]!        \n"
-  //
-  //          "subs       %[loops], #1            \n"
-  //          "bne        loop_4k_%=              \n"
-  //          : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2]
-  //          "+r"(a2),
-  //            [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
-  //          :
-  //          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
-  // #endif
-  //    }
-  //
-  //    if (remain_k > 0) {
-  //      float32x4_t _d0 = vld1q_f32(a0);
-  //      float32x4_t _d1 = vld1q_f32(a1);
-  //      float32x4_t _d2 = vld1q_f32(a2);
-  //      float32x4_t _d3 = vld1q_f32(a3);
-  //      float32x4_t _d4 = vld1q_f32(a4);
-  //      float32x4_t _d5 = vld1q_f32(a5);
-  //
-  //      _d0 = vandq_f32_u32(_d0, vmask1);
-  //      _d1 = vandq_f32_u32(_d1, vmask1);
-  //      _d2 = vandq_f32_u32(_d2, vmask1);
-  //      _d3 = vandq_f32_u32(_d3, vmask1);
-  //      _d4 = vandq_f32_u32(_d4, vmask1);
-  //      _d5 = vandq_f32_u32(_d5, vmask1);
-  //
-  //      float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
-  //      float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
-  //      float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
-  //      _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
-  //      vget_low_f32(_q1.val[0])); _d1 =
-  //      vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2
-  //      = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
-  //
-  //      switch (remain_k) {
-  //        case 3:
-  //          vst1q_f32(out_ptr + 12, _d2);
-  //          vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
-  //        case 2:
-  //          vst1q_f32(out_ptr + 6, _d1);
-  //          vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
-  //        case 1:
-  //          vst1q_f32(out_ptr, _d0);
-  //          vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
-  //        default:
-  //          break;
-  //      }
-  //    }
-  //  }
-  //
-  //  int remain_m = m % 6;
-  //  if (remain_m) {
-  //    int remain_m_start = m - remain_m;
-  //    std::cout << "remain_m_start: " << remain_m_start << std::endl;
-  //    const float *a0 = A + remain_m_start * lda;
-  //    const float *a1 = a0 + lda;
-  //    const float *a2 = a0 + 2 * lda;
-  //    const float *a3 = a0 + 3 * lda;
-  //    const float *a4 = a0 + 4 * lda;
-  //    const float *a5 = a0 + 5 * lda;
-  //    float *out_ptr = output + remain_m_start * k;
-  //
-  //    uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m));
-  //    uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4),
-  //    vdupq_n_u32(remain_m));
-  //
-  //    int loops = k >> 2;
-  //    if (loops > 0) {
-  // #if __aarch64__
-  //      for (int l = 0; l < loops; ++l) {
-  //        float32x4_t _d0 = vld1q_f32(a0);
-  //        float32x4_t _d1 = vld1q_f32(a1);
-  //        float32x4_t _d2 = vld1q_f32(a2);
-  //        float32x4_t _d3 = vld1q_f32(a3);
-  //        float32x4_t _d4 = vld1q_f32(a4);
-  //        float32x4_t _d5 = vld1q_f32(a5);
-  //
-  //        float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
-  //        float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
-  //        float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
-  //        _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
-  //        vget_low_f32(_q1.val[0])); _d1 =
-  //        vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
-  //        _d2 =
-  //            vcombine_f32(vget_high_f32(_q0.val[0]),
-  //            vget_high_f32(_q1.val[0]));
-  //        _d3 =
-  //            vcombine_f32(vget_high_f32(_q0.val[1]),
-  //            vget_high_f32(_q1.val[1]));
-  //
-  //        _d0 = vandq_f32_u32(_d0, vmask2);
-  //        _d1 = vandq_f32_u32(_d1, vmask2);
-  //        _d2 = vandq_f32_u32(_d2, vmask2);
-  //        _d3 = vandq_f32_u32(_d3, vmask2);
-  //        _d4 = vandq_f32_u32(_q3.val[0], vmask3);
-  //        _d5 = vandq_f32_u32(_q3.val[1], vmask3);
-  //
-  //        vst1q_f32(out_ptr, _d0);
-  //        vst1_f32(out_ptr + 4, vget_low_f32(_d4));
-  //        vst1q_f32(out_ptr + 6, _d1);
-  //        vst1_f32(out_ptr + 10, vget_low_f32(_d5));
-  //        vst1q_f32(out_ptr + 12, _d2);
-  //        vst1_f32(out_ptr + 16, vget_high_f32(_d4));
-  //        vst1q_f32(out_ptr + 18, _d3);
-  //        vst1_f32(out_ptr + 22, vget_high_f32(_d5));
-  //
-  //        a0 += 4;
-  //        a1 += 4;
-  //        a2 += 4;
-  //        a3 += 4;
-  //        a4 += 4;
-  //        a5 += 4;
-  //        out_ptr += 24;
-  //      }
-  // #else
-  //      asm volatile(
-  //          "loop_4k_%=:                        \n"
-  //          "vld1.32    {d0-d1}, [%[a0]]!       \n"
-  //          "vld1.32    {d2-d3}, [%[a1]]!       \n"
-  //          "vld1.32    {d4-d5}, [%[a2]]!       \n"
-  //          "vld1.32    {d6-d7}, [%[a3]]!       \n"
-  //          "vld1.32    {d8-d9}, [%[a4]]!       \n"
-  //          "vld1.32    {d10-d11}, [%[a5]]!     \n"
-  //          "vtrn.32    q0, q1                  \n"
-  //          "vtrn.32    q2, q3                  \n"
-  //          "vtrn.32    q4, q5                  \n"
-  //          "vswp.32    d1, d4                  \n"
-  //          "vswp.32    d3, d6                  \n"
-  //
-  //          "vbif       q0, %q[vzero], %q[vmask2] \n"
-  //          "vbif       q1, %q[vzero], %q[vmask2] \n"
-  //          "vbif       q2, %q[vzero], %q[vmask2] \n"
-  //          "vbif       q3, %q[vzero], %q[vmask2] \n"
-  //          "vbif       q4, %q[vzero], %q[vmask3] \n"
-  //          "vbif       q5, %q[vzero], %q[vmask3] \n"
-  //
-  //          "vst1.32    {q0}, [%[out]]!         \n"
-  //          "vst1.32    {d8}, [%[out]]!         \n"
-  //          "vst1.32    {q1}, [%[out]]!         \n"
-  //          "vst1.32    {d10}, [%[out]]!        \n"
-  //          "vst1.32    {q2}, [%[out]]!         \n"
-  //          "vst1.32    {d9}, [%[out]]!         \n"
-  //          "vst1.32    {q3}, [%[out]]!         \n"
-  //          "vst1.32    {d11}, [%[out]]!        \n"
-  //
-  //          "subs       %[loops], #1            \n"
-  //          "bne        loop_4k_%=              \n"
-  //          : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2]
-  //          "+r"(a2),
-  //            [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
-  //          : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero)
-  //          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
-  // #endif
-  //    }
-  //
-  //    if (remain_k > 0) {
-  //      float32x4_t _d0 = vld1q_f32(a0);
-  //      float32x4_t _d1 = vld1q_f32(a1);
-  //      float32x4_t _d2 = vld1q_f32(a2);
-  //      float32x4_t _d3 = vld1q_f32(a3);
-  //      float32x4_t _d4 = vld1q_f32(a4);
-  //      float32x4_t _d5 = vld1q_f32(a5);
-  //
-  //      _d0 = vandq_f32_u32(_d0, vmask1);
-  //      _d1 = vandq_f32_u32(_d1, vmask1);
-  //      _d2 = vandq_f32_u32(_d2, vmask1);
-  //      _d3 = vandq_f32_u32(_d3, vmask1);
-  //      _d4 = vandq_f32_u32(_d4, vmask1);
-  //      _d5 = vandq_f32_u32(_d5, vmask1);
-  //
-  //      float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
-  //      float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
-  //      float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
-  //      _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
-  //      vget_low_f32(_q1.val[0])); _d1 =
-  //      vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2
-  //      = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
-  //      // _d3 = vcombine_f32(vget_high_f32(_q0.val[1]),
-  //      //                    vget_high_f32(_q1.val[1]));
-  //
-  //      _d0 = vandq_f32_u32(_d0, vmask2);
-  //      _d1 = vandq_f32_u32(_d1, vmask2);
-  //      _d2 = vandq_f32_u32(_d2, vmask2);
-  //      // _d3 = vandq_f32_u32(_d3, vmask2);
-  //      _d4 = vandq_f32_u32(_q3.val[0], vmask3);
-  //      _d5 = vandq_f32_u32(_q3.val[1], vmask3);
-  //
-  //      switch (remain_k) {
-  //        case 3:
-  //          vst1q_f32(out_ptr + 12, _d2);
-  //          vst1_f32(out_ptr + 16, vget_high_f32(_d4));
-  //        case 2:
-  //          vst1q_f32(out_ptr + 6, _d1);
-  //          vst1_f32(out_ptr + 10, vget_low_f32(_d5));
-  //        case 1:
-  //          vst1q_f32(out_ptr, _d0);
-  //          vst1_f32(out_ptr + 4, vget_low_f32(_d4));
-  //        default:
-  //          break;
-  //      }
-  //    }
-  //  }
 }
 
 #if __aarch64__
diff --git a/src/operators/math/math_func_neon.h b/src/operators/math/math.h
similarity index 100%
rename from src/operators/math/math_func_neon.h
rename to src/operators/math/math.h
diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp
index 6b34f522ff6caf32c20971d9cf38f93730fdb727..e066b0cccddf9a43953182788508aca4769fcd27 100644
--- a/src/operators/math/softmax.cpp
+++ b/src/operators/math/softmax.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <algorithm>
 #include <limits>
 #include "common/types.h"
-#include "operators/math/math_func_neon.h"
+#include "operators/math/math.h"
 
 namespace paddle_mobile {
 namespace operators {