update

dd575b09 · hjchen2 · 42e520bb · dd575b09 · dd575b09 · dd575b09
49 changed files
--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -125,10 +125,6 @@ LOAD_OP1(prior_box, CPU);
 LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_add_relu);
 #endif
-#ifdef FUSION_CONVADDADDPRELU_OP
-LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA);
-LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu);
-#endif
 #ifdef FUSION_CONVADD_OP
 LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
 LOAD_FUSION_MATCHER(fusion_conv_add);
@@ -178,10 +174,6 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn);
 #ifdef DROPOUT_OP
 LOAD_OP2(dropout, CPU, FPGA);
 #endif
-#ifdef FUSION_CONVADDPRELU_OP
-LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA);
-LOAD_FUSION_MATCHER(fusion_conv_add_prelu);
-#endif
 #ifdef FUSION_DWCONVBNRELU_OP
 LOAD_OP1(fusion_dwconv_bn_relu, CPU);
 LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);

--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -39,9 +39,9 @@ void ConvOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/depthwise_conv_op.cpp
+++ b/src/operators/depthwise_conv_op.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
 #include "operators/conv_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -40,9 +40,9 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_add_add_prelu_op.cpp
+++ b/src/operators/fusion_conv_add_add_prelu_op.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDADDPRELU_OP
-#include "operators/fusion_conv_add_add_prelu_op.h"
-#include "operators/math/conv_func.h"
-namespace paddle_mobile {
-namespace operators {
-template <typename Dtype, typename T>
-void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const {
-  auto in_dims = this->param_.Input()->dims();
-  auto filter_dims = this->param_.Filter()->dims();
-  const std::vector<int> &strides = this->param_.Strides();
-  std::vector<int> paddings = this->param_.Paddings();
-  int groups = this->param_.Groups();
-  std::vector<int> dilations = this->param_.Dilations();
-  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
-                         dilations.size() == paddings.size() &&
-                         paddings.size() == strides.size()),
-                        "ConvParam is not suitable");
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
-  }
-  framework::DDim ddim = framework::make_ddim(output_shape);
-  this->param_.Output()->Resize(ddim);
-}
-}  // namespace operators
-}  // namespace paddle_mobile
-namespace ops = paddle_mobile::operators;
-REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu,
-                        ops::FusionConvAddAddPReluOpMatcher);
-#ifdef PADDLE_MOBILE_CPU
-REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
-#endif
-#ifdef PADDLE_MOBILE_FPGA
-REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
-#endif
-#endif  // FUSION_CONVADDADDPRELU_OP
--- a/src/operators/fusion_conv_add_add_prelu_op.h
+++ b/src/operators/fusion_conv_add_add_prelu_op.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDADDPRELU_OP
-#pragma once
-#include <string>
-#include <utility>
-#include <vector>
-#include "framework/operator.h"
-#include "framework/program/program-optimize/fusion_op_register.h"
-#include "operators/kernel/conv_add_add_prelu_kernel.h"
-#include "operators/op_param.h"
-namespace paddle_mobile {
-namespace operators {
-class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher {
- public:
-  FusionConvAddAddPReluOpMatcher() {
-    node_ = framework::Node(G_OP_TYPE_CONV);
-    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
-  }
-  void FolderNodes(
-      framework::Node *node,
-      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    node->Folder(node_.Depth(), Type(),
-                 {{G_OP_TYPE_ELEMENTWISE_ADD,
-                   {{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}},
-                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
-                 removed_nodes);
-  }
-  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; }
-  std::vector<std::pair<int, std::string>> NeedCheck() {
-    DLOG << " conv add add prelu check add X ";
-    return {{2, "Y"}, {2, "X"}};
-  }
-};
-template <typename DeviceType, typename T>
-class FusionConvAddAddPReluOp
-    : public framework::OperatorWithKernel<
-          DeviceType, FusionConvAddAddPReluParam<DeviceType>,
-          operators::ConvAddAddPReluKernel<DeviceType, T>> {
- public:
-  FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs,
-                          const VariableNameMap &outputs,
-                          const framework::AttributeMap &attrs,
-                          framework::Scope *scope)
-      : framework::OperatorWithKernel<
-            DeviceType, FusionConvAddAddPReluParam<DeviceType>,
-            operators::ConvAddAddPReluKernel<DeviceType, T>>(
-            type, inputs, outputs, attrs, scope) {}
-  void InferShape() const override;
- protected:
-};
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
--- a/src/operators/fusion_conv_add_bn_op.cpp
+++ b/src/operators/fusion_conv_add_bn_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBN_OP
 #include "operators/fusion_conv_add_bn_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_add_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_add_bn_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 #include "operators/fusion_conv_add_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_add_op.cpp
+++ b/src/operators/fusion_conv_add_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP
 #include "operators/fusion_conv_add_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_add_prelu_op.cpp
+++ b/src/operators/fusion_conv_add_prelu_op.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDPRELU_OP
-#include "operators/fusion_conv_add_prelu_op.h"
-#include "operators/math/conv_func.h"
-namespace paddle_mobile {
-namespace operators {
-template <typename Dtype, typename T>
-void FusionConvAddPReluOp<Dtype, T>::InferShape() const {
-  auto in_dims = this->param_.Input()->dims();
-  auto filter_dims = this->param_.Filter()->dims();
-  const std::vector<int> &strides = this->param_.Strides();
-  std::vector<int> paddings = this->param_.Paddings();
-  int groups = this->param_.Groups();
-  std::vector<int> dilations = this->param_.Dilations();
-  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
-                         dilations.size() == paddings.size() &&
-                         paddings.size() == strides.size()),
-                        "ConvParam is not suitable");
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
-  }
-  framework::DDim ddim = framework::make_ddim(output_shape);
-  this->param_.Output()->Resize(ddim);
-}
-}  // namespace operators
-}  // namespace paddle_mobile
-namespace ops = paddle_mobile::operators;
-REGISTER_FUSION_MATCHER(fusion_conv_add_prelu,
-                        ops::FusionConvAddPReluOpMatcher);
-#ifdef PADDLE_MOBILE_CPU
-REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
-#endif
-#ifdef PADDLE_MOBILE_FPGA
-REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
-#endif
-#endif
--- a/src/operators/fusion_conv_add_prelu_op.h
+++ b/src/operators/fusion_conv_add_prelu_op.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDPRELU_OP
-#pragma once
-#include <string>
-#include <vector>
-#include "framework/operator.h"
-#include "framework/program/program-optimize/fusion_op_register.h"
-#include "operators/kernel/conv_add_prelu_kernel.h"
-#include "operators/op_param.h"
-namespace paddle_mobile {
-namespace operators {
-class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher {
- public:
-  FusionConvAddPReluOpMatcher() {
-    node_ = framework::Node(G_OP_TYPE_CONV);
-    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
-  }
-  void FolderNodes(
-      framework::Node *node,
-      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    node->Folder(node_.Depth(), Type(),
-                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
-                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
-                 removed_nodes);
-  }
-  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; }
-};
-template <typename DeviceType, typename T>
-class FusionConvAddPReluOp
-    : public framework::OperatorWithKernel<
-          DeviceType, FusionConvAddPReluParam<DeviceType>,
-          operators::ConvAddPReluKernel<DeviceType, T>> {
- public:
-  FusionConvAddPReluOp(const string &type, const VariableNameMap &inputs,
-                       const VariableNameMap &outputs,
-                       const framework::AttributeMap &attrs,
-                       framework::Scope *scope)
-      : framework::OperatorWithKernel<
-            DeviceType, FusionConvAddPReluParam<DeviceType>,
-            operators::ConvAddPReluKernel<DeviceType, T>>(type, inputs, outputs,
-                                                          attrs, scope) {}
-  void InferShape() const override;
- protected:
-};
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDRELU_OP
 #include "operators/fusion_conv_add_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);
  this->param_.Output()->Resize(ddim);

--- a/src/operators/fusion_conv_bn_add_relu_op.cpp
+++ b/src/operators/fusion_conv_bn_add_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNADDRELU_OP
 #include "operators/fusion_conv_bn_add_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_bn_op.cpp
+++ b/src/operators/fusion_conv_bn_op.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBN_OP
 #include "operators/fusion_conv_bn_op.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -35,9 +36,9 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_bn_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNRELU_OP
 #include "operators/fusion_conv_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_dwconv_bn_relu_op.cpp
+++ b/src/operators/fusion_dwconv_bn_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_DWCONVBNRELU_OP
 #include "operators/fusion_dwconv_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                                          dilations[i], paddings[i],
-                             paddings[i], strides[i]));
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDADDPRELU_OP
-#include "operators/kernel/conv_add_add_prelu_kernel.h"
-#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h"
-namespace paddle_mobile {
-namespace operators {
-template <>
-bool ConvAddAddPReluKernel<CPU, float>::Init(
-    FusionConvAddAddPReluParam<CPU> *param) {
-  return true;
-}
-template <>
-void ConvAddAddPReluKernel<CPU, float>::Compute(
-    const FusionConvAddAddPReluParam<CPU> &param) {
-  ConvAddAddPReluCompute<float>(param);
-}
-template class ConvAddAddPReluKernel<CPU, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
--- a/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 namespace paddle_mobile {
 namespace operators {
@@ -62,34 +63,24 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
    const FusionConvAddBNReluParam<CPU> &param) {
  switch (param.ExecMode()) {
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
+      DepthwiseConv3x3<float, float>(param);
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
      DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
      WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionConvAddBNReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());
  }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 template class ConvAddBNReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/convolution/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_add_kernel.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "operators/kernel/conv_add_kernel.h"
 #include "operators/kernel/arm/convolution/conv_common.h"
-#include "operators/kernel/central-arm-func/conv_add_arm_func.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 namespace paddle_mobile {
 namespace operators {
@@ -32,34 +32,29 @@ template <>
 void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
+  // Each case below only performs the convolution for its execute mode; the
+  // channel-wise bias add is applied once, after the switch.
  switch (param.ExecMode()) {
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
                                             param.Paddings(), param.Output());
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
                                             param.Paddings(), param.Output());
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
      DepthwiseConv5x5<float, float>(param);
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
      break;
    case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
      WinogradConv3x3<8, 3>(param);
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
      break;
    case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvAddBasic(param);
+      GemmConv<float, float>(param);
      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());
  }
+  math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(), param.Output());
 }
 template class ConvAddKernel<CPU, float>;

--- a/src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDPRELU_OP
-#include "operators/kernel/conv_add_prelu_kernel.h"
-#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h"
-namespace paddle_mobile {
-namespace operators {
-template <>
-bool ConvAddPReluKernel<CPU, float>::Init(FusionConvAddPReluParam<CPU> *param) {
-  return true;
-}
-template <>
-void ConvAddPReluKernel<CPU, float>::Compute(
-    const FusionConvAddPReluParam<CPU> &param) {
-  ConvAddPReluCompute<float>(param);
-}
-template class ConvAddPReluKernel<CPU, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
--- a/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "operators/kernel/conv_add_relu_kernel.h"
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 namespace paddle_mobile {
 namespace operators {
@@ -32,30 +33,23 @@ void ConvAddReluKernel<CPU, float>::Compute(
    const FusionConvAddReluParam<CPU> &param) {
  switch (param.ExecMode()) {
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
-      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
+      DepthwiseConv3x3<float, float>(param);
-                                             param.Paddings(), param.Output());
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
      DepthwiseConv5x5<float, float>(param);
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
      WinogradConv3x3<8, 3>(param);
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvAddReluBasic<FusionConvAddReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());
  }
+  math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
 }
 template class ConvAddReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 namespace paddle_mobile {
 namespace operators {
@@ -62,34 +63,24 @@ void ConvBNAddReluKernel<CPU, float>::Compute(
    const FusionConvBNAddReluParam<CPU> &param) {
  switch (param.ExecMode()) {
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
+      DepthwiseConv3x3<float, float>(param);
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
      DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
      WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionConvBNAddReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());
  }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 template class ConvBNAddReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 namespace paddle_mobile {
 namespace operators {
@@ -61,34 +62,24 @@ void ConvBNReluKernel<CPU, float>::Compute(
    const FusionConvBNReluParam<CPU> &param) {
  switch (param.ExecMode()) {
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
+      DepthwiseConv3x3<float, float>(param);
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
      DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
      WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionConvBNReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());
  }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 template class ConvBNReluKernel<CPU, float>;

--- a/src/operators/kernel/arm/convolution/conv_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_kernel.cpp
@@ -32,10 +32,10 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
 template <>
 void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
  switch (param.ExecMode()) {
+#ifndef __aarch64__
    case ConvParam<CPU>::EXEC_GEMM_INT8:
      GemmConv<int8_t, int32_t>(param);
      break;
-#ifndef __aarch64__
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3_INT8:
      DepthwiseConv3x3<int8_t, int32_t>(param);
      break;
@@ -44,12 +44,8 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
      break;
 #endif  // __aarch64__
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
+      DepthwiseConv3x3<float, float>(param);
-                                             param.Paddings(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
      DepthwiseConv5x5<float, float>(param);

--- a/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 namespace paddle_mobile {
 namespace operators {
@@ -61,37 +62,28 @@ void DWConvBNReluKernel<CPU, float>::Compute(
    const FusionDWConvBNReluParam<CPU> &param) {
  switch (param.ExecMode()) {
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
+      DepthwiseConv3x3<float, float>(param);
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
 #ifndef __aarch64__
    case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
      DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
    case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
      WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
      break;
 #endif  // __aarch64__
    case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionDWConvBNReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());
  }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 template class DWConvBNReluKernel<CPU, float>;
 }  // namespace operators

--- a/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDADDPRELU_OP
-#pragma once
-#include <string>
-#include <vector>
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-namespace paddle_mobile {
-namespace operators {
-template <typename P>
-void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  Tensor bias1 = *param.Bias1();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  float *biase_data = bias.data<float>();
-  int axis = param.Axis();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  Tensor aa = *param.InputAlpha();
-  float *p = aa.data<float>();
-  std::string mode = param.Mode();
-  const int batch_size = static_cast<int>(input->dims()[0]);
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    Tensor bias1_batch = bias1.Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step);
-      float *biase_data1 = bias1_slice.data<float>();
-      math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
-                            p, mode, biase_data, biase_data1);
-    }
-  }
-}
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif  // FUSION_CONVADDADDPRELU_OP
--- a/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDPRELU_OP
-#pragma once
-#include <string>
-#include <vector>
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-namespace paddle_mobile {
-namespace operators {
-template <typename P>
-void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  float *biase_data = bias.data<float>();
-  int axis = param.Axis();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  Tensor aa = *param.InputAlpha();
-  float *p = aa.data<float>();
-  std::string mode = param.Mode();
-  const int batch_size = static_cast<int>(input->dims()[0]);
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
-                            p, mode, biase_data, nullptr);
-    }
-  }
-}
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif  // FUSION_CONVADDPRELU_OP
--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -12,38 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef FUSION_CONVADD_OP
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
-#pragma once
 #include <vector>
-#include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv3x3.h"
+#include "operators/math/depthwise_conv5x5.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
+#include "operators/math/pad.h"
 #include "operators/math/vol2col.h"
+#include "operators/math/winograd/winograd_transform.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
-void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
+int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
+                   int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  return output_size;
+}
+bool IsExpand(const std::vector<int64_t> &filter_dim,
+              const std::vector<int> &strides, const std::vector<int> &paddings,
+              const std::vector<int> &dilations) {
+  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
+  for (size_t j = 0; j < strides.size(); ++j) {
+    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
+    strides_1 = strides_1 && (strides[j] == 1);
+    padding_0 = padding_0 && (paddings[j] == 0);
+    dilation_1 = dilation_1 && (dilations[j] == 1);
+  }
+  return !(filter_1 && strides_1 && padding_0 && dilation_1);
+}
+template <typename Itype, typename Otype>
+void GemmConv(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
  Tensor *output = param.Output();
-  output->mutable_data<float>();
+  output->mutable_data<Otype>();
-  float *biase_data = bias.data<float>();
-  int axis = param.Axis();
  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
+  const std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
+  const std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
+  const std::vector<int> dilations = param.Dilations();
-  const int batch_size = static_cast<int>(input->dims()[0]);
  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
@@ -57,12 +73,11 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand =
+  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
-    col.mutable_data<float>(col_shape);
+    col.mutable_data<Itype>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }
@@ -81,9 +96,10 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Vol2ColFunctor<CPU, Itype> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
+  const int batch_size = static_cast<int>(input->dims()[0]);
  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
@@ -92,8 +108,8 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
      if (!is_expand) {
-        col.ShareDataWith(in_slice);
+        // col_matrix.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
+        col_matrix = in_slice;
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
@@ -105,17 +121,122 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::MatMul<float, float>(filter_slice, false, col_matrix, false,
+      math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
                                 static_cast<float>(1), &out_slice,
-                                 static_cast<float>(1), false, biase_data);
+                                 static_cast<float>(0), false,
+                                 static_cast<Otype *>(nullptr));
    }
  }
 }
-}  // namespace operators
+template <int tile, int kernel>
-}  // namespace paddle_mobile
+void WinogradConv3x3(const ConvParam<CPU> &param) {
+  const Tensor *input = param.Input();
+  const Tensor *filter = param.transformed_filter_;
+  Tensor *output = param.Output();
+  output->mutable_data<float>();
+  int batch_size = input->dims()[0];
+  int groups = param.Groups();
+  const std::vector<int> &paddings = param.Paddings();
+  auto winograd_pad = [&](int width, int pad) {
+    int output_tile = tile - kernel + 1;
+    // int tiles = (width + pad - kernel) / output_tile + 1;
+    // return (tiles - 1) * output_tile + tile - width;
+    int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile;
+    return pad_width + tile - width;
+  };
+  math::PadFunctor<CPU, float> pad;
+  Tensor input_pad;
+  framework::Tensor transformed_input;
+  for (int i = 0; i < batch_size; ++i) {
+    Tensor in_batch = input->Slice(i, i + 1);
+    Tensor out_batch = output->Slice(i, i + 1);
+    // int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
+    // int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
+    int pad_bottom = paddings[0];
+    int pad_right = paddings[1];
+    if (paddings[0] || paddings[1] || pad_bottom || pad_right) {
+      framework::DDim pad_shape = in_batch.dims();
+      pad_shape[2] += paddings[0] + pad_bottom;
+      pad_shape[3] += paddings[1] + pad_right;
+      input_pad.mutable_data<float>(pad_shape);
+      pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right,
+          &input_pad);
+    } else {
+      input_pad = in_batch;
+    }
+    // tile input and transform
+    math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
+    // caculate output
+    math::winograd_transform_output<tile, kernel>(transformed_input, *filter,
+                                                  output);
+  }
+}
+template <typename Itype, typename Otype>
+void DepthwiseConv3x3(const ConvParam<CPU> &param) {
+  const Tensor *input = param.Input();
+  const Tensor *filter = param.Filter();
+  const std::vector<int> &paddings = param.Paddings();
+  const std::vector<int> &strides = param.Strides();
+  const int batch_size = input->dims()[0];
+  Tensor *output = param.Output();
+  output->mutable_data<Otype>();
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1);
+    Tensor out_batch = output->Slice(i, i + 1);
+    if (strides[0] == 1) {
+      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
+                                             &out_batch);
+    } else if (strides[0] == 2) {
+      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
+                                             &out_batch);
+    } else {
+      GemmConv<Itype, Otype>(param);
+    }
+  }
+}
+template <typename Itype, typename Otype>
+void DepthwiseConv5x5(const ConvParam<CPU> &param) {
+  const Tensor *input = param.Input();
+  const Tensor *filter = param.Filter();
+  const std::vector<int> &paddings = param.Paddings();
+  const std::vector<int> &strides = param.Strides();
+  const int batch_size = input->dims()[0];
+  Tensor *output = param.Output();
+  output->mutable_data<Otype>();
+  //  if (strides[0] == 1) {
+  //    for (int i = 0; i < batch_size; i++) {
+  //      Tensor in_batch = input->Slice(i, i + 1);
+  //      Tensor out_batch = output->Slice(i, i + 1);
+  //      math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
+  //                                             &out_batch);
+  //    }
+  //  } else {
+  GemmConv<Itype, Otype>(param);
+  //  }
+}
+template void GemmConv<float, float>(const ConvParam<CPU> &param);
+template void WinogradConv3x3<8, 3>(const ConvParam<CPU> &param);
+template void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param);
+template void DepthwiseConv5x5<float, float>(const ConvParam<CPU> &param);
+#ifndef __aarch64__
+template void GemmConv<int8_t, int32_t>(const ConvParam<CPU> &param);
+template void DepthwiseConv3x3<int8_t, int32_t>(const ConvParam<CPU> &param);
+template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param);
 #endif
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -15,386 +15,31 @@ limitations under the License. */
 #ifdef CONV_OP
 #pragma once
 #include <vector>
-#include "operators/math/conv_func.h"
-#include "operators/math/depthwise_conv3x3.h"
-#include "operators/math/depthwise_conv5x5.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/pad.h"
-#include "operators/math/vol2col.h"
-#include "operators/math/winograd/winograd_transform.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
-template <typename Itype, typename Otype>
+int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
-inline void GemmConv(const ConvParam<CPU> &param) {
+                   int stride);
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor *output = param.Output();
-  output->mutable_data<Otype>();
-  int groups = param.Groups();
-  const std::vector<int> strides = param.Strides();
-  const std::vector<int> paddings = param.Paddings();
-  const std::vector<int> dilations = param.Dilations();
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<Itype>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, Itype> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
-  const int batch_size = static_cast<int>(input->dims()[0]);
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-      if (!is_expand) {
-        // col_matrix.ShareDataWith(in_slice);
-        col_matrix = in_slice;
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
+bool IsExpand(const std::vector<int64_t> &filter_dim,
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+              const std::vector<int> &strides, const std::vector<int> &paddings,
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+              const std::vector<int> &dilations);
-      math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
+template <typename Itype, typename Otype>
-                                 static_cast<float>(1), &out_slice,
+void GemmConv(const ConvParam<CPU> &param);
-                                 static_cast<float>(0), false,
-                                 static_cast<Otype *>(nullptr));
-    }
-  }
-}
 template <int tile, int kernel>
-inline void WinogradConv3x3(const ConvParam<CPU> &param) {
+void WinogradConv3x3(const ConvParam<CPU> &param);
-  const Tensor *input = param.Input();
-  const Tensor *filter = param.transformed_filter_;
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  int batch_size = input->dims()[0];
-  int groups = param.Groups();
-  const std::vector<int> &paddings = param.Paddings();
-  auto winograd_pad = [&](int width, int pad) {
-    int output_tile = tile - kernel + 1;
-    // int tiles = (width + pad - kernel) / output_tile + 1;
-    // return (tiles - 1) * output_tile + tile - width;
-    int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile;
-    return pad_width + tile - width;
-  };
-  math::PadFunctor<CPU, float> pad;
-  Tensor input_pad;
-  framework::Tensor transformed_input;
-  for (int i = 0; i < batch_size; ++i) {
-    Tensor in_batch = input->Slice(i, i + 1);
-    Tensor out_batch = output->Slice(i, i + 1);
-    // int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
-    // int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
-    int pad_bottom = paddings[0];
-    int pad_right = paddings[1];
-    if (paddings[0] || paddings[1] || pad_bottom || pad_right) {
-      framework::DDim pad_shape = in_batch.dims();
-      pad_shape[2] += paddings[0] + pad_bottom;
-      pad_shape[3] += paddings[1] + pad_right;
-      input_pad.mutable_data<float>(pad_shape);
-      pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right,
-          &input_pad);
-    } else {
-      input_pad = in_batch;
-    }
-    // tile input and transform
-    math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
-    // caculate output
-    math::winograd_transform_output<tile, kernel>(transformed_input, *filter,
-                                                  output);
-  }
-}
-#ifndef __aarch64__
-// int8 DepthwiseConv3x3
 template <typename Itype, typename Otype>
-inline void DepthwiseConv3x3(const ConvParam<CPU> &param) {
+void DepthwiseConv3x3(const ConvParam<CPU> &param);
-  const Tensor *input = param.Input();
-  const Tensor *filter = param.Filter();
-  const std::vector<int> &paddings = param.Paddings();
-  const std::vector<int> &strides = param.Strides();
-  const int batch_size = input->dims()[0];
-  Tensor *output = param.Output();
-  output->mutable_data<Otype>();
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1);
-    Tensor out_batch = output->Slice(i, i + 1);
-    if (strides[0] == 1) {
-      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
-                                             &out_batch);
-    } else if (strides[0] == 2) {
-      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
-                                             &out_batch);
-    } else {
-      GemmConv<Itype, Otype>(param);
-    }
-  }
-}
-#endif  // __aarch64__
 template <typename Itype, typename Otype>
-inline void DepthwiseConv5x5(const ConvParam<CPU> &param) {
+void DepthwiseConv5x5(const ConvParam<CPU> &param);
-  const Tensor *input = param.Input();
-  const Tensor *filter = param.Filter();
-  const std::vector<int> &paddings = param.Paddings();
-  const std::vector<int> &strides = param.Strides();
-  const int batch_size = input->dims()[0];
-  Tensor *output = param.Output();
-  output->mutable_data<Otype>();
-  //  if (strides[0] == 1) {
-  //    for (int i = 0; i < batch_size; i++) {
-  //      Tensor in_batch = input->Slice(i, i + 1);
-  //      Tensor out_batch = output->Slice(i, i + 1);
-  //      math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
-  //                                             &out_batch);
-  //    }
-  //  } else {
-  GemmConv<Itype, Otype>(param);
-  //  }
-}
-template <typename ParamType>
-void ConvAddReluBasic(const ParamType &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  float alpha = 1.0f;
-  float beta = 1.0f;
-  int32_t groups = param.Groups();
-  int32_t axis = param.Axis();
-  std::vector<int32_t> strides = param.Strides();
-  std::vector<int32_t> paddings = param.Paddings();
-  std::vector<int32_t> dilations = param.Dilations();
-  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int32_t>(input->dims().size()));
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-  // convolution operator: im2col(or vol2col) + gemm
-  int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
-  int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
-  float *bias_data = bias.data<float>();
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-  for (int32_t i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int32_t g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-      if (!is_expand) {
-        col_matrix = in_slice;
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
-                                    paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::MatMul<float, float>(filter_slice, false, col_matrix, false, alpha,
-                                 &out_slice, beta, true, bias_data);
-    }
-  }
-}
-template <typename ParamType>
-void ConvBNReluBasic(const ParamType &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor new_bias = *param.NewBias();
-  Tensor new_scale = *param.NewScale();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  const int batch_size = static_cast<int>(input->dims()[0]);
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-      if (!is_expand) {
-        col_matrix = in_slice;
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::MatMulWithBn(filter_slice, false, col_matrix, false,
-                         static_cast<float>(1), &out_slice,
-                         static_cast<float>(0), true, &new_scale, &new_bias, g);
-    }
-  }
-}
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVBNADDRELU_OP
-#pragma once
-#include <vector>
-#include "operators/math/depthwise_conv3x3.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-namespace paddle_mobile {
-namespace operators {
-void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor new_bias = *param.NewBias();
-  Tensor new_scale = *param.NewScale();
-  Tensor *bias1 = param.Bias();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  const int batch_size = static_cast<int>(input->dims()[0]);
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
-      math::MatMulWithBn(filter_slice, false, col_matrix, false,
-                         static_cast<float>(1), &out_slice,
-                         static_cast<float>(1), true, &new_scale, &new_bias, g,
-                         bias_data.data<float>());
-    }
-  }
-}
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
--- a/src/operators/kernel/conv_add_add_prelu_kernel.h
+++ b/src/operators/kernel/conv_add_add_prelu_kernel.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#ifdef FUSION_CONVADDADDPRELU_OP
-#include <vector>
-#include "framework/ddim.h"
-#include "framework/operator.h"
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-namespace paddle_mobile {
-namespace operators {
-using framework::DDim;
-using framework::OpKernelBase;
-template <typename DeviceType, typename T>
-class ConvAddAddPReluKernel
-    : public OpKernelBase<DeviceType, FusionConvAddAddPReluParam<DeviceType>> {
- public:
-  void Compute(const FusionConvAddAddPReluParam<DeviceType> &param);
-  bool Init(FusionConvAddAddPReluParam<DeviceType> *param);
-};
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
--- a/src/operators/kernel/conv_add_bn_kernel.h
+++ b/src/operators/kernel/conv_add_bn_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"

--- a/src/operators/kernel/conv_add_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_add_bn_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"

--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -23,7 +23,6 @@ limitations under the License. */
 #include "common/common.h"
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv3x3.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"

--- a/src/operators/kernel/conv_add_prelu_kernel.h
+++ b/src/operators/kernel/conv_add_prelu_kernel.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#ifdef FUSION_CONVADDPRELU_OP
-#include <vector>
-#include "framework/ddim.h"
-#include "framework/operator.h"
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-namespace paddle_mobile {
-namespace operators {
-using framework::DDim;
-using framework::OpKernelBase;
-template <typename DeviceType, typename T>
-class ConvAddPReluKernel
-    : public OpKernelBase<DeviceType, FusionConvAddPReluParam<DeviceType>> {
- public:
-  void Compute(const FusionConvAddPReluParam<DeviceType> &param);
-  bool Init(FusionConvAddPReluParam<DeviceType> *param);
-};
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
--- a/src/operators/kernel/conv_add_relu_kernel.h
+++ b/src/operators/kernel/conv_add_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"

--- a/src/operators/kernel/conv_bn_add_relu_kernel.h
+++ b/src/operators/kernel/conv_bn_add_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"

--- a/src/operators/kernel/conv_bn_kernel.h
+++ b/src/operators/kernel/conv_bn_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"

--- a/src/operators/kernel/conv_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_bn_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"

--- a/src/operators/kernel/dwconv_bn_relu_kernel.h
+++ b/src/operators/kernel/dwconv_bn_relu_kernel.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "framework/ddim.h"
 #include "framework/operator.h"
-#include "operators/math/conv_func.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"

--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -15,24 +15,21 @@ limitations under the License. */
 #pragma once
 #ifdef LRN_OP
+#include <cmath>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
-#include "framework/operator.h"
-#include "operators/op_param.h"
-#include <cmath>
 #ifdef __ARM_NEON
-#include "arm_neon.h"
+#include <arm_neon.h>
-#include "operators/math/math_func_neon.h"
+#include "operators/math/math.h"
 #endif
+#include "framework/operator.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
-using namespace framework;
 template <typename T>
 struct LRNFunctor {
  void operator()(const framework::Tensor &input, framework::Tensor *out, int N,

--- a/src/operators/math/activation.h
+++ b/src/operators/math/activation.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "common/types.h"
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
-#include "operators/math/math_func_neon.h"
+#include "operators/math/math.h"
 #endif
 namespace paddle_mobile {

--- a/src/operators/math/conv_func.h
+++ b/src/operators/math/conv_func.h
@@ -14,91 +14,16 @@ limitations under the License. */
 #pragma once
-#include <vector>
+#include "framework/tensor.h"
+#include "operators/math/activation.h"
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
-#include "framework/ddim.h"
-#include "framework/tensor.h"
-#include "operators/math/activation.h"
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-using framework::DDim;
-using framework::Tensor;
-inline int ConvOutputSize(int input_size, int filter_size, int dilation,
-                          int padding, int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
-  return output_size;
-}
-inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {  // NOLINT
-  const auto bias_ptr = bias.data<float>();
-  const DDim bias_ddim = bias.dims();
-  PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
-                        "the bias tensor's dims size != 1")
-  DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
-  DDim inner_ddim =
-      paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
-  int outer_size = paddle_mobile::framework::product(outer_ddim);
-  int inner_size = paddle_mobile::framework::product(inner_ddim);
-  bias.Resize(dDim);
-  auto new_ptr = bias.mutable_data<float>();
-  int axis_size = dDim[axis];
-#ifdef __ARM_NEON
-  for (int i = 0; i < outer_size; ++i) {
-    int inner_num = inner_size >> 4;
-    int remain = inner_size - (inner_num << 4);
-    float v_bias = bias_ptr[i * axis_size / outer_size];
-    for (; inner_num > 0; inner_num--) {
-      float32x4_t v_newptr1 = vdupq_n_f32(v_bias);
-      float32x4_t v_newptr2 = vdupq_n_f32(v_bias);
-      float32x4_t v_newptr3 = vdupq_n_f32(v_bias);
-      float32x4_t v_newptr4 = vdupq_n_f32(v_bias);
-      vst1q_f32(new_ptr, v_newptr1);
-      new_ptr += 4;
-      vst1q_f32(new_ptr, v_newptr2);
-      new_ptr += 4;
-      vst1q_f32(new_ptr, v_newptr3);
-      new_ptr += 4;
-      vst1q_f32(new_ptr, v_newptr4);
-      new_ptr += 4;
-    }
-    for (; remain > 0; remain--) {
-      *new_ptr = v_bias;
-      new_ptr++;
-    }
-  }
-#else
-  for (int i = 0; i < outer_size; ++i) {
-    float v_bias = bias_ptr[i * axis_size / outer_size];
-    for (int j = 0; j < inner_size; ++j) {
-      new_ptr[i * inner_size + j] = v_bias;
-    }
-  }
-#endif
-}
-inline bool IsExpand(const std::vector<int64_t> &filter_dim,
-                     const std::vector<int> &strides,
-                     const std::vector<int> &paddings,
-                     const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
 template <ActivationType Act>
 void AddChannelWise(const framework::Tensor *input,
                    const framework::Tensor *bias, framework::Tensor *output) {

--- a/src/operators/math/depthwise_conv3x3.h
+++ b/src/operators/math/depthwise_conv3x3.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "framework/tensor.h"
-#include "operators/math/conv_func.h"
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/math/depthwise_conv3x3_int8.cpp
+++ b/src/operators/math/depthwise_conv3x3_int8.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(__ARM_NEON__) && !defined(__aarch64__)
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
 #include "operators/math/depthwise_conv3x3.h"
@@ -70,7 +70,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
  DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start)
  // middle
  int remain_start = valid_w_start;
-#ifdef __ARM_NEON__
  int output_tiles = (valid_w_end - valid_w_start) / 6;
  remain_start = valid_w_start + output_tiles * 6;
  int32x4_t _sum0, _sum1;
@@ -94,7 +93,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
    vst1q_s32(output_ptr + output_offset, _sum0);
    vst1_s32(output_ptr + output_offset + 4, vget_low_s32(_sum1));
  }
-#endif  // __ARM_NEON__
  for (int w = remain_start; w < valid_w_end; ++w) {
    int32_t value = 0;
    int input_start = -padding_w + w * Stride_w;
@@ -215,6 +213,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
        output_ptr2 += valid_w_start;
        output_ptr3 += valid_w_start;
      }
+#if __aarch64__
+#else
      // valid
      int loop = output_w_tiles;
      asm volatile(
@@ -525,6 +525,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
          : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
          : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
            "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
      // pad right
      if (padding_w) {
        int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
@@ -618,7 +619,9 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
        output_ptr0 += valid_w_start;
        output_ptr1 += valid_w_start;
      }
-      // valid
+        // valid
+#if __aarch64__
+#else
      int loop = output_w_tiles;
      asm volatile(
          "cmp        %[loop], #0                  \n"
@@ -804,6 +807,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
          : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
          : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
            "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
      // pad right
      if (padding_w) {
        int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
@@ -869,7 +873,9 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
        }
        output_ptr0 += valid_w_start;
      }
-      // valid
+        // valid
+#if __aarch64__
+#else
      int loop = output_w_tiles;
      asm volatile(
          "cmp        %[loop], #0                  \n"
@@ -993,6 +999,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
          : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
          : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
            "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
      // pad right
      if (padding_w) {
        int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
@@ -1152,7 +1159,9 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
        output_ptr1 += valid_w_start;
        output_ptr2 += valid_w_start;
      }
-      // valid
+        // valid
+#if __aarch64__
+#else
      int loop = output_w_tiles;
      asm volatile(
          "cmp        %[loop], #0                     \n"
@@ -1411,6 +1420,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
          : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
          : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
            "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
      // pad right
      if (padding_w > 0) {
        int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0)));
@@ -1490,7 +1500,9 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
        input_ptr2 += valid_input_w_start;
        output_ptr0 += valid_w_start;
      }
-      // valid
+        // valid
+#if __aarch64__
+#else
      int loop = output_w_tiles;
      asm volatile(
          "cmp        %[loop], #0                      \n"
@@ -1608,6 +1620,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
          : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
          : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
            "q12", "q13", "q14", "q15", "r0");
+#endif  // __aarch64__
      // pad right
      if (padding_w > 0) {
        int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0)));
@@ -1645,4 +1658,4 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
 }  // namespace operators
 }  // namespace paddle_mobile
-#endif
+#endif  // __ARM_NEON__
--- a/src/operators/math/depthwise_conv3x3_int8_arm64.cpp
+++ b/src/operators/math/depthwise_conv3x3_int8_arm64.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#if defined(__ARM_NEON__) && defined(__aarch64__)
-#include "operators/math/depthwise_conv3x3.h"
-#ifdef __ARM_NEON__
-#include <arm_neon.h>
-#endif
-namespace paddle_mobile {
-namespace operators {
-namespace math {
-// template<>
-// void DepthwiseConv3x3<int8_t, int32_t>(
-//     const framework::Tensor *input, const framework::Tensor *filter,
-//     const std::vector<int> &strides, framework::Tensor *output) {
-//   PADDLE_MOBILE_THROW_EXCEPTION(
-//       "Depthwise conv with generic strides has not been implemented.");
-// }
-template <>
-void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
-                                         const framework::Tensor &filter,
-                                         const std::vector<int> &paddings,
-                                         framework::Tensor *output) {
-  PADDLE_MOBILE_THROW_EXCEPTION(
-      "Depthwise conv3x3 with stride 1 for arm v8 has not been implemented.");
-}
-template <>
-void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
-                                         const framework::Tensor &filter,
-                                         const std::vector<int> &paddings,
-                                         framework::Tensor *output) {
-  PADDLE_MOBILE_THROW_EXCEPTION(
-      "Depthwise conv3x3 with stride 2 for arm v8 has not been implemented.");
-}
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
--- a/src/operators/math/depthwise_conv5x5.h
+++ b/src/operators/math/depthwise_conv5x5.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "framework/tensor.h"
-#include "operators/math/conv_func.h"
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/math/gemm/pack_kernel.h
+++ b/src/operators/math/gemm/pack_kernel.h
--- a/src/operators/math/math_func_neon.h
+++ b/src/operators/math/math_func_neon.h
--- a/src/operators/math/softmax.cpp
+++ b/src/operators/math/softmax.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <algorithm>
 #include <limits>
 #include "common/types.h"
-#include "operators/math/math_func_neon.h"
+#include "operators/math/math.h"
 namespace paddle_mobile {
 namespace operators {