Commit 982531ef authored by Ray Liu, committed by GitHub

Merge pull request #1344 from hjchen2/dev-latest

Refactor pooling and dequant fusion implementation, fix some code style
...@@ -16,9 +16,9 @@ limitations under the License. */
#ifdef ENABLE_EXCEPTION
#include <stdio.h>
#include <stdlib.h>
#include <exception>
#include <string>
#endif
namespace paddle_mobile {
......
...@@ -24,7 +24,6 @@ const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8 = "fusion_conv_add_relu_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
...@@ -32,7 +31,6 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FC_INT8 = "fusion_fc_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
...@@ -41,6 +39,7 @@ const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform";
const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu";
const char *G_OP_TYPE_RELU6 = "relu6";
const char *G_OP_TYPE_RESHAPE = "reshape";
const char *G_OP_TYPE_RESHAPE2 = "reshape2";
const char *G_OP_TYPE_SIGMOID = "sigmoid";
...@@ -73,9 +72,14 @@ const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
const char *G_OP_TYPE_FUSION_DEQUANT_BN = "fusion_dequant_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT =
"fusion_dequant_add_bn_quant";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT =
"fusion_dequant_add_bn_relu_quant";
const char *G_OP_TYPE_TANH = "tanh"; const char *G_OP_TYPE_TANH = "tanh";
const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu"; const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
...@@ -91,6 +95,7 @@ std::unordered_map< ...@@ -91,6 +95,7 @@ std::unordered_map<
{G_OP_TYPE_PRELU, {{"X", "Alpha"}, {"Out"}}}, {G_OP_TYPE_PRELU, {{"X", "Alpha"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_RELU6, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}}, {G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
...@@ -112,13 +117,11 @@ std::unordered_map< ...@@ -112,13 +117,11 @@ std::unordered_map<
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}}, {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_FC_INT8, {{"X", "Y", "Z", "Scale"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}}, {G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}}, {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}, {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
...@@ -142,9 +145,14 @@ std::unordered_map< ...@@ -142,9 +145,14 @@ std::unordered_map<
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}}, {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}, {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Y"}}}, {G_OP_TYPE_FUSION_DEQUANT_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}}, {G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}}, {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT,
{{"X", "Scale"}, {"Out", "OutScale"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT,
{{"X", "Scale"}, {"Out", "OutScale"}}},
{G_OP_TYPE_TANH, {{"X"}, {"Out"}}}, {G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}},
......
...@@ -87,10 +87,24 @@ enum PMStatus {
};
enum RoundType {
ROUND_UNK = 0,
ROUND_NEAREST_AWAY_ZERO = 1,
ROUND_NEAREST_TOWARDS_ZERO = 2,
ROUND_NEAREST_TO_EVEN = 3
};
enum RoundType {
ROUND_NEAREST_AWAY_ZERO = 0,
ROUND_NEAREST_TOWARDS_ZERO = 1,
ROUND_NEAREST_TO_EVEN = 2
};
enum ActivationType {
IDENTITY = 0,
RELU = 1,
RELU6 = 2,
PRELU = 3,
LEAKY_RELU = 4,
TANH = 5,
SIGMOID = 6,
};
enum PoolingType {
MAX = 0,
AVG = 1,
};
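The renumbered RoundType enum drops the old ROUND_UNK entry, and the quantization kernels later in this diff dispatch on these three modes. For reference only, the following minimal sketch (not part of the patch; round_scalar is a hypothetical helper that assumes the RoundType enum above) shows the scalar rounding behaviour each mode corresponds to:

// Illustrative sketch only: scalar behaviour of the three RoundType modes.
// round_scalar is a hypothetical helper, not code from this patch.
#include <cmath>
#include <cstdint>

inline int32_t round_scalar(float x, RoundType r) {
  switch (r) {
    case ROUND_NEAREST_AWAY_ZERO:     // 2.5 -> 3, -2.5 -> -3
      return static_cast<int32_t>(x > 0.f ? x + 0.5f : x - 0.5f);
    case ROUND_NEAREST_TOWARDS_ZERO:  // 2.5 -> 2, -2.5 -> -2 (plain cast)
      return static_cast<int32_t>(x);
    case ROUND_NEAREST_TO_EVEN:       // 2.5 -> 2, 3.5 -> 4, assuming the
      return static_cast<int32_t>(std::nearbyint(x));  // default FE_TONEAREST mode
    default:
      return static_cast<int32_t>(x);
  }
}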
extern const char *G_OP_TYPE_CONV;
...@@ -99,11 +113,9 @@ extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FC_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
...@@ -116,6 +128,7 @@ extern const char *G_OP_TYPE_MULTICLASS_NMS;
extern const char *G_OP_TYPE_POOL2D;
extern const char *G_OP_TYPE_PRIOR_BOX;
extern const char *G_OP_TYPE_RELU;
extern const char *G_OP_TYPE_RELU6;
extern const char *G_OP_TYPE_RESHAPE;
extern const char *G_OP_TYPE_SIGMOID;
extern const char *G_OP_TYPE_SOFTMAX;
...@@ -140,9 +153,12 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT;
extern const char *G_OP_TYPE_TANH;
extern const char *G_OP_TYPE_FUSION_DECONV_RELU;
......
...@@ -302,7 +302,15 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost;
if (ops[i]->Type() == "conv2d") {
auto inputs = ops[i]->Inputs();
auto *filter = framework::GetVarValue<framework::LoDTensor>(
"Filter", inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
_tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
} else {
_tp[ops[i]->Type()] += timeCost;
}
}
printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>;
...@@ -372,6 +380,14 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
if (ops[i]->Type() == "conv2d") {
auto inputs = ops[i]->Inputs();
auto input_keys = ops[i]->GetInputKeys();
auto *filter = framework::GetVarValue<framework::LoDTensor>(
input_keys[1], inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
printf("kernel size: %d\n", kernel_size);
}
_tp[ops[i]->Type()] += timeCost;
}
printf("====================[ profile ]======================\n");
......
...@@ -191,6 +191,7 @@ LOAD_OP2(mul, CPU, MALI_GPU);
#endif
#ifdef RELU_OP
LOAD_OP2(relu, CPU, MALI_GPU);
LOAD_OP1(relu6, CPU);
#endif
#ifdef IM2SEQUENCE_OP
LOAD_OP1(im2sequence, CPU);
...@@ -233,6 +234,10 @@ LOAD_OP1(quantize, CPU);
#ifdef DEQUANT_OP
LOAD_OP1(dequantize, CPU);
#endif
#ifdef FUSION_DEQUANT_BN_OP
LOAD_OP1(fusion_dequant_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_bn);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_OP
LOAD_OP1(fusion_dequant_add_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn);
...@@ -245,3 +250,11 @@ LOAD_FUSION_MATCHER(fusion_dequant_bn_relu);
LOAD_OP1(fusion_dequant_add_bn_relu, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
LOAD_OP1(fusion_dequant_add_bn_quant, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_quant);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
LOAD_OP1(fusion_dequant_add_bn_relu_quant, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant);
#endif
...@@ -98,24 +98,6 @@ class OpRegistry {
}
};
#define REGISTER_OPERATOR_INT8(op_type, op_class, device_name, device_type) \
template class op_class<device_type, int8_t>; \
template <typename Dtype, typename T> \
class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \
public: \
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class); \
}; \
static paddle_mobile::framework::OperatorRegistrar< \
device_type, _OpClass_##op_type##_##device_name<device_type, int8_t>> \
__op_registrar_##op_type##_##device_name(#op_type); \
int TouchOpRegistrar_##op_type##_##device_name() { \
__op_registrar_##op_type##_##device_name.Touch(); \
return 0; \
}
#define REGISTER_OPERATOR_CPU_INT8(op_type, op_class) \
REGISTER_OPERATOR_INT8(op_type, op_class, cpu, paddle_mobile::CPU);
#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \
template class op_class<device_type, float>; \
template <typename Dtype, typename T> \
......
...@@ -220,7 +220,16 @@ void Node::Folder(
}
} else {
for (auto &op_output : this->op_desc_->outputs_) {
op_desc->outputs_.emplace(op_output.first, op_output.second);
auto output_key = op_output.first;
if (change->find(this->type_) != change->end()) {
const auto change_pairs = (*change)[this->type_];
for (const auto &target : change_pairs) {
if (target.first == output_key) {
output_key = target.second;
}
}
}
op_desc->outputs_.emplace(output_key, op_output.second);
}
for (auto &output : this->outputs_) {
......
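The Node::Folder change above now passes each output key of the folded node through the same `change` map that already renames input keys, so a fused op can expose, for example, batch norm's "Y" output as its own "Out". A minimal standalone sketch of that lookup (the names and the ChangeMap alias are illustrative, not the framework's actual types):

// Illustrative only: how the added lookup renames an output key when the
// folded node's op type has an entry in the change map (e.g. {"Y", "Out"}).
#include <map>
#include <string>
#include <utility>
#include <vector>

using ChangeMap =
    std::map<std::string, std::vector<std::pair<std::string, std::string>>>;

std::string RenameOutputKey(const std::string &node_type,
                            const std::string &output_key,
                            const ChangeMap &change) {
  auto it = change.find(node_type);
  if (it == change.end()) return output_key;  // no remapping for this op type
  for (const auto &target : it->second) {
    if (target.first == output_key) return target.second;
  }
  return output_key;
}
// Example: RenameOutputKey("batch_norm", "Y", {{"batch_norm", {{"Y", "Out"}}}})
// returns "Out".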
...@@ -143,12 +143,10 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
int t1 = 1;
int t2 = 1;
for (int i = 0; i < m * k; ++i) {
unsigned int seed = 100;
a[i] = t1 + rand_r(&seed) % t2;
a[i] = t1 + rand() % t2;  // NOLINT
}
for (int i = 0; i < k * n; ++i) {
unsigned int seed = 200;
b[i] = t1 + rand_r(&seed) % t2;
b[i] = t1 + rand() % t2;  // NOLINT
}
paddle_mobile::operators::math::Gemm gemm;
auto time1 = paddle_mobile::time();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include "operators/fusion_conv_add_relu_int8_op.h"
#include <vector>
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluInt8Op<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_conv_add_relu_int8,
ops::FusionConvAddReluInt8Op);
#endif
#endif // FUSION_CONVADDRELU_INT8_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionConvAddReluInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>> {
public:
FusionConvAddReluInt8Op(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
...@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_add_bn_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
...@@ -43,7 +43,8 @@ class FusionDequantAddBNMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
...@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
...@@ -44,7 +44,8 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
...@@ -54,7 +55,7 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
template <typename DeviceType, typename T>
class FusionDequantAddBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluOp(const std::string &type,
...@@ -63,7 +64,7 @@ class FusionDequantAddBNReluOp
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
......
...@@ -12,50 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionFcInt8Op<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims);
/// (1,2,3,4) , x_num_col_dims = 2 -> (2,12)
auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
assert(x_mat_dims[1] == y_mat_dims[0]);
std::vector<int64_t> output_dims;
output_dims.reserve(
static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
for (int i = 0; i < x_num_col_dims; ++i) {
output_dims.push_back(x_dims[i]);
}
for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
output_dims.push_back(y_dims[i]);
}
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op);
#endif
#endif  // FUSION_FC_INT8_OP
#include "operators/fusion_dequant_add_bn_relu_quant_op.h"
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDequantAddBNReluQuantOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant,
ops::FusionDequantAddBNReluQuantMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_relu_quant,
ops::FusionDequantAddBNReluQuantOp);
#endif
#endif  // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDequantAddBNQuantOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_quant,
ops::FusionDequantAddBNQuantMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_quant,
ops::FusionDequantAddBNQuantOp);
#endif
#endif  // FUSION_DEQUANT_ADD_BN_QUANT_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
class FusionDequantAddBNReluQuantMatcher : public framework::FusionOpMatcher {
public:
FusionDequantAddBNReluQuantMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU) >
std::make_shared<framework::Node>(G_OP_TYPE_QUANTIZE);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT; }
};
template <typename DeviceType, typename T>
class FusionDequantAddBNReluQuantOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluQuantOp(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
class FusionDequantAddBNQuantMatcher : public framework::FusionOpMatcher {
public:
FusionDequantAddBNQuantMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_QUANTIZE);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT; }
};
template <typename DeviceType, typename T>
class FusionDequantAddBNQuantOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>> {
public:
FusionDequantAddBNQuantOp(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
} // namespace operators
} // namespace paddle_mobile
...@@ -12,28 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_BN_RELU_OP
#include "operators/fusion_dequant_bn_relu_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu,
ops::FusionDequantBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
#endif
#endif
#include "operators/fusion_dequant_bn_op.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_BN_OP
template <typename Dtype, typename T>
void FusionDequantBNOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
#endif  // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename Dtype, typename T>
void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
#endif  // FUSION_DEQUANT_BN_RELU_OP
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef FUSION_DEQUANT_BN_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn, ops::FusionDequantBNMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn, ops::FusionDequantBNOp);
#endif  // PADDLE_MOBILE_CPU
#endif  // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu,
ops::FusionDequantBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
#endif  // PADDLE_MOBILE_CPU
#endif  // FUSION_DEQUANT_BN_RELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_BN_RELU_OP)
class FusionDequantBNMatcher : public framework::FusionOpMatcher {
public:
FusionDequantBNMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
}
virtual void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() override { return G_OP_TYPE_FUSION_DEQUANT_BN; }
};
#endif // FUSION_DEQUANT_BN_OP || FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_BN_OP
template <typename DeviceType, typename T>
class FusionDequantBNOp : public framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>> {
public:
FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
class FusionDequantBNReluMatcher : public FusionDequantBNMatcher {
public:
FusionDequantBNReluMatcher() : FusionDequantBNMatcher() {
node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
virtual std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionDequantBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>> {
public:
FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
...@@ -42,7 +42,8 @@ class FusionDequantBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionFcInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>> {
public:
FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_FC_INT8_OP
...@@ -32,20 +32,6 @@ void ConvAddReluKernel<CPU, float>::Compute(
}
template class ConvAddReluKernel<CPU, float>;
#ifdef FUSION_CONVADDRELU_INT8_OP
template <>
bool ConvAddReluKernel<CPU, int8_t>::Init(FusionConvAddReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddReluKernel<CPU, int8_t>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<int8_t, int32_t>(param);
}
template class ConvAddReluKernel<CPU, int8_t>;
#endif
}  // namespace operators
}  // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "operators/kernel/dequant_add_bn_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
// elementwise add params
const Tensor *bias = param->bias_;
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *bias_ptr = bias->data<float>();
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
}
return true;
}
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
const int32_t *input = param.input_->data<int32_t>();
const float *bn_scale = param.bn_scale_->data<float>();
const float *bn_bias = param.bn_bias_->data<float>();
// dequantize params
const float activation_scale = param.activation_scale_->data<float>()[0];
const float weight_scale = param.weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param.output_->mutable_data<float>();
int batch_size = param.input_->dims()[0];
int channels = param.input_->dims()[1];
size_t spatial_size = param.input_->dims()[2] * param.input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
float scale = bn_scale[c] * dequant_scale;
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = scale * x[k] + bias;
}
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_DEQUANT_ADD_BN_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param->output_->mutable_data<float>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
float scale = bn_scale[c] * dequant_scale;
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __zero = vdupq_n_f32(0.f);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = vmaxq_f32(__zero, f0);
f1 = vmaxq_f32(__zero, f1);
f2 = vmaxq_f32(__zero, f2);
f3 = vmaxq_f32(__zero, f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = std::max(scale * x[k] + bias, 0.f);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNReluParam<CPU> *param) {
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = bn_bias_ptr[c] - inv_scale * mean_ptr[c];
}
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNReluParam<CPU> *param) {
// elementwise add params
const Tensor *bias = param->bias_;
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *bias_ptr = bias->data<float>();
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
}
return true;
}
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/math/activation.h"
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
const framework::Tensor *bias) {
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = 1.f / (std::sqrt(var_ptr[c] + epsilon));
float val = bias ? bias->data<float>()[c] : 0;
bn_bias_ptr[c] =
inv_scale * bn_scale_ptr[c] * (val - mean_ptr[c]) + bn_bias_ptr[c];
bn_scale_ptr[c] = inv_scale * bn_scale_ptr[c];
}
}
#endif
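PublicFusionDequantBNInitParam folds the batch-norm statistics (and the optional elementwise-add bias) into one per-channel scale/bias pair, so the compute kernels below only apply y = bn_scale * x + bn_bias per element. A scalar restatement of that algebra, for clarity only (a sketch under the same definitions, not code from the patch):

// Illustrative only: the scalar form of the folding done in
// PublicFusionDequantBNInitParam. Struct and function names are hypothetical.
#include <cmath>

struct FoldedBN {
  float scale;  // gamma / sqrt(variance + epsilon)
  float bias;   // gamma * (elementwise_bias - mean) / sqrt(variance + epsilon) + beta
};

inline FoldedBN FoldBatchNorm(float gamma, float beta, float mean,
                              float variance, float epsilon,
                              float elementwise_bias /* 0 if absent */) {
  float inv_std = 1.f / std::sqrt(variance + epsilon);
  FoldedBN folded;
  folded.scale = gamma * inv_std;
  folded.bias = gamma * inv_std * (elementwise_bias - mean) + beta;
  return folded;
}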
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
template <ActivationType Act>
void DequantBNCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param->output_->mutable_data<float>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
// not fuse bn and dequant scale to minimize precision difference
// float scale = bn_scale[c] * dequant_scale;
float scale = bn_scale[c];
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmulq_f32(__dequant_scale, f0);
f1 = vmulq_f32(__dequant_scale, f1);
f2 = vmulq_f32(__dequant_scale, f2);
f3 = vmulq_f32(__dequant_scale, f3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32<Act>(f0);
f1 = math::vActiveq_f32<Act>(f1);
f2 = math::vActiveq_f32<Act>(f2);
f3 = math::vActiveq_f32<Act>(f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_OP
template <>
bool FusionDequantBNKernel<CPU, float>::Init(FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNKernel<CPU, float>::Compute(
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<IDENTITY>(&param);
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<RELU>(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_OP
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<IDENTITY>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<RELU>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
#if defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <ActivationType Act, RoundType R>
void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
// quantize params
Tensor *output_scale = param->online_scale_;
float max_abs = 0.f;
int8_t *output = param->output_->mutable_data<int8_t>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
// if (param->is_static_) {
if (true) {
max_abs = param->static_scale_;
float quant_scale = 127.f / max_abs;
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
// not fuse bn and dequant scale to minimize precision difference
// float scale = bn_scale[c] * dequant_scale;
float scale = bn_scale[c];
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
int8_t *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __quant_scale = vdupq_n_f32(quant_scale);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmulq_f32(__dequant_scale, f0);
f1 = vmulq_f32(__dequant_scale, f1);
f2 = vmulq_f32(__dequant_scale, f2);
f3 = vmulq_f32(__dequant_scale, f3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32<Act>(f0);
f1 = math::vActiveq_f32<Act>(f1);
f2 = math::vActiveq_f32<Act>(f2);
f3 = math::vActiveq_f32<Act>(f3);
f0 = vmulq_f32(__quant_scale, f0);
f1 = vmulq_f32(__quant_scale, f1);
f2 = vmulq_f32(__quant_scale, f2);
f3 = vmulq_f32(__quant_scale, f3);
int32x4_t q0 = math::vRoundq_f32<R>(f0);
int32x4_t q1 = math::vRoundq_f32<R>(f1);
int32x4_t q2 = math::vRoundq_f32<R>(f2);
int32x4_t q3 = math::vRoundq_f32<R>(f3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
int16x4_t d3 = vmovn_s32(q3);
int16x8_t q5 = vcombine_s16(d0, d1);
int16x8_t q6 = vcombine_s16(d2, d3);
int8x8_t d5 = vmovn_s16(q5);
int8x8_t d6 = vmovn_s16(q6);
vst1_s8(y, d5);
vst1_s8(y + 8, d6);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
float x_temp =
math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
y[k] = math::Round<R>(x_temp * quant_scale);
}
}
}
} else {
// TODO(hjchen2)
max_abs = std::max(max_abs, 1e-6f);
}
param->online_scale_->mutable_data<float>()[0] = max_abs;
}
template <>
bool FusionDequantAddBNQuantKernel<CPU, float>::Init(
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
}
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
template <>
bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNReluQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<RELU, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<RELU, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<RELU, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
} // namespace operators
} // namespace paddle_mobile
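
The fused kernel above collapses dequantization, the folded batch-norm affine transform, the optional activation and the re-quantization into a single pass over the int32 accumulators. A minimal scalar reference of the per-element math, handy when checking the NEON path, could look like the sketch below; the function name and the fixed round-half-away-from-zero are illustrative only, since the kernel actually dispatches on param.round_type_.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of one element of dequant + BN + (ReLU) + requantize.
// x             : int32 accumulator from the int8 convolution
// dequant_scale : activation_scale / weight_scale, as in the kernel
// bn_scale/bias : per-channel batch-norm scale and bias
// max_abs       : static output scale, so quant_scale = 127 / max_abs
inline int8_t DequantBNRequantRef(int32_t x, float dequant_scale,
                                  float bn_scale, float bn_bias,
                                  float max_abs, bool relu) {
  float v = bn_scale * (dequant_scale * static_cast<float>(x)) + bn_bias;
  if (relu) {
    v = std::max(v, 0.f);
  }
  return static_cast<int8_t>(std::round(v * (127.f / max_abs)));
}
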
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/quantize_kernel.h" #include "operators/kernel/quantize_kernel.h"
#include <cmath> #include <cmath>
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h> #include <arm_neon.h>
...@@ -32,81 +33,68 @@ inline float32_t vmaxvq_f32(float32x4_t r) { ...@@ -32,81 +33,68 @@ inline float32_t vmaxvq_f32(float32x4_t r) {
} }
#endif #endif
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO> template <RoundType R>
inline int32x4_t vround_f32(float32x4_t r) { inline void QuantizeOffline(const Tensor *input, const float scale,
return vcvtq_s32_f32(r); const float max_abs, Tensor *output) {
} const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
template <> size_t remain = input->numel();
inline int32x4_t vround_f32<ROUND_NEAREST_AWAY_ZERO>(float32x4_t r) { #if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4_t plus = vdupq_n_f32(0.5); size_t loop = remain >> 4;
float32x4_t minus = vdupq_n_f32(-0.5); remain = remain & 0xF;
float32x4_t zero = vdupq_n_f32(0); float32x4_t __scale = vdupq_n_f32(scale);
uint32x4_t more_than_zero = vcgtq_f32(r, zero); float32x4_t __postive_max = vdupq_n_f32(max_abs);
float32x4_t temp = vbslq_f32(more_than_zero, plus, minus); float32x4_t __negtive_max = vdupq_n_f32(-max_abs);
temp = vaddq_f32(r, temp); #pragma omp parallel for
int32x4_t ret = vcvtq_s32_f32(temp); for (size_t i = 0; i < loop; ++i) {
return ret; const float *local_x = x + (i << 4);
} int8_t *local_y = y + (i << 4);
float32x4_t r0 = vld1q_f32(local_x);
template <> float32x4_t r1 = vld1q_f32(local_x + 4);
inline int32x4_t vround_f32<ROUND_NEAREST_TO_EVEN>(float32x4_t r) { float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t point5 = vdupq_n_f32(0.5); float32x4_t r3 = vld1q_f32(local_x + 12);
int32x4_t one = vdupq_n_s32(1); r0 = vmaxq_f32(vminq_f32(r0, __postive_max), __negtive_max);
int32x4_t zero = vdupq_n_s32(0); r1 = vmaxq_f32(vminq_f32(r1, __postive_max), __negtive_max);
r2 = vmaxq_f32(vminq_f32(r2, __postive_max), __negtive_max);
int32x4_t rnd = vround_f32<ROUND_NEAREST_AWAY_ZERO>(r); r3 = vmaxq_f32(vminq_f32(r3, __postive_max), __negtive_max);
float32x4_t frnd = vcvtq_f32_s32(rnd); r0 = vmulq_f32(r0, __scale);
frnd = vsubq_f32(frnd, r); r1 = vmulq_f32(r1, __scale);
frnd = vabsq_f32(frnd); r2 = vmulq_f32(r2, __scale);
uint32x4_t equal_point5 = vceqq_f32(frnd, point5); r3 = vmulq_f32(r3, __scale);
int32x4_t abs_rnd = vabsq_s32(rnd); int32x4_t q0 = math::vRoundq_f32<R>(r0);
abs_rnd = vandq_s32(abs_rnd, one); int32x4_t q1 = math::vRoundq_f32<R>(r1);
uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd); int32x4_t q2 = math::vRoundq_f32<R>(r2);
uint32x4_t mask = vandq_u32(equal_point5, not_mod2); int32x4_t q3 = math::vRoundq_f32<R>(r3);
uint32x4_t more_than_zero = vcgtq_s32(rnd, zero); int16x4_t d0 = vmovn_s32(q0);
more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one)); int16x4_t d1 = vmovn_s32(q1);
mask = veorq_u32(more_than_zero, mask); int16x4_t d2 = vmovn_s32(q2);
more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one)); int16x4_t d3 = vmovn_s32(q3);
mask = vaddq_u32(more_than_zero, mask); int16x8_t q5 = vcombine_s16(d0, d1);
int32x4_t smask = vreinterpretq_s32_u32(mask); int16x8_t q6 = vcombine_s16(d2, d3);
smask = vsubq_s32(smask, one); int8x8_t d5 = vmovn_s16(q5);
rnd = vaddq_s32(rnd, smask); int8x8_t d6 = vmovn_s16(q6);
return rnd; vst1_s8(local_y, d5);
} vst1_s8(local_y + 8, d6);
}
x += (loop << 4);
y += (loop << 4);
#endif #endif
for (size_t i = 0; i < remain; ++i) {
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO> float x_temp = std::max(std::min(x[i], max_abs), -max_abs);
inline int8_t Round(const float &x) { y[i] = math::Round<R>(x_temp * scale);
return static_cast<int8_t>(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_AWAY_ZERO>(const float &x) {
return std::round(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
float v = std::round(x);
int32_t q = static_cast<int32_t>(v);
if (std::abs(std::abs(q - v) - 0.5) <= 0) {
if (std::abs(q) % 2 != 0) {
q = q + ((q > 0) ? -1 : 1);
}
} }
return static_cast<int8_t>(q);
} }
template <RoundType R> template <RoundType R>
static void Quantize(const Tensor *input, const float scale, Tensor *output) { inline void QuantizeOnline(const Tensor *input, const float scale,
Tensor *output) {
const float *x = input->data<const float>(); const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>(); int8_t *y = output->mutable_data<int8_t>();
size_t remain = input->numel(); size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4; size_t loop = remain >> 4;
remain = remain & 0xF; remain = remain & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
#pragma omp parallel for #pragma omp parallel for
for (size_t i = 0; i < loop; ++i) { for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4); const float *local_x = x + (i << 4);
...@@ -115,14 +103,14 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) { ...@@ -115,14 +103,14 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
float32x4_t r1 = vld1q_f32(local_x + 4); float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8); float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12); float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = vmulq_n_f32(r0, scale); r0 = vmulq_f32(r0, __scale);
r1 = vmulq_n_f32(r1, scale); r1 = vmulq_f32(r1, __scale);
r2 = vmulq_n_f32(r2, scale); r2 = vmulq_f32(r2, __scale);
r3 = vmulq_n_f32(r3, scale); r3 = vmulq_f32(r3, __scale);
int32x4_t q0 = vround_f32<R>(r0); int32x4_t q0 = math::vRoundq_f32<R>(r0);
int32x4_t q1 = vround_f32<R>(r1); int32x4_t q1 = math::vRoundq_f32<R>(r1);
int32x4_t q2 = vround_f32<R>(r2); int32x4_t q2 = math::vRoundq_f32<R>(r2);
int32x4_t q3 = vround_f32<R>(r3); int32x4_t q3 = math::vRoundq_f32<R>(r3);
int16x4_t d0 = vmovn_s32(q0); int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1); int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2); int16x4_t d2 = vmovn_s32(q2);
...@@ -138,7 +126,18 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) { ...@@ -138,7 +126,18 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
y += (loop << 4); y += (loop << 4);
#endif #endif
for (size_t i = 0; i < remain; ++i) { for (size_t i = 0; i < remain; ++i) {
y[i] = Round<R>(x[i] * scale); y[i] = math::Round<R>(x[i] * scale);
}
}
template <RoundType R>
static void Quantize(const Tensor *input, const float max_abs,
const bool offline, Tensor *output) {
float scale = 127.f / max_abs;
if (offline) {
QuantizeOffline<R>(input, scale, max_abs, output);
} else {
QuantizeOnline<R>(input, scale, output);
} }
} }
...@@ -173,6 +172,13 @@ float find_abs_max(const Tensor *input) { ...@@ -173,6 +172,13 @@ float find_abs_max(const Tensor *input) {
return max_abs; return max_abs;
} }
} // namespace operators
} // namespace paddle_mobile
#endif // __ARM_NEON__
namespace paddle_mobile {
namespace operators {
template <> template <>
bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) { bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
return true; return true;
...@@ -184,24 +190,23 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) { ...@@ -184,24 +190,23 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
Tensor *output = param.output_; Tensor *output = param.output_;
Tensor *output_scale = param.online_scale_; Tensor *output_scale = param.online_scale_;
float max_abs = 0.f; float max_abs = 0.f;
if (param.is_static_) { if (param.offline_) {
max_abs = param.static_scale_; max_abs = param.offline_scale_->data<float>()[0];
} else { } else {
max_abs = find_abs_max(input); max_abs = find_abs_max(input);
} }
max_abs = std::max(max_abs, 1e-6f); max_abs = std::max(max_abs, 1e-6f);
// only support int8 currently
float scale = 127 / max_abs;
param.online_scale_->mutable_data<float>()[0] = max_abs; param.online_scale_->mutable_data<float>()[0] = max_abs;
switch (param.round_type_) { switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN: case ROUND_NEAREST_TO_EVEN:
Quantize<ROUND_NEAREST_TO_EVEN>(input, scale, output); Quantize<ROUND_NEAREST_TO_EVEN>(input, max_abs, param.offline_, output);
break; break;
case ROUND_NEAREST_TOWARDS_ZERO: case ROUND_NEAREST_TOWARDS_ZERO:
Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, scale, output); Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, max_abs, param.offline_,
output);
break; break;
case ROUND_NEAREST_AWAY_ZERO: case ROUND_NEAREST_AWAY_ZERO:
Quantize<ROUND_NEAREST_AWAY_ZERO>(input, scale, output); Quantize<ROUND_NEAREST_AWAY_ZERO>(input, max_abs, param.offline_, output);
break; break;
default: default:
LOG(kLOG_ERROR) << "round type is not supported."; LOG(kLOG_ERROR) << "round type is not supported.";
...@@ -212,4 +217,4 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) { ...@@ -212,4 +217,4 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif // QUANT_OP
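
As a reading aid, the offline and online paths above differ only in whether the input is saturated to the pre-computed scale before rounding. The scalar sketch below restates that; the names are illustrative, and std::round stands in for the configurable round type read from param.offline_scale_ / param.round_type_.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Offline: max_abs comes from a pre-computed scale, so inputs are clipped
// into [-max_abs, max_abs] before being scaled to int8.
inline int8_t QuantizeOfflineRef(float x, float max_abs) {
  float clipped = std::max(std::min(x, max_abs), -max_abs);
  return static_cast<int8_t>(std::round(clipped * (127.f / max_abs)));
}

// Online: max_abs is the measured |x| maximum of the current tensor, so no
// clipping is needed before scaling.
inline int8_t QuantizeOnlineRef(float x, float max_abs) {
  return static_cast<int8_t>(std::round(x * (127.f / max_abs)));
}
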
...@@ -15,11 +15,56 @@ limitations under the License. */ ...@@ -15,11 +15,56 @@ limitations under the License. */
#ifdef RELU_OP #ifdef RELU_OP
#include "operators/kernel/relu_kernel.h" #include "operators/kernel/relu_kernel.h"
#include "operators/kernel/central-arm-func/relu_arm_func.h" #include "common/types.h"
#include "operators/math/activation.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Dtype, ActivationType Act>
struct ReluCompute {
void operator()(const Tensor *input, Tensor *output) {}
};
template <ActivationType Act>
struct ReluCompute<float, Act> {
void operator()(const Tensor *input, Tensor *output) {
const float *x = input->data<float>();
float *y = output->mutable_data<float>();
size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4;
remain = remain & 0xF;
#pragma omp parallel for
for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4);
float *local_y = y + (i << 4);
float32x4_t r0 = vld1q_f32(local_x);
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = math::vActiveq_f32<Act>(r0);
r1 = math::vActiveq_f32<Act>(r1);
r2 = math::vActiveq_f32<Act>(r2);
r3 = math::vActiveq_f32<Act>(r3);
vst1q_f32(local_y, r0);
vst1q_f32(local_y + 4, r1);
vst1q_f32(local_y + 8, r2);
vst1q_f32(local_y + 12, r3);
}
x += (loop << 4);
y += (loop << 4);
#endif
for (size_t i = 0; i < remain; ++i) {
y[i] = math::Active<Act>(x[i]);
}
}
};
template <> template <>
bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) { bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
return true; return true;
...@@ -27,7 +72,21 @@ bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) { ...@@ -27,7 +72,21 @@ bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
template <> template <>
void ReluKernel<CPU, float>::Compute(const ReluParam<CPU> &param) { void ReluKernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
ReluCompute<float>(param); const Tensor *input = param.InputX();
Tensor *output = param.Out();
ReluCompute<float, RELU>()(input, output);
}
template <>
bool Relu6Kernel<CPU, float>::Init(ReluParam<CPU> *param) {
return true;
}
template <>
void Relu6Kernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
const Tensor *input = param.InputX();
Tensor *output = param.Out();
ReluCompute<float, RELU6>()(input, output);
} }
} // namespace operators } // namespace operators
......
...@@ -11,14 +11,111 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,14 +11,111 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef TRANSPOSE2_OP #ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h" #include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
bool IsShuffleChannel(const std::vector<int> &axis) {
bool is_shuffle_channel = true;
if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) {
for (int i = 3; i < axis.size(); ++i) {
if (axis[i] != i) {
is_shuffle_channel = false;
break;
}
}
} else {
return false;
}
return is_shuffle_channel;
}
template <typename Dtype>
void ShuffleChannelCompute(const Transpose2Param<CPU> &param) {
const std::vector<int> &axis = param.Axis();
const Tensor *input = param.InputX();
const Dtype *input_ptr = input->data<Dtype>();
Tensor *output = param.Out();
Dtype *output_ptr = output->mutable_data<Dtype>();
  // The input and output shapes must have rank >= 2 and <= 6.
const framework::DDim &in_dim = input->dims();
const framework::DDim &out_dim = output->dims();
size_t offset = 1;
for (int i = 3; i < axis.size(); ++i) {
offset *= in_dim[i];
}
#pragma omp parallel for collapse(3)
for (int batch = 0; batch < out_dim[0]; ++batch) {
for (int c1 = 0; c1 < out_dim[1]; ++c1) {
for (int c2 = 0; c2 < out_dim[2]; ++c2) {
size_t out_offset =
((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset;
size_t in_offset = ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset;
memcpy(output_ptr + out_offset, input_ptr + in_offset,
offset * sizeof(Dtype));
}
}
}
}
template <typename Dtype>
void Transpose2Compute(const Transpose2Param<CPU> &param) {
const std::vector<int> &axis = param.Axis();
const Tensor *input = param.InputX();
const Dtype *input_ptr = input->data<Dtype>();
Tensor *output = param.Out();
Dtype *output_ptr = output->mutable_data<Dtype>();
  // The input and output shapes must have rank >= 2 and <= 6.
const framework::DDim &in_dim = input->dims();
const framework::DDim &out_dim = output->dims();
// precompute inverted output dim and strides
size_t rout_dim[6], strides[6];
  int permute = axis.size();  // permute must be >= 2 and <= 6.
for (int i = 0; i < permute; ++i) {
int k = permute - 1 - i;
strides[k] = 1;
for (int j = axis[i] + 1; j < permute; ++j) {
strides[k] *= in_dim[j];
}
rout_dim[k] = out_dim[i];
}
// unroll the first 2 dimensions
int reamin_dim = 1;
for (int i = 2; i < out_dim.size(); ++i) {
reamin_dim *= out_dim[i];
}
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < out_dim[0]; ++batch) {
for (int j = 0; j < out_dim[1]; ++j) {
size_t offset = batch * strides[permute - 1] + j * strides[permute - 2];
Dtype *out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim;
int indics[4] = {0, 0, 0, 0};
for (int k = 0; k < reamin_dim; ++k) {
out_ptr[k] = input_ptr[offset];
indics[0] += 1;
offset += strides[0];
for (int p = 0; p < permute - 3; ++p) {
if (indics[p] == rout_dim[p]) {
indics[p + 1] += 1;
indics[p] = 0;
offset += strides[p + 1];
offset -= rout_dim[p] * strides[p];
} else {
break;
}
}
}
}
}
}
template <> template <>
bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) { bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
return true; return true;
...@@ -26,10 +123,24 @@ bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) { ...@@ -26,10 +123,24 @@ bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
template <> template <>
void Transpose2Kernel<CPU, float>::Compute(const Transpose2Param<CPU> &param) { void Transpose2Kernel<CPU, float>::Compute(const Transpose2Param<CPU> &param) {
Transpose2Compute<float>(param); const std::vector<int> &axis = param.Axis();
bool shuffle_channel = IsShuffleChannel(axis);
if (shuffle_channel) {
if (param.InputX()->type() == typeid(int8_t)) {
ShuffleChannelCompute<int8_t>(param);
} else {
ShuffleChannelCompute<float>(param);
}
} else {
if (param.InputX()->type() == typeid(int8_t)) {
Transpose2Compute<int8_t>(param);
} else {
Transpose2Compute<float>(param);
}
}
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif // TRANSPOSE2_OP
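
The shuffle-channel fast path triggers when axis is (0, 2, 1, 3, ...), i.e. only the two leading channel dimensions are swapped, so every inner block stays contiguous and can be moved with one memcpy. A standalone sketch of that index mapping (the name shuffle_channel_ref is illustrative, not library code):

#include <cstddef>
#include <cstring>

// Copies a tensor of shape (N, C1, C2, inner) into shape (N, C2, C1, inner),
// which is what axis = {0, 2, 1, 3} asks for. Each inner-sized block is
// contiguous in both layouts, hence the memcpy per block.
template <typename T>
void shuffle_channel_ref(const T *in, T *out, int n, int c1, int c2,
                         size_t inner) {
  for (int b = 0; b < n; ++b) {
    for (int i = 0; i < c2; ++i) {    // output channel-1 index
      for (int j = 0; j < c1; ++j) {  // output channel-2 index
        size_t dst = ((static_cast<size_t>(b) * c2 + i) * c1 + j) * inner;
        size_t src = ((static_cast<size_t>(b) * c1 + j) * c2 + i) * inner;
        std::memcpy(out + dst, in + src, inner * sizeof(T));
      }
    }
  }
}
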
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
void ConvAddBasic(const FusionConvAddParam<CPU> &param) { void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
...@@ -106,9 +107,9 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) { ...@@ -106,9 +107,9 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false, math::matmul<float, float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(1), false, biase_data); static_cast<float>(1), false, biase_data);
} }
} }
} }
......
...@@ -25,24 +25,18 @@ limitations under the License. */ ...@@ -25,24 +25,18 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename P, typename S> template <typename Itype, typename Otype>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
Tensor bias = *param.Bias(); Tensor bias = *param.Bias();
int32_t axis = param.Axis(); int32_t axis = param.Axis();
S *bias_data = bias.data<S>(); Otype *bias_data = bias.data<Otype>();
Tensor *output = param.Output(); Tensor *output = param.Output();
output->mutable_data<P>(); output->mutable_data<Otype>();
float alpha = 1.0f; float alpha = 1.0f;
float beta = 1.0f; float beta = 1.0f;
#ifdef FUSION_CONVADDRELU_INT8_OP
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
#endif
int32_t groups = param.Groups(); int32_t groups = param.Groups();
std::vector<int32_t> strides = param.Strides(); std::vector<int32_t> strides = param.Strides();
std::vector<int32_t> paddings = param.Paddings(); std::vector<int32_t> paddings = param.Paddings();
...@@ -70,7 +64,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { ...@@ -70,7 +64,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor col; Tensor col;
Tensor col_matrix; Tensor col_matrix;
if (is_expand) { if (is_expand) {
col.mutable_data<P>(col_shape); col.mutable_data<Itype>(col_shape);
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
...@@ -89,8 +83,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { ...@@ -89,8 +83,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups; int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups; int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, P> vol2col; math::Vol2ColFunctor<CPU, Itype> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col; math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
for (int32_t i = 0; i < batch_size; i++) { for (int32_t i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
...@@ -118,8 +112,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { ...@@ -118,8 +112,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, false, col_matrix, false, alpha, &out_slice, math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false, alpha,
beta, true, bias_data); &out_slice, beta, true, bias_data);
} }
} }
} }
......
...@@ -106,9 +106,10 @@ inline void GemmConv(const ConvParam<CPU> &param) { ...@@ -106,9 +106,10 @@ inline void GemmConv(const ConvParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, false, col_matrix, false, math::matmul<Itype, Otype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(0), static_cast<float>(1), &out_slice,
false, static_cast<Otype *>(nullptr)); static_cast<float>(0), false,
static_cast<Otype *>(nullptr));
} }
} }
} }
......
...@@ -93,8 +93,8 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) { ...@@ -93,8 +93,8 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) {
Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, true, in_slice, false, static_cast<P>(1.0), math::matmul<P, P>(filter_slice, true, in_slice, false,
&col_matrix, static_cast<P>(0.0)); static_cast<P>(1.0), &col_matrix, static_cast<P>(0.0));
if (data_dim == 2U) { if (data_dim == 2U) {
col2im(col, dilations, strides, col2im(col, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0], std::vector<int>{paddings[0], paddings[1], paddings[0],
......
...@@ -23,20 +23,15 @@ limitations under the License. */ ...@@ -23,20 +23,15 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename P, typename S> template <typename Itype, typename Otype>
void FusionFcCompute(const FusionFcParam<CPU> &param) { void FusionFcCompute(const FusionFcParam<CPU> &param) {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY(); const Tensor *input_y = param.InputY();
Tensor *input_z = param.InputZ(); Tensor *input_z = param.InputZ();
S *input_z_data = input_z->data<S>(); Otype *input_z_data = input_z->data<Otype>();
int axis = param.Axis(); int axis = param.Axis();
Tensor *out = param.Out(); Tensor *out = param.Out();
// int m = out->dims()[0]; auto *out_data = out->mutable_data<Itype>();
// int n = out->dims()[1];
auto *out_data = out->mutable_data<P>();
float alpha = 1.0f;
float beta = 1.0f;
const Tensor x_matrix = const Tensor x_matrix =
input_x->dims().size() > 2 input_x->dims().size() > 2
...@@ -57,28 +52,14 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) { ...@@ -57,28 +52,14 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. "); PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
  if (std::is_same<P, int8_t>::value) { // bias_data's dimension matches the second dimension of out
#ifdef FUSION_FC_INT8_OP int64_t classes = input_z->numel();
alpha = param.InputScale()->data<float>()[0]; for (int i = 0; i < out_dim[0]; i++) {
beta = 0.0f; memory::Copy(out_data + i * classes, input_z_data, sizeof(Otype) * classes);
math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, false,
input_z_data, true);
#endif
} else {
    // bias_data's dimension matches the second dimension of out
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data,
sizeof(float) * classes);
}
math::matmul<float>(x_matrix, false, y_matrix, false, alpha, out, beta,
false);
} }
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); math::matmul<Itype, Otype>(x_matrix, false, y_matrix, false,
// if (out_dim.size() != 2) { static_cast<float>(1), out, static_cast<float>(1),
// out->Resize(out_dim); false);
// }
} }
} // namespace operators } // namespace operators
......
...@@ -73,14 +73,14 @@ void MulCompute(const MulParam<CPU> &param) { ...@@ -73,14 +73,14 @@ void MulCompute(const MulParam<CPU> &param) {
} }
if (param.InputX()->type() == typeid(int8_t)) { if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>(); out->mutable_data<int32_t>();
math::matmul<float, int32_t>(x_matrix, false, y_matrix, false, math::matmul<int8_t, int32_t>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out, static_cast<float>(1), out,
static_cast<float>(0)); static_cast<float>(0));
} else { } else {
out->mutable_data<float>(); out->mutable_data<float>();
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), math::matmul<float, float>(x_matrix, false, y_matrix, false,
out, static_cast<float>(0)); static_cast<float>(1), out,
static_cast<float>(0));
} }
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize(out_dim); out->Resize(out_dim);
......
...@@ -17,103 +17,53 @@ limitations under the License. */ ...@@ -17,103 +17,53 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "common/types.h"
#include "operators/math/pooling.h" #include "operators/math/pooling.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using framework::Tensor;
template <typename T, typename S>
void PoolBasic(std::string pooling_type, std::vector<int> ksize,
std::vector<int> strides, std::vector<int> paddings,
const Tensor *in_x, Tensor *out) {
if (pooling_type == "max") {
math::PoolFunctor<CPU, math::MaxPool<T>, T> pool2d_forward;
math::MaxPool<T> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
} else if (pooling_type == "avg") {
math::PoolFunctor<CPU, math::AvgPool<T, S>, T> pool2d_forward;
math::AvgPool<T, S> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
}
}
template <typename P> template <typename P>
void PoolCompute(const PoolParam<CPU> &param) { void PoolCompute(const PoolParam<CPU> &param) {
const Tensor *in_x = param.Input(); const framework::Tensor *input = param.Input();
Tensor *out = param.Output(); framework::Tensor *output = param.Output();
std::string pooling_type = param.PoolingType(); const std::string &pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize(); std::vector<int> ksize = param.Ksize();
std::vector<int> strides = param.Strides(); std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings(); std::vector<int> paddings = param.Paddings();
if (ksize.size() != 2) {
LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "Pool op only supports 2D and 3D input.";
}
if (param.isGlobalPooling()) { if (param.isGlobalPooling()) {
for (size_t i = 0; i < ksize.size(); ++i) { for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0; paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]); ksize[i] = static_cast<int>(input->dims()[i + 2]);
} }
} }
if (in_x->type() == typeid(int8_t)) { if (ksize[0] == 3 && ksize[0] == ksize[1]) {
if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) { if (pooling_type == "max" && strides[0] == strides[1]) {
if (strides[0] == strides[1] && strides[0] == 1) { if (strides[0] == 1) {
math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]); math::Pooling3x3<MAX, 1>()(*input, paddings, output);
} else if (strides[0] == strides[1] && strides[0] == 2) { } else if (strides[0] == 2) {
math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]); math::Pooling3x3<MAX, 2>()(*input, paddings, output);
} else { } else {
math::Pool3x3Max_int8(strides, paddings, in_x, out); math::Pooling<MAX>()(*input, ksize, strides, paddings, output);
}
} else if (pooling_type == "avg" && strides[0] == strides[1]) {
if (strides[0] == 1) {
math::Pooling3x3<AVG, 1>()(*input, paddings, output);
} else if (strides[0] == 2) {
math::Pooling3x3<AVG, 2>()(*input, paddings, output);
} else {
math::Pooling<AVG>()(*input, ksize, strides, paddings, output);
} }
} else { } else {
PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x, // Others
out);
} }
} else { } else {
if (ksize[0] == 3 && ksize[0] == ksize[1]) { if (pooling_type == "max") {
if (pooling_type == "max") { math::Pooling<MAX>()(*input, ksize, strides, paddings, output);
if (strides[0] == strides[1] && strides[0] == 1 && } else if (pooling_type == "avg") {
paddings[0] == paddings[1] && paddings[1] == 1) { math::Pooling<AVG>()(*input, ksize, strides, paddings, output);
math::Pool3x3Maxs1p1(in_x, out);
} else {
math::Pool3x3Max(strides, paddings, in_x, out);
}
} else if (pooling_type == "avg") {
if (strides[0] == strides[1] && strides[0] == 1 &&
paddings[0] == paddings[1] && paddings[1] == 1) {
math::Pool3x3Avgs1p1(in_x, out);
} else {
math::Pool3x3Avg(strides, paddings, in_x, out);
}
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
strides[0] == strides[1] && paddings[0] == paddings[1] &&
paddings[1] == 0) {
#if __ARM_NEON
#if __aarch64__
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
#else
/// todo: fix bug in Pool2x2
if (pooling_type == "max") {
math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
}
#endif
#else
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
#endif // __ARM_NEON
} else { } else {
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x, // Others
out);
} }
} }
} }
......
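
The refactored dispatch above routes 3x3 pooling with stride 1 or 2 to the specialized math::Pooling3x3 kernels and everything else to the generic math::Pooling functor. For orientation, a plain scalar 3x3 max pool over one feature map — roughly what the specialized kernels compute, without the NEON vectorization — can be sketched as follows (illustrative code, not the library implementation):

#include <algorithm>
#include <limits>
#include <vector>

// Naive 3x3 max pooling over a single (H, W) feature map with the given
// stride and symmetric padding; out-of-range taps are skipped. The output
// size follows the usual floor((H + 2*pad - 3) / stride) + 1 rule.
std::vector<float> max_pool3x3_ref(const std::vector<float> &in, int h, int w,
                                   int stride, int pad) {
  int oh = (h + 2 * pad - 3) / stride + 1;
  int ow = (w + 2 * pad - 3) / stride + 1;
  std::vector<float> out(static_cast<size_t>(oh) * ow);
  for (int i = 0; i < oh; ++i) {
    for (int j = 0; j < ow; ++j) {
      float m = -std::numeric_limits<float>::infinity();
      for (int ki = 0; ki < 3; ++ki) {
        for (int kj = 0; kj < 3; ++kj) {
          int y = i * stride - pad + ki;
          int x = j * stride - pad + kj;
          if (y >= 0 && y < h && x >= 0 && x < w) {
            m = std::max(m, in[static_cast<size_t>(y) * w + x]);
          }
        }
      }
      out[static_cast<size_t>(i) * ow + j] = m;
    }
  }
  return out;
}
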
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include <operators/math/transform.h>
#include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <typename T>
struct ReluFunctor {
inline T operator()(T in) const { return in > 0 ? in : 0; }
};
/*
 * @b Platform-specific implementation; param is passed in from the op layer
* */
template <typename P>
void ReluCompute(const ReluParam<CPU> &param) {
const auto *input_x = param.InputX();
auto *input_x_ptr = input_x->data<float>();
auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#if __aarch64__
if (numel > 0) {
int loop = numel >> 0x4;
int remain = numel & 0xF;
float32x4_t zero = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t r0 = vld1q_f32(input_x_ptr);
float32x4_t r1 = vld1q_f32(input_x_ptr + 4);
float32x4_t r2 = vld1q_f32(input_x_ptr + 8);
float32x4_t r3 = vld1q_f32(input_x_ptr + 12);
r0 = vmaxq_f32(r0, zero);
r1 = vmaxq_f32(r1, zero);
r2 = vmaxq_f32(r2, zero);
r3 = vmaxq_f32(r3, zero);
vst1q_f32(out_ptr, r0);
vst1q_f32(out_ptr + 4, r1);
vst1q_f32(out_ptr + 8, r2);
vst1q_f32(out_ptr + 12, r3);
input_x_ptr += 16;
out_ptr += 16;
}
for (int i = 0; i < remain; ++i) {
out_ptr[i] = (input_x_ptr[i] > 0) * input_x_ptr[i];
}
#else
if (numel > 64) {
asm volatile(
"pld [%[input_x_ptr], #0] \n\t"
"vmov.f32 q8, #0.0 \n\t"
"subs %[num], %[num], #32 \n\t"
"blt end_num_%= \n\t"
"loop_num_%=: \n\t"
"pld [%[input_x_ptr], #1024] \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"subs %[num], %[num], #32 \n\t"
"bge loop_num_%= \n\t"
"end_num_%=: \n\t"
"cmp %[num], #0 \n\t"
"bge end_%= \n\t"
"mov r6, #4 \n\t"
"mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
:
:
[out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
"r6");
#endif
} else {
#endif
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
}
#endif
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void Transpose2Compute(const Transpose2Param<CPU>& param) {
const auto* input_x = param.InputX();
const auto input_x_dims = input_x->dims();
auto* out = param.Out();
const auto axis = param.Axis();
const auto* input_x_data = input_x->data<float>();
auto* out_data = out->mutable_data<float>();
size_t ndim = axis.size();
std::vector<int> xdim(ndim);
std::vector<int> xstride(ndim);
std::vector<int> xout(ndim);
for (int i = 0; i < ndim; i++) {
int j = ndim - 1 - i;
xdim[j] = input_x_dims[axis[i]];
xstride[j] = 1;
for (int k = axis[i] + 1; k < ndim; k++) {
xstride[j] *= input_x_dims[k];
}
xout[j] = xstride[j] * xdim[j];
}
auto numel = input_x->numel();
size_t pind = 0;
std::vector<int> ind(ndim);
for (int i = 0; i < numel; i++) {
out_data[i] = input_x_data[pind];
ind[0]++;
pind += xstride[0];
for (int j = 0; j < ndim - 1; j++) {
if (ind[j] == xdim[j]) {
ind[j + 1]++;
ind[j] = 0;
pind += xstride[j + 1];
pind -= xout[j];
} else {
break;
}
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -21,23 +21,6 @@ limitations under the License. */ ...@@ -21,23 +21,6 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
// vector<int> pos;
// template <typename T>
// void TransposeFunc(const int numel, const T* input, const vector<int> axis,
// const vector<int> old_strides, const vector<int>
// new_strides, T* output) {
// for (int i = 0; i < numel; ++i) {
// int old_idx = 0;
// int idx = i;
// for (int j = 0; j < axis.size(); ++j) {
// int order = axis[j];
// old_idx += (idx / new_strides[j]) * old_strides[order];
// idx %= new_strides[j];
// }
// output[i] = input[old_idx];
// }
// }
template <typename P> template <typename P>
void TransposeCompute(const TransposeParam<CPU>& param) { void TransposeCompute(const TransposeParam<CPU>& param) {
const auto* input_x = param.InputX(); const auto* input_x = param.InputX();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionDequantAddBNKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNParam<DeviceType> &param);
bool Init(FusionDequantAddBNParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -20,26 +20,37 @@ limitations under the License. */ ...@@ -20,26 +20,37 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
#define DECLARE_KERNEL(KernelClass, KernelParam) \
template <typename DeviceType, typename T> \
class KernelClass \
: public framework::OpKernelBase<DeviceType, KernelParam<DeviceType>> { \
public: \
bool Init(KernelParam<DeviceType> *param); \
void Compute(const KernelParam<DeviceType> &param); \
};
#ifdef FUSION_DEQUANT_BN_OP
DECLARE_KERNEL(FusionDequantBNKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP #ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename DeviceType, typename T> DECLARE_KERNEL(FusionDequantBNReluKernel, FusionDequantBNParam);
class FusionDequantBNReluKernel #endif
: public framework::OpKernelBase<DeviceType,
FusionDequantBNReluParam<DeviceType>> { #ifdef FUSION_DEQUANT_ADD_BN_OP
public: DECLARE_KERNEL(FusionDequantAddBNKernel, FusionDequantAddBNParam);
void Compute(const FusionDequantBNReluParam<DeviceType> &param);
bool Init(FusionDequantBNReluParam<DeviceType> *param);
};
#endif #endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP #ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <typename DeviceType, typename T> DECLARE_KERNEL(FusionDequantAddBNReluKernel, FusionDequantAddBNParam);
class FusionDequantAddBNReluKernel #endif
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNReluParam<DeviceType>> { #ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
public: DECLARE_KERNEL(FusionDequantAddBNQuantKernel, FusionDequantAddBNQuantParam);
void Compute(const FusionDequantAddBNReluParam<DeviceType> &param); #endif
bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
}; #ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
DECLARE_KERNEL(FusionDequantAddBNReluQuantKernel, FusionDequantAddBNQuantParam);
#endif #endif
} // namespace operators } // namespace operators
......
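
The DECLARE_KERNEL macro above is purely mechanical; for instance, DECLARE_KERNEL(FusionDequantAddBNKernel, FusionDequantAddBNParam) expands to the same class shape that the deleted headers spelled out by hand:

template <typename DeviceType, typename T>
class FusionDequantAddBNKernel
    : public framework::OpKernelBase<DeviceType,
                                     FusionDequantAddBNParam<DeviceType>> {
 public:
  bool Init(FusionDequantAddBNParam<DeviceType> *param);
  void Compute(const FusionDequantAddBNParam<DeviceType> &param);
};
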
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using namespace framework;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class FeedKernel class FeedKernel
: public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> { : public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
......
...@@ -17,7 +17,6 @@ limitations under the License. */ ...@@ -17,7 +17,6 @@ limitations under the License. */
#pragma once #pragma once
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -30,6 +29,15 @@ class ReluKernel ...@@ -30,6 +29,15 @@ class ReluKernel
void Compute(const ReluParam<DeviceType>& param); void Compute(const ReluParam<DeviceType>& param);
bool Init(ReluParam<DeviceType>* param); bool Init(ReluParam<DeviceType>* param);
}; };
template <typename DeviceType, typename T>
class Relu6Kernel
: public framework::OpKernelBase<DeviceType, ReluParam<DeviceType>> {
public:
void Compute(const ReluParam<DeviceType>& param);
bool Init(ReluParam<DeviceType>* param);
};
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,9 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <math.h>
#include <algorithm>
#include <cmath>
#include <string> #include <string>
#include "common/enforce.h" #include "common/enforce.h"
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "operators/math/math_func_neon.h"
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -24,68 +32,92 @@ namespace math { ...@@ -24,68 +32,92 @@ namespace math {
#define SIGMOID_THRESHOLD_MAX 13.0 #define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0 #define EXP_MAX_INPUT 40.0
enum ActivationType {
kSigmoid,
kReLU,
kTanh,
kIdentity,
};
inline ActivationType GetActivationType(const std::string &type) { inline ActivationType GetActivationType(const std::string &type) {
if (type == "sigmoid") { if (type == "sigmoid") {
return ActivationType::kSigmoid; return ActivationType::SIGMOID;
} else if (type == "relu") { } else if (type == "relu") {
return ActivationType::kReLU; return ActivationType::RELU;
} else if (type == "tanh") { } else if (type == "tanh") {
return ActivationType::kTanh; return ActivationType::TANH;
} else if (type == "identity" || type == "") { } else if (type == "identity" || type == "") {
return ActivationType::kIdentity; return ActivationType::IDENTITY;
} }
PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type."); PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type.");
} }
namespace forward { #if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <ActivationType Act = IDENTITY>
inline float32x4_t vActiveq_f32(const float32x4_t &x) {
return x;
}
template <typename T> template <>
T Identity(const T a) { inline float32x4_t vActiveq_f32<RELU>(const float32x4_t &x) {
return a; float32x4_t __zero = vdupq_n_f32(0.f);
return vmaxq_f32(x, __zero);
} }
template <typename T> template <>
T Relu(const T a) { inline float32x4_t vActiveq_f32<RELU6>(const float32x4_t &x) {
return a > static_cast<T>(0.0) ? a : static_cast<T>(0.0); float32x4_t __zero = vdupq_n_f32(0.f);
float32x4_t __six = vdupq_n_f32(6.f);
return vminq_f32(vmaxq_f32(x, __zero), __six);
} }
template <typename T> template <>
T Sigmoid(const T a) { inline float32x4_t vActiveq_f32<SIGMOID>(const float32x4_t &x) {
const T min = SIGMOID_THRESHOLD_MIN; float32x4_t __one = vdupq_n_f32(1.f);
const T max = SIGMOID_THRESHOLD_MAX; float32x4_t __x = vnegq_f32(x);
T tmp = (a < min) ? min : ((a > max) ? max : a); __x = exp_ps(__x);
return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-tmp)); __x = vaddq_f32(__x, __one);
float32x4_t __out = vrecpeq_f32(__x);
return vmulq_f32(vrecpsq_f32(__x, __out), __out);
} }
template <typename T> template <>
T Tanh(const T a) { inline float32x4_t vActiveq_f32<TANH>(const float32x4_t &x) {
T tmp = -2.0 * a; float32x4_t __one = vdupq_n_f32(1.f);
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; float32x4_t __x = vnegq_f32(x);
return (2.0 / (1.0 + exp(tmp))) - 1.0; __x = vmulq_n_f32(__x, 2.f);
__x = exp_ps(__x);
__x = vaddq_f32(__x, __one);
float32x4_t __out = vrecpeq_f32(__x);
__out = vmulq_f32(vrecpsq_f32(__x, __out), __out);
__out = vmulq_n_f32(__out, 2.f);
return vsubq_f32(__out, __one);
} }
#endif
} // namespace forward template <ActivationType Act = IDENTITY>
inline float Active(const float &x) {
return x;
}
template <typename T> template <>
struct Active { inline float Active<RELU>(const float &x) {
typedef T (*Act)(T); return std::max(x, 0.f);
}; }
static Active<float>::Act kActFloat[] = { template <>
&forward::Sigmoid<float>, &forward::Relu<float>, &forward::Tanh<float>, inline float Active<RELU6>(const float &x) {
&forward::Identity<float>}; return std::min(std::max(x, 0.f), 6.f);
}
namespace forward { template <>
inline float activation(float a, int index) { return kActFloat[index](a); } inline float Active<SIGMOID>(const float &x) {
// float tmp = x > SIGMOID_THRESHOLD_MAX ? SIGMOID_THRESHOLD_MAX : x;
// tmp = x > SIGMOID_THRESHOLD_MIN ? x : SIGMOID_THRESHOLD_MIN;
// return 1.f / (1.f + exp(-tmp));
return 1.f / (1.f + exp(-x));
}
} // namespace forward template <>
inline float Active<TANH>(const float &x) {
// float tmp = -2.f * x;
// tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
// return (2.f / (1.f + exp(tmp))) - 1.f;
return 2.f / (1.f + exp(-2.f * x)) - 1.f;
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
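
The NEON specializations above approximate the reciprocal in 1 / (1 + e^-x) with vrecpeq_f32 plus one vrecpsq_f32 Newton-Raphson step, and the TANH path reuses the identity tanh(x) = 2 * sigmoid(2x) - 1, so both should track the scalar Active<> overloads closely. A small host-side sanity check of that identity (illustrative; it exercises only the scalar math, not the NEON path):

#include <cassert>
#include <cmath>

// Scalar references mirroring the Active<SIGMOID> and Active<TANH> overloads.
inline float SigmoidRef(float x) { return 1.f / (1.f + std::exp(-x)); }
inline float TanhRef(float x) { return 2.f / (1.f + std::exp(-2.f * x)) - 1.f; }

int main() {
  for (float x = -8.f; x <= 8.f; x += 0.25f) {
    // tanh(x) == 2 * sigmoid(2x) - 1, the identity the TANH path relies on.
    assert(std::fabs(TanhRef(x) - (2.f * SigmoidRef(2.f * x) - 1.f)) < 1e-5f);
  }
  return 0;
}
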
This diff has been collapsed.
...@@ -105,16 +105,15 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -105,16 +105,15 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *c, float *C, int ldc, float *p, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
/*
// Vector-matrix multiplication (M = 1) // Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu); bool relu);
/*
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float int lda, const float *B, int ldb, float beta, float
*C, int ldc, bool relu, float *new_scale, float *new_bias); *C, int ldc, bool relu, float *new_scale, float *new_bias);
*/ */
// Compute a smaller C matrix block // Compute a smaller C matrix block
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
...@@ -149,7 +148,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -149,7 +148,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1); float *new_scale, float *new_bias, float *bias1);
/*
// Write back the vector-matrix multiplication result // Write back the vector-matrix multiplication result
// C = A * B // C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc); void VecWriteBasic(int n, float *c, float *C, int ldc);
...@@ -159,13 +157,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -159,13 +157,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void VecWriteWithAdd(int n, float *c, float *C, int ldc); void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc); void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C) /*
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, // C = A * B, batchnorm(C)
float *new_bias); void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
// C = A * B, batchnorm(C), relu(C) float *new_bias);
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, // C = A * B, batchnorm(C), relu(C)
float *new_bias); void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float
*/ *new_scale, float *new_bias);
*/
// 32-bit float matrix multiplication // 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
...@@ -392,7 +391,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -392,7 +391,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
packedB_int8 = static_cast<int8_t *>( packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__ #if __aarch64__
// TODO() // TODO(paddle mobile)
#else #else
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8); PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8);
#endif #endif
...@@ -414,7 +413,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -414,7 +413,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
packedA_int8 = static_cast<int8_t *>( packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC)); paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__ #if __aarch64__
// TODO() // TODO(paddle mobile)
#else #else
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8); PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8);
#endif #endif
...@@ -438,7 +437,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -438,7 +437,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int8_t *local_A = packedA_int8 + MC * KC * local_threads; int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads; int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__ #if __aarch64__
// TODO() // TODO(paddle mobile)
#else #else
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A); PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif #endif
...@@ -468,7 +467,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -468,7 +467,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int8_t *local_B = packedB_int8 + KC * NC * local_threads; int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads; int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__ #if __aarch64__
// TODO() // TODO(paddle mobile)
#else #else
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B); PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B);
#endif #endif
......
...@@ -11,13 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,13 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef GRU_OP #ifdef GRU_OP
#include "operators/math/gru_compute.h" #include "operators/math/gru_compute.h"
#include "common/types.h" #include "common/types.h"
#include "operators/math/activation_functions.h" #include "operators/math/activation.h"
#include "operators/math/gemm.h" #include "operators/math/gemm.h"
#include "operators/math/gru_cpu_kernel.h" #include "operators/math/gru_cpu_kernel.h"
#include "operators/math/gru_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -43,8 +44,7 @@ struct GRUUnitFunctor<CPU, T> { ...@@ -43,8 +44,7 @@ struct GRUUnitFunctor<CPU, T> {
#endif #endif
} }
forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size, forward_reset_output(value, frame_size, batch_size, active_gate);
batch_size, active_gate);
if (value.prev_out_value) { if (value.prev_out_value) {
#ifdef _OPENMP #ifdef _OPENMP
...@@ -60,8 +60,7 @@ struct GRUUnitFunctor<CPU, T> { ...@@ -60,8 +60,7 @@ struct GRUUnitFunctor<CPU, T> {
#endif #endif
} }
forward_final_output(forward::gru_finalOutput<T>(), value, frame_size, forward_final_output(value, frame_size, batch_size, active_node);
batch_size, active_node);
} }
}; };
......
...@@ -11,7 +11,7 @@ limitations under the License. */ ...@@ -11,7 +11,7 @@ limitations under the License. */
#ifdef GRU_OP #ifdef GRU_OP
#pragma once #pragma once
#include "operators/math/activation_functions.h" #include "operators/math/activation.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -11,21 +11,22 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,21 +11,22 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef GRU_OP #ifdef GRU_OP
#pragma once #pragma once
#include <type_traits> #include <type_traits>
#include "operators/math/activation_functions.h" #include "operators/math/activation.h"
#include "operators/math/gru_compute.h" #include "operators/math/gru_compute.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
template <class OpResetOutput, typename T> template <typename T, ActivationType Act>
void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, void hl_naive_gru_forward_reset_output(T *gate_value, T *reset_output_value,
T *gate_value, T *reset_output_value, T *prev_output_value, int frame_size) {
T *prev_output_value, int frame_size,
ActivationType active_gate) {
T r_value_update_gate; T r_value_update_gate;
T r_value_reset_gate; T r_value_reset_gate;
T r_value_reset_output; T r_value_reset_output;
...@@ -33,27 +34,57 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, ...@@ -33,27 +34,57 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
T *update_gate = gate_value; T *update_gate = gate_value;
T *reset_gate = gate_value + frame_size; T *reset_gate = gate_value + frame_size;
for (int i = 0; i < frame_size; i++) { int remain = frame_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain >> 3;
remain = remain & 0x7;
float32x4_t prev0 = vdupq_n_f32(0.f);
float32x4_t prev1 = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t update0 = vld1q_f32(update_gate);
float32x4_t update1 = vld1q_f32(update_gate + 4);
float32x4_t reset0 = vld1q_f32(reset_gate);
float32x4_t reset1 = vld1q_f32(reset_gate + 4);
if (prev_output_value) {
prev0 = vld1q_f32(prev_output_value);
prev1 = vld1q_f32(prev_output_value + 4);
prev_output_value += 8;
}
update0 = vActiveq_f32<Act>(update0);
update1 = vActiveq_f32<Act>(update1);
reset0 = vActiveq_f32<Act>(reset0);
reset1 = vActiveq_f32<Act>(reset1);
float32x4_t output0 = vmulq_f32(prev0, reset0);
float32x4_t output1 = vmulq_f32(prev1, reset1);
vst1q_f32(update_gate, update0);
vst1q_f32(update_gate + 4, update1);
vst1q_f32(reset_gate, reset0);
vst1q_f32(reset_gate + 4, reset1);
vst1q_f32(reset_output_value, output0);
vst1q_f32(reset_output_value + 4, output1);
update_gate += 8;
reset_gate += 8;
reset_output_value += 8;
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; i++) {
r_value_update_gate = update_gate[i]; r_value_update_gate = update_gate[i];
r_value_reset_gate = reset_gate[i]; r_value_reset_gate = reset_gate[i];
if (prev_output_value) { if (prev_output_value) {
r_prev_out = prev_output_value[i]; r_prev_out = prev_output_value[i];
} }
r_value_update_gate = Active<Act>(r_value_update_gate);
op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, r_value_reset_gate = Active<Act>(r_value_reset_gate);
&r_value_reset_output, active_gate); r_value_reset_output = r_prev_out * r_value_reset_gate;
update_gate[i] = r_value_update_gate; update_gate[i] = r_value_update_gate;
reset_gate[i] = r_value_reset_gate; reset_gate[i] = r_value_reset_gate;
reset_output_value[i] = r_value_reset_output; reset_output_value[i] = r_value_reset_output;
} }
} }
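Both the NEON block and the scalar tail above perform the same per-element reset-output step; with the loop's variable names:

// u            = Act(u);        // activate the update gate in place
// r            = Act(r);        // activate the reset gate in place
// reset_output = h_prev * r;    // gated previous hidden state (0 when prev_output_value is null)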
template <class OpFinalOutput, typename T> template <typename T, ActivationType Act>
void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, void hl_naive_gru_forward_final_output(T *gate_value, T *prev_output_value,
T *gate_value, T *prev_output_value, T *output_value, int frame_size) {
T *output_value, int frame_size,
ActivationType active_node) {
T r_value_update_gate; T r_value_update_gate;
T r_value_frame_state; T r_value_frame_state;
T r_prev_out = 0; T r_prev_out = 0;
...@@ -61,30 +92,73 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, ...@@ -61,30 +92,73 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
T *update_gate = gate_value; T *update_gate = gate_value;
T *frame_state = gate_value + frame_size * 2; T *frame_state = gate_value + frame_size * 2;
for (int i = 0; i < frame_size; i++) { int remain = frame_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain >> 3;
remain = remain & 0x7;
float32x4_t prev0 = vdupq_n_f32(0.f);
float32x4_t prev1 = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t update0 = vld1q_f32(update_gate);
float32x4_t update1 = vld1q_f32(update_gate + 4);
float32x4_t state0 = vld1q_f32(frame_state);
float32x4_t state1 = vld1q_f32(frame_state + 4);
if (prev_output_value) {
prev0 = vld1q_f32(prev_output_value);
prev1 = vld1q_f32(prev_output_value + 4);
prev_output_value += 8;
}
state0 = vActiveq_f32<Act>(state0);
state1 = vActiveq_f32<Act>(state1);
float32x4_t output0 = vmlsq_f32(prev0, update0, prev0);
float32x4_t output1 = vmlsq_f32(prev1, update1, prev1);
output0 = vmlaq_f32(output0, update0, state0);
output1 = vmlaq_f32(output1, update1, state1);
vst1q_f32(frame_state, state0);
vst1q_f32(frame_state + 4, state1);
vst1q_f32(output_value, output0);
vst1q_f32(output_value + 4, output1);
update_gate += 8;
frame_state += 8;
output_value += 8;
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; i++) {
r_value_update_gate = update_gate[i]; r_value_update_gate = update_gate[i];
r_value_frame_state = frame_state[i]; r_value_frame_state = frame_state[i];
if (prev_output_value) { if (prev_output_value) {
r_prev_out = prev_output_value[i]; r_prev_out = prev_output_value[i];
} }
r_value_frame_state = Active<Act>(r_value_frame_state);
op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, r_output = r_prev_out - r_value_update_gate * r_prev_out +
&r_output, active_node); r_value_update_gate * r_value_frame_state;
frame_state[i] = r_value_frame_state; frame_state[i] = r_value_frame_state;
output_value[i] = r_output; output_value[i] = r_output;
} }
} }
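The vmlsq_f32 / vmlaq_f32 pair computes exactly what the scalar tail does. Per element:

// c = Act(c);                        // activate the candidate (frame state) in place
// h = h_prev - u * h_prev + u * c;   // equivalently (1 - u) * h_prev + u * c
// vmlsq_f32(prev, update, prev)  ->  prev - update * prev
// vmlaq_f32(acc,  update, state) ->  acc  + update * state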
template <class OpResetOutput, typename T> #define FORWARD_RESET_OUTPUT(active_type, value, frame_size) \
inline void forward_reset_output(OpResetOutput op_reset_output, hl_naive_gru_forward_reset_output<float, active_type>( \
GRUMetaValue<T> value, int frame_size, value.gate_value, value.reset_output_value, value.prev_out_value, \
int batch_size, ActivationType active_gate) { frame_size);
for (int b = 0; b < batch_size; b++) {
hl_naive_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate);
template <typename T>
inline void forward_reset_output(GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; ++b) {
switch (active_node) {
case RELU:
FORWARD_RESET_OUTPUT(RELU, value, frame_size);
break;
case SIGMOID:
FORWARD_RESET_OUTPUT(SIGMOID, value, frame_size);
break;
case TANH:
FORWARD_RESET_OUTPUT(TANH, value, frame_size);
break;
default:
FORWARD_RESET_OUTPUT(IDENTITY, value, frame_size);
}
value.gate_value += frame_size * 3; value.gate_value += frame_size * 3;
value.reset_output_value += frame_size; value.reset_output_value += frame_size;
if (value.prev_out_value) { if (value.prev_out_value) {
...@@ -93,15 +167,27 @@ inline void forward_reset_output(OpResetOutput op_reset_output, ...@@ -93,15 +167,27 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
} }
} }
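The switch exists to turn the runtime ActivationType into a compile-time template argument, so Active<Act> and vActiveq_f32<Act> can be inlined inside the NEON loop instead of branching per element. For example, FORWARD_RESET_OUTPUT(SIGMOID, value, frame_size) expands to:

hl_naive_gru_forward_reset_output<float, SIGMOID>(
    value.gate_value, value.reset_output_value, value.prev_out_value, frame_size);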
template <class OpFinalOutput, typename T> #define FORWARD_FINAL_OUTPUT(active_type, value, frame_size) \
inline void forward_final_output(OpFinalOutput op_final_output, hl_naive_gru_forward_final_output<float, active_type>( \
GRUMetaValue<T> value, int frame_size, value.gate_value, value.prev_out_value, value.output_value, frame_size)
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; b++) {
hl_naive_gru_forward_final_output(op_final_output, value.gate_value,
value.prev_out_value, value.output_value,
frame_size, active_node);
template <typename T>
inline void forward_final_output(GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; ++b) {
switch (active_node) {
case RELU:
FORWARD_FINAL_OUTPUT(RELU, value, frame_size);
break;
case SIGMOID:
FORWARD_FINAL_OUTPUT(SIGMOID, value, frame_size);
break;
case TANH:
FORWARD_FINAL_OUTPUT(TANH, value, frame_size);
break;
default:
FORWARD_FINAL_OUTPUT(IDENTITY, value, frame_size);
}
value.gate_value += frame_size * 3; value.gate_value += frame_size * 3;
value.output_value += frame_size; value.output_value += frame_size;
if (value.prev_out_value) { if (value.prev_out_value) {
...@@ -113,4 +199,5 @@ inline void forward_final_output(OpFinalOutput op_final_output, ...@@ -113,4 +199,5 @@ inline void forward_final_output(OpFinalOutput op_final_output,
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRU_OP
#pragma once
#include <type_traits>
#include "operators/math/activation_functions.h"
namespace paddle_mobile {
namespace operators {
namespace math {
namespace forward {
template <typename T>
class gru_resetOutput {
public:
void operator()(T *value_update_gate, T *value_reset_gate, T *prev_out,
T *value_reset_output, ActivationType act_gate) {
*value_update_gate = activation(*value_update_gate, act_gate);
*value_reset_gate = activation(*value_reset_gate, act_gate);
*value_reset_output = (*prev_out) * (*value_reset_gate);
}
};
template <typename T>
class gru_finalOutput {
public:
void operator()(T *value_update_gate, T *value_frame_state, T *prev_out,
T *value_output, ActivationType act_input) {
*value_frame_state = activation(*value_frame_state, act_input);
*value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
((*value_update_gate) * (*value_frame_state));
}
};
} // namespace forward
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
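These per-element functors are the pre-refactor counterparts of the templated kernels above; the arithmetic is identical. A small worked example, under the assumption that activation(x, IDENTITY) simply returns x:

forward::gru_finalOutput<float> final_op;
float u = 0.5f, c = 0.2f, h_prev = 1.0f, h = 0.0f;
final_op(&u, &c, &h_prev, &h, IDENTITY);
// h = h_prev - u * h_prev + u * c = 1.0 - 0.5 + 0.1 = 0.6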
...@@ -41,10 +41,10 @@ void set_constant(framework::Tensor *tensor, float value) { ...@@ -41,10 +41,10 @@ void set_constant(framework::Tensor *tensor, float value) {
} }
template <> template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, void matmul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float beta, bool relu, float alpha, framework::Tensor *matrix_out,
float *bias) { float beta, bool relu, float *bias) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
......
...@@ -24,24 +24,24 @@ namespace math { ...@@ -24,24 +24,24 @@ namespace math {
void set_constant(framework::Tensor *tensor, float value); void set_constant(framework::Tensor *tensor, float value);
template <typename T> template <typename Itype, typename Otype>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, T beta, bool relu = false, framework::Tensor *matrix_out, float beta, bool relu = false,
float *bias = nullptr); Otype *bias = nullptr);
template <typename T, typename S> template <typename Itype, typename Otype>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, T beta, bool relu = false, framework::Tensor *matrix_out, float beta, bool relu, Otype *bias,
S *bias = nullptr, bool addOnRow = false); bool addOnRow);
template <typename T> template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, T beta, bool relu, framework::Tensor *matrix_out, float beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias, framework::Tensor *new_scale, framework::Tensor *new_bias,
int group, float *bias = nullptr); int group, T *bias = nullptr);
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
......
...@@ -22,10 +22,11 @@ namespace operators { ...@@ -22,10 +22,11 @@ namespace operators {
namespace math { namespace math {
template <> template <>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float beta, bool relu, int32_t *bias, float alpha, framework::Tensor *matrix_out,
bool addOnRow) { float beta, bool relu, int32_t *bias,
bool addOnRow) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -93,6 +94,16 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, ...@@ -93,6 +94,16 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
#endif #endif
} }
} }
template <>
void matmul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out,
float beta, bool relu, int32_t *bias) {
matmul<int8_t, int32_t>(matrix_a, trans_a, matrix_b, trans_b, alpha,
matrix_out, beta, relu, bias, false);
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
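The new 9-argument specialization above simply forwards to the full version with addOnRow = false, so existing int8 call sites keep compiling. A hypothetical call site (shapes and tensor names invented for illustration):

framework::Tensor a, b, c;
a.Resize(framework::make_ddim(std::vector<int>{8, 16}));
a.mutable_data<int8_t>();
b.Resize(framework::make_ddim(std::vector<int>{16, 4}));
b.mutable_data<int8_t>();
c.Resize(framework::make_ddim(std::vector<int>{8, 4}));
c.mutable_data<int32_t>();
// int8 inputs, int32 output, alpha = 1, beta = 0, no relu, no bias:
math::matmul<int8_t, int32_t>(a, false, b, false, 1.f, &c, 0.f, false, nullptr);
// equivalent to calling the full overload with addOnRow = false:
// math::matmul<int8_t, int32_t>(a, false, b, false, 1.f, &c, 0.f, false, nullptr, false);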
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "operators/math/pool_2x2.h"
#include <algorithm>
#include <vector>
namespace paddle_mobile {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int w1 = input_width / 16;
int _w1 = input_width % 16;
int w2 = _w1 / 4;
int _w2 = _w1 % 4;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < input_height; ph += 2) {
const float *in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width;
const float *in_ptr2 = in_ptr1 + input_width;
if (ph != input_height && ph + 1 >= input_height) {
in_ptr2 = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * input_width));
memset(static_cast<void *>(const_cast<float *>(in_ptr2)), -FLT_MAX,
sizeof(float) * input_width);
}
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vld1.f32 {q0, q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2, q3}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q6, q7}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q8, q9}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q2 \n\t"
"vmax.f32 q1, q1, q3 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q9 \n\t"
"vpmax.f32 d8, d0, d1 \n\t"
"vpmax.f32 d9, d2, d3 \n\t"
"vpmax.f32 d10, d12, d13 \n\t"
"vpmax.f32 d11, d14, d15 \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q0}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q1}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q1 \n\t"
"vpmax.f32 d4, d0, d1 \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9");
#endif
#endif
if (_w2 != 0) {
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
} else if (_w2 == 2) {
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
*out_ptr = (temp > temp1) ? temp : temp1;
} else if (_w2 == 3) {
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
*out_ptr = (temp > temp1) ? temp : temp1;
out_ptr++;
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
}
}
}
}
}
}
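The inline assembly above consumes 16 (then 4) input columns per iteration; for reference, a plain scalar version of the same 2x2, stride-2 max pooling over one pair of input rows would look like this (a sketch with hypothetical names, ignoring the width tail and odd-height handling done above):

void max_pool_2x2_s2_row(const float *row0, const float *row1,
                         float *out, int input_width) {
  // Each output element is the max over a 2x2 window spanning the two rows.
  for (int ow = 0; ow < input_width / 2; ++ow) {
    const int iw = 2 * ow;
    float m0 = row0[iw] > row1[iw] ? row0[iw] : row1[iw];                  // column-wise max (vmax)
    float m1 = row0[iw + 1] > row1[iw + 1] ? row0[iw + 1] : row1[iw + 1];
    out[ow] = m0 > m1 ? m0 : m1;                                           // pairwise max (vpmax)
  }
}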
void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int w1 = input_width / 16;
int _w1 = input_width % 16;
int w2 = _w1 / 4;
int _w2 = _w1 % 4;
float quarter = 0.25;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < input_height; ph += 2) {
const float *in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width;
const float *in_ptr2 = in_ptr1 + input_width;
if (ph + 1 >= input_height) {
in_ptr2 = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * input_width));
memset(static_cast<void *>(const_cast<float *>(in_ptr2)), 0,
sizeof(float) * input_width);
}
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vmov.f32 d0[0], %[quarter] \n\t"
"vld1.f32 {q1, q2}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q3, q4}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q7, q8}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q9, q10}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q3 \n\t"
"vadd.f32 q2, q2, q4 \n\t"
"vadd.f32 q7, q7, q9 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"vpadd.f32 d10, d2, d3 \n\t"
"vpadd.f32 d11, d4, d5 \n\t"
"vpadd.f32 d12, d14, d15 \n\t"
"vpadd.f32 d13, d16, d17 \n\t"
"vmul.f32 q5, q5, d0[0] \n\t"
"vmul.f32 q6, q6, d0[0] \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q2 \n\t"
"vpadd.f32 d4, d2, d3 \n\t"
"vmul.f32 d4, d4, d0[0] \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr),
[quarter] "r"(quarter)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10");
#endif
#endif
if (_w2 != 0) {
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
} else if (_w2 == 2) {
float temp = 0;
temp += *in_ptr1;
temp += *in_ptr2;
in_ptr1++;
in_ptr2++;
temp += *in_ptr1;
temp += *in_ptr2;
*out_ptr = 0.25 * temp;
} else if (_w2 == 3) {
float temp = 0;
temp += *in_ptr1++;
temp += *in_ptr2++;
temp += *in_ptr1++;
temp += *in_ptr2++;
*out_ptr = 0.25 * temp;
out_ptr++;
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
}
}
}
}
}
}
//}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
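Pool2x2Avgs2p0 mirrors the max-pooling routine, with vadd/vpadd plus a multiply by 0.25 in place of vmax/vpmax. The scalar equivalent per output element (again a hypothetical helper, tails aside):

void avg_pool_2x2_s2_row(const float *row0, const float *row1,
                         float *out, int input_width) {
  for (int ow = 0; ow < input_width / 2; ++ow) {
    const int iw = 2 * ow;
    // Mean of the 2x2 window: sum the four taps, scale by the 0.25 kept in d0[0] above.
    out[ow] = 0.25f * (row0[iw] + row0[iw + 1] + row1[iw] + row1[iw + 1]);
  }
}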
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "framework/tensor.h"
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output);
void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
const Tensor *in_x, Tensor *out);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -22,10 +22,7 @@ namespace operators { ...@@ -22,10 +22,7 @@ namespace operators {
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
void QuantizeOp<DeviceType, T>::InferShape() const { void QuantizeOp<DeviceType, T>::InferShape() const {
auto input_dims = this->param_.input_->dims(); const auto &input_dims = this->param_.input_->dims();
const std::vector<int> &paddings = this->param_.paddings_;
input_dims[2] += 2 * paddings[0];
input_dims[3] += 2 * paddings[1];
this->param_.output_->Resize(input_dims); this->param_.output_->Resize(input_dims);
auto scale_dims = framework::make_ddim(std::vector<int>{1}); auto scale_dims = framework::make_ddim(std::vector<int>{1});
this->param_.online_scale_->Resize(scale_dims); this->param_.online_scale_->Resize(scale_dims);
......
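After this change the quantize op's output keeps exactly the input's shape; the 2*padding growth that used to be baked into InferShape is gone. For an NCHW input of {1, 3, 224, 224}:

// this->param_.output_->Resize(input_dims);   // -> {1, 3, 224, 224}, no padding added
// previously: {1, 3, 224 + 2 * paddings[0], 224 + 2 * paddings[1]}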
This diff is collapsed.
This diff is collapsed.
...@@ -324,10 +324,6 @@ if (NOT FOUND_MATCH) ...@@ -324,10 +324,6 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile) target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-relu-int8-op operators/test_fusion_conv_add_relu_int8_op.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-add-relu-int8-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.