提交 f6a7712f 编写于 作者: H hjchen2

Refactor dequant fusion kernels to support more fusion patterns

上级 f163d146
......@@ -72,6 +72,7 @@ const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
const char *G_OP_TYPE_FUSION_DEQUANT_BN = "fusion_dequant_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu";
......@@ -144,7 +145,8 @@ std::unordered_map<
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Y"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT,
......
......@@ -87,10 +87,19 @@ enum PMStatus {
};
enum RoundType {
ROUND_UNK = 0,
ROUND_NEAREST_AWAY_ZERO = 1,
ROUND_NEAREST_TOWARDS_ZERO = 2,
ROUND_NEAREST_TO_EVEN = 3
ROUND_NEAREST_AWAY_ZERO = 0,
ROUND_NEAREST_TOWARDS_ZERO = 1,
ROUND_NEAREST_TO_EVEN = 2,
};
// Activation kinds that can be fused into quantization-aware kernels.
// Used as a non-type template parameter so the activation is selected at
// compile time (e.g. DequantBNCompute<Relu>).
enum ActivationType {
  Linear = 0,  // identity: no activation applied
  Relu = 1,    // max(x, 0)
  Relu6 = 2,   // min(max(x, 0), 6)
  PRelu = 3,
  LeakyRelu = 4,
  Tanh = 5,
  Sigmoid = 6,
};
extern const char *G_OP_TYPE_CONV;
......@@ -139,6 +148,7 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU;
......
......@@ -234,6 +234,10 @@ LOAD_OP1(quantize, CPU);
#ifdef DEQUANT_OP
LOAD_OP1(dequantize, CPU);
#endif
#ifdef FUSION_DEQUANT_BN_OP
LOAD_OP1(fusion_dequant_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_bn);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_OP
LOAD_OP1(fusion_dequant_add_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn);
......
......@@ -220,7 +220,16 @@ void Node::Folder(
}
} else {
for (auto &op_output : this->op_desc_->outputs_) {
op_desc->outputs_.emplace(op_output.first, op_output.second);
auto output_key = op_output.first;
if (change->find(this->type_) != change->end()) {
const auto change_pairs = (*change)[this->type_];
for (const auto &target : change_pairs) {
if (target.first == output_key) {
output_key = target.second;
}
}
}
op_desc->outputs_.emplace(output_key, op_output.second);
}
for (auto &output : this->outputs_) {
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_add_bn_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -43,7 +43,8 @@ class FusionDequantAddBNMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -44,7 +44,8 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......@@ -54,7 +55,7 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
template <typename DeviceType, typename T>
class FusionDequantAddBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluOp(const std::string &type,
......@@ -63,7 +64,7 @@ class FusionDequantAddBNReluOp
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -44,7 +44,8 @@ class FusionDequantAddBNReluQuantMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......@@ -90,7 +91,8 @@ class FusionDequantAddBNQuantMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
......@@ -12,28 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_BN_RELU_OP
#include "operators/fusion_dequant_bn_relu_op.h"
#include "operators/fusion_dequant_bn_op.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_BN_OP
// Dequantize + batchnorm is an elementwise transform, so the output
// tensor always has exactly the input's shape.
template <typename Dtype, typename T>
void FusionDequantBNOp<Dtype, T>::InferShape() const {
  this->param_.output_->Resize(this->param_.input_->dims());
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
// Dequantize + batchnorm + relu keeps the element count and layout, so
// the output shape mirrors the input shape.
template <typename Dtype, typename T>
void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
  this->param_.output_->Resize(this->param_.input_->dims());
}
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef FUSION_DEQUANT_BN_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn, ops::FusionDequantBNMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn, ops::FusionDequantBNOp);
#endif // PADDLE_MOBILE_CPU
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu,
ops::FusionDequantBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
#endif
#endif
#endif // PADDLE_MOBILE_CPU
#endif // FUSION_DEQUANT_BN_RELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_BN_RELU_OP)
// Matches the subgraph `dequantize -> batch_norm` and folds it into a
// single fusion_dequant_bn operator.
class FusionDequantBNMatcher : public framework::FusionOpMatcher {
 public:
  FusionDequantBNMatcher() {
    // Pattern root is the dequantize node, with batch_norm as its successor.
    node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
  }

  virtual void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    // Rename the batch_norm inputs (Scale/Mean/Bias/Variance -> BN*) and its
    // "Y" output to "Out" so they match the fused op's variable names.
    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_BATCHNORM,
                   {{"Scale", "BNScale"},
                    {"Mean", "BNMean"},
                    {"Bias", "BNBias"},
                    {"Variance", "BNVariance"},
                    {"Y", "Out"}}}},
                 removed_nodes);
  }

  std::string Type() override { return G_OP_TYPE_FUSION_DEQUANT_BN; }
};
#endif // FUSION_DEQUANT_BN_OP || FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_BN_OP
// Operator wrapping the fused dequantize + batch norm computation; the
// actual work is dispatched to FusionDequantBNKernel.
template <typename DeviceType, typename T>
class FusionDequantBNOp : public framework::OperatorWithKernel<
                              DeviceType, FusionDequantBNParam<DeviceType>,
                              operators::FusionDequantBNKernel<DeviceType, T>> {
 public:
  FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs,
                    const VariableNameMap &outputs,
                    const framework::AttributeMap &attrs,
                    std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDequantBNParam<DeviceType>,
            operators::FusionDequantBNKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  // inference output shape (same as input; see InferShape definition)
  void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
class FusionDequantBNReluMatcher : public FusionDequantBNMatcher {
public:
FusionDequantBNReluMatcher() : FusionDequantBNMatcher() {
node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
virtual std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; }
};
// Operator for fused dequantize + batch norm + relu. Note it shares
// FusionDequantBNParam with the non-relu variant; only the kernel differs.
template <typename DeviceType, typename T>
class FusionDequantBNReluOp
    : public framework::OperatorWithKernel<
          DeviceType, FusionDequantBNParam<DeviceType>,
          operators::FusionDequantBNReluKernel<DeviceType, T>> {
 public:
  FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
                        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDequantBNParam<DeviceType>,
            operators::FusionDequantBNReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  // inference output shape (same as input; see InferShape definition)
  void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
......@@ -42,7 +42,8 @@ class FusionDequantBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "operators/kernel/dequant_add_bn_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
// Folds the elementwise-add bias and the batchnorm statistics into the
// per-channel bn_scale_/bn_bias_ tensors so Compute() only needs one
// multiply-add per element:
//   scale'[c] = gamma[c] / sqrt(var[c] + epsilon)
//   bias'[c]  = scale'[c] * (add_bias[c] - mean[c]) + beta[c]
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
    FusionDequantAddBNParam<CPU> *param) {
  // elementwise add params
  const Tensor *bias = param->bias_;
  // batch norm params
  const Tensor *bn_mean = param->bn_mean_;
  const Tensor *bn_variance = param->bn_variance_;
  Tensor *bn_scale = param->bn_scale_;
  Tensor *bn_bias = param->bn_bias_;
  const float epsilon = param->epsilon_;

  const float *bias_ptr = bias->data<float>();
  const float *mean_ptr = bn_mean->data<float>();
  const float *var_ptr = bn_variance->data<float>();
  // bn_scale/bn_bias are rewritten in place with the folded values.
  float *bn_scale_ptr = bn_scale->mutable_data<float>();
  float *bn_bias_ptr = bn_bias->mutable_data<float>();
  for (int c = 0; c < bn_scale->numel(); ++c) {
    float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
    bn_scale_ptr[c] = inv_scale;
    bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
  }
  return true;
}
// Computes output = bn_scale * (dequant_scale * input) + bn_bias, i.e.
// dequantizes the int32 input back to float and applies the batchnorm
// (with the elementwise-add bias already folded in by Init()).
// Assumes a 4-D input where dims are {batch, channel, height, width}.
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
    const FusionDequantAddBNParam<CPU> &param) {
  const int32_t *input = param.input_->data<int32_t>();
  // Per-channel scale/bias prepared by Init().
  const float *bn_scale = param.bn_scale_->data<float>();
  const float *bn_bias = param.bn_bias_->data<float>();
  // dequantize params
  const float activation_scale = param.activation_scale_->data<float>()[0];
  const float weight_scale = param.weight_scale_;
  const float dequant_scale = activation_scale / weight_scale;

  float *output = param.output_->mutable_data<float>();
  int batch_size = param.input_->dims()[0];
  int channels = param.input_->dims()[1];
  size_t spatial_size = param.input_->dims()[2] * param.input_->dims()[3];

// Each (batch, channel) plane is independent, so both outer loops can be
// collapsed and parallelized together.
#pragma omp parallel for collapse(2)
  for (int batch = 0; batch < batch_size; ++batch) {
    for (int c = 0; c < channels; ++c) {
      // not fuse bn and dequant scale to minimize precision difference
      // float scale = bn_scale[c] * dequant_scale;
      float scale = bn_scale[c];
      float bias = bn_bias[c];
      size_t offset = (batch * channels + c) * spatial_size;
      const int32_t *x = input + offset;
      float *y = output + offset;
      size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
      // Vectorized path: 16 elements (4 x float32x4) per iteration.
      int loop = spatial_size >> 4;
      remain = spatial_size & 0xF;
      float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
      float32x4_t __scale = vdupq_n_f32(scale);
      float32x4_t __bias = vdupq_n_f32(bias);
      for (int k = 0; k < loop; ++k, x += 16, y += 16) {
        int32x4_t r0 = vld1q_s32(x);
        int32x4_t r1 = vld1q_s32(x + 4);
        int32x4_t r2 = vld1q_s32(x + 8);
        int32x4_t r3 = vld1q_s32(x + 12);
        // int32 -> float32
        float32x4_t f0 = vcvtq_f32_s32(r0);
        float32x4_t f1 = vcvtq_f32_s32(r1);
        float32x4_t f2 = vcvtq_f32_s32(r2);
        float32x4_t f3 = vcvtq_f32_s32(r3);
        // dequantize
        f0 = vmulq_f32(__dequant_scale, f0);
        f1 = vmulq_f32(__dequant_scale, f1);
        f2 = vmulq_f32(__dequant_scale, f2);
        f3 = vmulq_f32(__dequant_scale, f3);
        // batchnorm: scale * x + bias (fused multiply-accumulate)
        f0 = vmlaq_f32(__bias, __scale, f0);
        f1 = vmlaq_f32(__bias, __scale, f1);
        f2 = vmlaq_f32(__bias, __scale, f2);
        f3 = vmlaq_f32(__bias, __scale, f3);
        vst1q_f32(y, f0);
        vst1q_f32(y + 4, f1);
        vst1q_f32(y + 8, f2);
        vst1q_f32(y + 12, f3);
      }
#endif  // __ARM_NEON__
      // Scalar tail (or the full range when NEON is unavailable).
      for (int k = 0; k < remain; ++k) {
        y[k] = scale * (dequant_scale * x[k]) + bias;
      }
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_DEQUANT_ADD_BN_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include <cmath>
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/math/activation.h"
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
......@@ -22,8 +23,10 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
const framework::Tensor *bias) {
......@@ -39,16 +42,20 @@ void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
float inv_scale = 1.f / (std::sqrt(var_ptr[c] + epsilon));
float val = bias ? bias->data<float>()[c] : 0;
bn_bias_ptr[c] = inv_scale * (val - mean_ptr[c]) + bn_bias_ptr[c];
bn_bias_ptr[c] =
inv_scale * bn_scale_ptr[c] * (val - mean_ptr[c]) + bn_bias_ptr[c];
bn_scale_ptr[c] = inv_scale * bn_scale_ptr[c];
}
}
#endif
#if defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
template <ActivationType Act>
void DequantBNCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
......@@ -79,7 +86,6 @@ void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __zero = vdupq_n_f32(0.f);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
......@@ -97,10 +103,10 @@ void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = vmaxq_f32(__zero, f0);
f1 = vmaxq_f32(__zero, f1);
f2 = vmaxq_f32(__zero, f2);
f3 = vmaxq_f32(__zero, f3);
f0 = math::vActiveq_f32(f0);
f1 = math::vActiveq_f32(f1);
f2 = math::vActiveq_f32(f2);
f3 = math::vActiveq_f32(f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
......@@ -108,32 +114,62 @@ void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = std::max(scale * (dequant_scale * x[k]) + bias, 0.f);
y[k] = math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_OP
// Folds the batchnorm statistics into bn_scale_/bn_bias_ once, ahead of
// Compute(). No elementwise-add bias is fused here, hence nullptr.
template <>
bool FusionDequantBNKernel<CPU, float>::Init(FusionDequantBNParam<CPU> *param) {
  PublicFusionDequantBNInitParam(param, nullptr);
  return true;
}
// No activation is fused in this variant, so run the shared compute path
// with the Linear (identity) activation.
template <>
void FusionDequantBNKernel<CPU, float>::Compute(
    const FusionDequantBNParam<CPU> &param) {
  DequantBNCompute<Linear>(&param);
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNReluParam<CPU> *param) {
FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<Relu>(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_OP
// Folds both the elementwise-add bias and the batchnorm statistics into
// the per-channel scale/bias consumed by Compute().
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
    FusionDequantAddBNParam<CPU> *param) {
  PublicFusionDequantBNInitParam(param, param->bias_);
  return true;
}
// The add is already folded into bn_bias_ by Init(), and no activation is
// fused, so the Linear compute path suffices.
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
    const FusionDequantAddBNParam<CPU> &param) {
  DequantBNCompute<Linear>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNReluParam<CPU> *param) {
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
......@@ -141,13 +177,14 @@ bool FusionDequantAddBNReluKernel<CPU, float>::Init(
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<Relu>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
template <RoundType R>
#if defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <Activation Act, RoundType R>
void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
......@@ -167,9 +204,8 @@ void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
// if (param->is_static_) {
if (true) {
max_abs = param->offline_scale_->data<float>()[0];
max_abs = param->static_scale_;
float quant_scale = 127.f / max_abs;
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
......@@ -205,14 +241,18 @@ void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32(f0);
f1 = math::vActiveq_f32(f1);
f2 = math::vActiveq_f32(f2);
f3 = math::vActiveq_f32(f3);
f0 = vmulq_f32(__quant_scale, f0);
f1 = vmulq_f32(__quant_scale, f1);
f2 = vmulq_f32(__quant_scale, f2);
f3 = vmulq_f32(__quant_scale, f3);
int32x4_t q0 = math::vround_f32<R>(f0);
int32x4_t q1 = math::vround_f32<R>(f1);
int32x4_t q2 = math::vround_f32<R>(f2);
int32x4_t q3 = math::vround_f32<R>(f3);
int32x4_t q0 = math::vRoundq_f32<R>(f0);
int32x4_t q1 = math::vRoundq_f32<R>(f1);
int32x4_t q2 = math::vRoundq_f32<R>(f2);
int32x4_t q3 = math::vRoundq_f32<R>(f3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
......@@ -226,13 +266,14 @@ void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
float x_temp = scale * (dequant_scale * x[k]) + bias;
float x_temp =
math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
y[k] = math::Round<R>(x_temp * quant_scale);
}
}
}
} else {
// TODO(hjchen2)
// TODO(hjchen2)
max_abs = std::max(max_abs, 1e-6f);
}
param->online_scale_->mutable_data<float>()[0] = max_abs;
......@@ -251,13 +292,13 @@ void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<ROUND_NEAREST_TO_EVEN>(&param);
DequantBNQuantCompute<Linear, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<ROUND_NEAREST_TOWARDS_ZERO>(&param);
DequantBNQuantCompute<Linear, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<ROUND_NEAREST_AWAY_ZERO>(&param);
DequantBNQuantCompute<Linear, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
......@@ -267,105 +308,9 @@ void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
// Fused dequantize + batchnorm + relu + requantize (int32 -> int8).
// Each element is transformed as:
//   y = Round<R>(quant_scale * max(0, bn_scale*(dequant_scale*x) + bn_bias))
// where quant_scale = 127 / max_abs comes from the offline scale tensor.
// Assumes a 4-D input where dims are {batch, channel, height, width}.
template <RoundType R>
void DequantBNReluQuantCompute(
    const FusionDequantAddBNReluQuantParam<CPU> *param) {
  const int32_t *input = param->input_->data<int32_t>();
  // Per-channel scale/bias folded by PublicFusionDequantBNInitParam.
  const float *bn_scale = param->bn_scale_->data<float>();
  const float *bn_bias = param->bn_bias_->data<float>();
  // dequantize params
  const float activation_scale = param->activation_scale_->data<float>()[0];
  const float weight_scale = param->weight_scale_;
  const float dequant_scale = activation_scale / weight_scale;
  // quantize params
  // NOTE(review): output_scale is never read below; online_scale_ is
  // accessed directly at the end of the function.
  Tensor *output_scale = param->online_scale_;
  float max_abs = 0.f;

  int8_t *output = param->output_->mutable_data<int8_t>();
  int batch_size = param->input_->dims()[0];
  int channels = param->input_->dims()[1];
  size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];

  // Only the static (offline) scale path is implemented; the dynamic
  // branch below is a stub.
  // if (param->is_static_) {
  if (true) {
    max_abs = param->offline_scale_->data<float>()[0];
    float quant_scale = 127.f / max_abs;
#pragma omp parallel for collapse(2)
    for (int batch = 0; batch < batch_size; ++batch) {
      for (int c = 0; c < channels; ++c) {
        // float scale = bn_scale[c] * dequant_scale;
        float scale = bn_scale[c];
        float bias = bn_bias[c];
        size_t offset = (batch * channels + c) * spatial_size;
        const int32_t *x = input + offset;
        int8_t *y = output + offset;
        size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
        // Vectorized path: 16 elements (4 x float32x4) per iteration.
        int loop = spatial_size >> 4;
        remain = spatial_size & 0xF;
        float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
        float32x4_t __scale = vdupq_n_f32(scale);
        float32x4_t __bias = vdupq_n_f32(bias);
        float32x4_t __zero = vdupq_n_f32(0.f);
        float32x4_t __quant_scale = vdupq_n_f32(quant_scale);
        for (int k = 0; k < loop; ++k, x += 16, y += 16) {
          int32x4_t r0 = vld1q_s32(x);
          int32x4_t r1 = vld1q_s32(x + 4);
          int32x4_t r2 = vld1q_s32(x + 8);
          int32x4_t r3 = vld1q_s32(x + 12);
          // int32 -> float32
          float32x4_t f0 = vcvtq_f32_s32(r0);
          float32x4_t f1 = vcvtq_f32_s32(r1);
          float32x4_t f2 = vcvtq_f32_s32(r2);
          float32x4_t f3 = vcvtq_f32_s32(r3);
          // dequantize
          f0 = vmulq_f32(__dequant_scale, f0);
          f1 = vmulq_f32(__dequant_scale, f1);
          f2 = vmulq_f32(__dequant_scale, f2);
          f3 = vmulq_f32(__dequant_scale, f3);
          // batchnorm: scale * x + bias
          f0 = vmlaq_f32(__bias, __scale, f0);
          f1 = vmlaq_f32(__bias, __scale, f1);
          f2 = vmlaq_f32(__bias, __scale, f2);
          f3 = vmlaq_f32(__bias, __scale, f3);
          // relu
          f0 = vmaxq_f32(__zero, f0);
          f1 = vmaxq_f32(__zero, f1);
          f2 = vmaxq_f32(__zero, f2);
          f3 = vmaxq_f32(__zero, f3);
          // requantize
          f0 = vmulq_f32(__quant_scale, f0);
          f1 = vmulq_f32(__quant_scale, f1);
          f2 = vmulq_f32(__quant_scale, f2);
          f3 = vmulq_f32(__quant_scale, f3);
          // round per policy R, then narrow int32 -> int16 -> int8
          int32x4_t q0 = math::vround_f32<R>(f0);
          int32x4_t q1 = math::vround_f32<R>(f1);
          int32x4_t q2 = math::vround_f32<R>(f2);
          int32x4_t q3 = math::vround_f32<R>(f3);
          int16x4_t d0 = vmovn_s32(q0);
          int16x4_t d1 = vmovn_s32(q1);
          int16x4_t d2 = vmovn_s32(q2);
          int16x4_t d3 = vmovn_s32(q3);
          int16x8_t q5 = vcombine_s16(d0, d1);
          int16x8_t q6 = vcombine_s16(d2, d3);
          int8x8_t d5 = vmovn_s16(q5);
          int8x8_t d6 = vmovn_s16(q6);
          vst1_s8(y, d5);
          vst1_s8(y + 8, d6);
        }
#endif  // __ARM_NEON__
        // Scalar tail (or the full range when NEON is unavailable).
        for (int k = 0; k < remain; ++k) {
          float x_temp = std::max(scale * (dequant_scale * x[k]) + bias, 0.f);
          y[k] = math::Round<R>(x_temp * quant_scale);
        }
      }
    }
  } else {
    // TODO(hjchen2)
    max_abs = std::max(max_abs, 1e-6f);
  }
  param->online_scale_->mutable_data<float>()[0] = max_abs;
}
template <>
bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
FusionDequantAddBNReluQuantParam<CPU> *param) {
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
......@@ -373,16 +318,16 @@ bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
template <>
void FusionDequantAddBNReluQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNReluQuantParam<CPU> &param) {
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNReluQuantCompute<ROUND_NEAREST_TO_EVEN>(&param);
DequantBNQuantCompute<Relu, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNReluQuantCompute<ROUND_NEAREST_TOWARDS_ZERO>(&param);
DequantBNQuantCompute<Relu, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNReluQuantCompute<ROUND_NEAREST_AWAY_ZERO>(&param);
DequantBNQuantCompute<Relu, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
......
......@@ -53,10 +53,10 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
r1 = vmulq_f32(r1, __scale);
r2 = vmulq_f32(r2, __scale);
r3 = vmulq_f32(r3, __scale);
int32x4_t q0 = math::vround_f32<R>(r0);
int32x4_t q1 = math::vround_f32<R>(r1);
int32x4_t q2 = math::vround_f32<R>(r2);
int32x4_t q3 = math::vround_f32<R>(r3);
int32x4_t q0 = math::vRoundq_f32<R>(r0);
int32x4_t q1 = math::vRoundq_f32<R>(r1);
int32x4_t q2 = math::vRoundq_f32<R>(r2);
int32x4_t q3 = math::vRoundq_f32<R>(r3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef RELU_OP
#include "operators/kernel/relu_kernel.h"
#include "common/types.h"
#include "operators/math/activation.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
......@@ -22,45 +24,13 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
enum ReluMode {
Relu = 0,
Relu6 = 1,
PRelu = 2,
LeakyRelu = 3,
};
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <ReluMode R = Relu>
inline float32x4_t vRelu_f32(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
return vmaxq_f32(__zero, x);
}
template <>
inline float32x4_t vRelu_f32<Relu6>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
float32x4_t __six = vdupq_n_f32(6.f);
return vminq_f32(__six, vmaxq_f32(__zero, x));
}
#endif
template <ReluMode R = Relu>
inline float ReluFunc(const float &x) {
return std::max(x, 0.f);
}
template <>
inline float ReluFunc<Relu6>(const float &x) {
return std::min(std::max(x, 0.f), 6.f);
}
template <typename Dtype, ReluMode R>
template <typename Dtype, ActivationType Act>
struct ReluCompute {
void operator()(const Tensor *input, Tensor *output) {}
};
template <ReluMode R>
struct ReluCompute<float, R> {
template <ActivationType Act>
struct ReluCompute<float, Act> {
void operator()(const Tensor *input, Tensor *output) {
const float *x = input->data<float>();
float *y = output->mutable_data<float>();
......@@ -77,10 +47,10 @@ struct ReluCompute<float, R> {
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = vRelu_f32<R>(r0);
r1 = vRelu_f32<R>(r1);
r2 = vRelu_f32<R>(r2);
r3 = vRelu_f32<R>(r3);
r0 = math::vActiveq_f32<Act>(r0);
r1 = math::vActiveq_f32<Act>(r1);
r2 = math::vActiveq_f32<Act>(r2);
r3 = math::vActiveq_f32<Act>(r3);
vst1q_f32(local_y, r0);
vst1q_f32(local_y + 4, r1);
vst1q_f32(local_y + 8, r2);
......@@ -90,7 +60,7 @@ struct ReluCompute<float, R> {
y += (loop << 4);
#endif
for (size_t i = 0; i < remain; ++i) {
y[i] = ReluFunc<R>(x[i]);
y[i] = math::Active<Act>(x[i]);
}
}
};
......
......@@ -20,48 +20,37 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
// Declares kernel class `KernelClass`, templated on device and data type,
// exposing the standard OpKernelBase Init/Compute interface over parameter
// type `KernelParam`. Definitions of Init/Compute live in the .cpp files.
#define DECLARE_KERNEL(KernelClass, KernelParam)                            \
  template <typename DeviceType, typename T>                                \
  class KernelClass                                                         \
      : public framework::OpKernelBase<DeviceType, KernelParam<DeviceType>> { \
   public:                                                                  \
    bool Init(KernelParam<DeviceType> *param);                              \
    void Compute(const KernelParam<DeviceType> &param);                     \
  };
#ifdef FUSION_DEQUANT_BN_OP
DECLARE_KERNEL(FusionDequantBNKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename DeviceType, typename T>
class FusionDequantBNReluKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantBNReluParam<DeviceType>> {
public:
void Compute(const FusionDequantBNReluParam<DeviceType> &param);
bool Init(FusionDequantBNReluParam<DeviceType> *param);
};
DECLARE_KERNEL(FusionDequantBNReluKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <typename DeviceType, typename T>
class FusionDequantAddBNReluKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNReluParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNReluParam<DeviceType> &param);
bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
};
#ifdef FUSION_DEQUANT_ADD_BN_OP
DECLARE_KERNEL(FusionDequantAddBNKernel, FusionDequantAddBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
template <typename DeviceType, typename T>
class FusionDequantAddBNReluQuantKernel
: public framework::OpKernelBase<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNReluQuantParam<DeviceType> &param);
bool Init(FusionDequantAddBNReluQuantParam<DeviceType> *param);
};
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
// Fused dequantize + add + batch-norm + relu kernel. Reuses
// FusionDequantAddBNParam: the param type carries no activation info, so
// the ReLU is presumably selected inside the kernel implementation —
// TODO(review): confirm against the kernel's Compute().
DECLARE_KERNEL(FusionDequantAddBNReluKernel, FusionDequantAddBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
template <typename DeviceType, typename T>
class FusionDequantAddBNQuantKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNQuantParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNQuantParam<DeviceType> &param);
bool Init(FusionDequantAddBNQuantParam<DeviceType> *param);
};
DECLARE_KERNEL(FusionDequantAddBNQuantKernel, FusionDequantAddBNQuantParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
// Fused dequantize + add + batch-norm + relu + re-quantize kernel; shares
// FusionDequantAddBNQuantParam with the non-relu quant variant above.
DECLARE_KERNEL(FusionDequantAddBNReluQuantKernel, FusionDequantAddBNQuantParam);
#endif
} // namespace operators
......
......@@ -14,24 +14,52 @@ limitations under the License. */
#pragma once
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "framework/operator.h"
#include "operators/op_param.h"
#include <algorithm>
#include <cmath>
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
// Kernel for the fused dequantize + elementwise-add + batch-norm operator.
// Init() performs one-time setup from the op parameters; Compute() runs the
// fused computation described by FusionDequantAddBNParam.
template <typename DeviceType, typename T>
class FusionDequantAddBNKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNParam<DeviceType> &param);
bool Init(FusionDequantAddBNParam<DeviceType> *param);
};
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// Vectorized activation over 4 packed floats. The primary template
// (Act == Linear) is the identity; specializations handle other
// activation types.
template <ActivationType Act = Linear>
inline float32x4_t vActiveq_f32(const float32x4_t &x) {
return x;
}
} // namespace operators
} // namespace paddle_mobile
// ReLU: lane-wise max(x, 0).
// NOTE(review): identifiers starting with a double underscore (__zero) are
// reserved to the implementation in C++; consider renaming.
template <>
inline float32x4_t vActiveq_f32<Relu>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
return vmaxq_f32(x, __zero);
}
// ReLU6: lane-wise clamp of x into [0, 6].
// NOTE(review): __zero/__six use reserved identifier names; consider renaming.
template <>
inline float32x4_t vActiveq_f32<Relu6>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
float32x4_t __six = vdupq_n_f32(6.f);
return vminq_f32(vmaxq_f32(x, __zero), __six);
}
#endif
// Scalar activation applied element-wise; mirrors the NEON vActiveq_f32
// variants above. The primary template (Act == Linear) is the identity.
template <ActivationType Act = Linear>
inline float Active(const float &x) {
  return x;
}

// ReLU: max(x, 0).
template <>
inline float Active<Relu>(const float &x) {
  return (x < 0.f) ? 0.f : x;
}

// ReLU6: clamp x into [0, 6].
template <>
inline float Active<Relu6>(const float &x) {
  return (x < 0.f) ? 0.f : ((x > 6.f) ? 6.f : x);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -50,31 +50,31 @@ inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO>
inline int32x4_t vround_f32(float32x4_t r) {
return vcvtq_s32_f32(r);
inline int32x4_t vRoundq_f32(const float32x4_t &x) {
return vcvtq_s32_f32(x);
}
template <>
inline int32x4_t vround_f32<ROUND_NEAREST_AWAY_ZERO>(float32x4_t r) {
inline int32x4_t vRoundq_f32<ROUND_NEAREST_AWAY_ZERO>(const float32x4_t &x) {
float32x4_t plus = vdupq_n_f32(0.5);
float32x4_t minus = vdupq_n_f32(-0.5);
float32x4_t zero = vdupq_n_f32(0);
uint32x4_t more_than_zero = vcgtq_f32(r, zero);
uint32x4_t more_than_zero = vcgtq_f32(x, zero);
float32x4_t temp = vbslq_f32(more_than_zero, plus, minus);
temp = vaddq_f32(r, temp);
temp = vaddq_f32(x, temp);
int32x4_t ret = vcvtq_s32_f32(temp);
return ret;
}
template <>
inline int32x4_t vround_f32<ROUND_NEAREST_TO_EVEN>(float32x4_t r) {
inline int32x4_t vRoundq_f32<ROUND_NEAREST_TO_EVEN>(const float32x4_t &x) {
float32x4_t point5 = vdupq_n_f32(0.5);
int32x4_t one = vdupq_n_s32(1);
int32x4_t zero = vdupq_n_s32(0);
int32x4_t rnd = vround_f32<ROUND_NEAREST_AWAY_ZERO>(r);
int32x4_t rnd = math::vRoundq_f32<ROUND_NEAREST_AWAY_ZERO>(x);
float32x4_t frnd = vcvtq_f32_s32(rnd);
frnd = vsubq_f32(frnd, r);
frnd = vsubq_f32(frnd, x);
frnd = vabsq_f32(frnd);
uint32x4_t equal_point5 = vceqq_f32(frnd, point5);
int32x4_t abs_rnd = vabsq_s32(rnd);
......
......@@ -2530,7 +2530,7 @@ class QuantizeParam : public OpParam {
// scale = max(abs(x))
online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (OpParam::HasAttr("InScale", attrs)) {
if (inputs.count("InScale")) {
offline_ = true;
offline_scale_ = OpParam::GetVarValue<GType>("InScale", inputs, scope);
}
......@@ -2566,9 +2566,7 @@ class DequantizeParam : public OpParam {
DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = InputXFrom<GType>(inputs, scope);
if (outputs.count("Out")) {
output_ = OutFrom<GType>(outputs, scope);
}
output_ = OutFrom<GType>(outputs, scope);
activation_scale_ = OpParam::GetVarValue<GType>("Scale", inputs, scope);
// dequantization is performed as x = x / static_scale / online_scale
if (OpParam::HasAttr("weight_scale", attrs)) {
......@@ -2588,10 +2586,10 @@ class DequantizeParam : public OpParam {
};
#endif
#if defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_BN_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <typename Dtype>
class FusionDequantBNParam : public DequantizeParam<Dtype> {
......@@ -2609,10 +2607,6 @@ class FusionDequantBNParam : public DequantizeParam<Dtype> {
bn_scale_ = OpParam::GetVarValue<GType>("BNScale", inputs, scope);
bn_bias_ = OpParam::GetVarValue<GType>("BNBias", inputs, scope);
epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
// output
if (outputs.count("Y")) {
this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
}
}
public:
......@@ -2642,10 +2636,6 @@ class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
// element wise add params
axis_ = OpParam::GetAttr<int>("axis", attrs);
bias_ = OpParam::InputYFrom<GType>(inputs, scope);
// output
if (outputs.count("Y")) {
this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
}
}
public:
......@@ -2655,44 +2645,6 @@ class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
};
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
// Parameters for the fused dequantize + batch-norm + relu operator.
// Inherits everything (input, scales, BN scale/bias, epsilon) from
// FusionDequantBNParam and only re-binds the output tensor.
template <typename Dtype>
class FusionDequantBNReluParam : public FusionDequantBNParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDequantBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionDequantBNParam<Dtype>(inputs, outputs, attrs, scope) {
// Rebind the output to the "Out" slot when this op registers one;
// otherwise keep whatever output the base class resolved.
if (outputs.count("Out")) {
this->output_ = OpParam::OutFrom<GType>(outputs, scope);
}
}
};
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
// Parameters for the fused dequantize + elementwise-add + batch-norm + relu
// operator. Inherits the add/BN fields (axis, bias, BN scale/bias, epsilon)
// from FusionDequantAddBNParam and only re-binds the output tensor.
template <typename Dtype>
class FusionDequantAddBNReluParam : public FusionDequantAddBNParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDequantAddBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionDequantAddBNParam<Dtype>(inputs, outputs, attrs, scope) {
// Rebind the output to the "Out" slot when this op registers one;
// otherwise keep whatever output the base class resolved.
if (outputs.count("Out")) {
this->output_ = OpParam::OutFrom<GType>(outputs, scope);
}
}
};
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
template <typename Dtype>
class FusionDequantAddBNQuantParam : public FusionDequantAddBNParam<Dtype> {
......@@ -2707,47 +2659,9 @@ class FusionDequantAddBNQuantParam : public FusionDequantAddBNParam<Dtype> {
// scale output
online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (OpParam::HasAttr("static_scale", attrs)) {
is_static_ = true;
static_scale_ = OpParam::GetAttr<float>("static_scale", attrs);
}
// x = round(scale * x)
if (OpParam::HasAttr("round_type", attrs)) {
round_type_ = OpParam::GetAttr<RoundType>("round_type", attrs);
}
}
public:
RType *online_scale_;
// if static scale or not
bool is_static_ = false;
// quantize scale
float static_scale_ = 1.0f;
// round method type
// RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
};
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
template <typename Dtype>
class FusionDequantAddBNReluQuantParam
: public FusionDequantAddBNReluParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDequantAddBNReluQuantParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs,
const Scope &scope)
: FusionDequantAddBNReluParam<Dtype>(inputs, outputs, attrs, scope) {
// scale output
online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (OpParam::HasAttr("static_scale", attrs)) {
is_static_ = true;
static_scale_ = OpParam::GetAttr<float>("static_scale", attrs);
if (inputs.count("InScale")) {
offline_ = true;
offline_scale_ = OpParam::GetVarValue<GType>("InScale", inputs, scope);
}
// x = round(scale * x)
if (OpParam::HasAttr("round_type", attrs)) {
......@@ -2757,10 +2671,10 @@ class FusionDequantAddBNReluQuantParam
public:
RType *online_scale_;
// if static scale or not
bool is_static_ = false;
// quantize scale
float static_scale_ = 1.0f;
// quantize offline scale
RType *offline_scale_;
// if offine scale or not
bool offline_ = false;
// round method type
// RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
......
......@@ -129,7 +129,8 @@ void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
}
template <typename Itype, typename Otype, int Kernel, int Pad, int Stride>
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels,
int groups) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
......@@ -147,7 +148,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape =
framework::make_ddim({output_c, input_c, kernel_h, kernel_w});
framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w});
VariableNameMap inputs;
VariableNameMap outputs;
......@@ -164,13 +165,22 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
auto filter = filter_var->template GetMutable<framework::LoDTensor>();
SetupTensor<Itype>(filter, filter_shape, -20, 20);
for (int i = 0; i < input->numel(); ++i) {
DLOG << "input[" << i
<< "] = " << static_cast<int>(input->data<int8_t>()[i]);
}
for (int i = 0; i < filter->numel(); ++i) {
DLOG << "filter[" << i
<< "] = " << static_cast<int>(filter->data<int8_t>()[i]);
}
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["dilations"].Set<vector<int>>(
std::vector<int>({dilation_h, dilation_w}));
attrs["groups"].Set<int>(1);
attrs["groups"].Set<int>(groups);
auto *op = new operators::ConvOp<CPU, float>("conv2d", inputs, outputs, attrs,
scope);
......@@ -204,15 +214,15 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
Otype *output_cmp_data = output_cmp.data<Otype>();
for (int i = 0; i < output->numel(); ++i) {
float gap = output_data[i] - output_cmp_data[i];
PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3,
"output[%d] = %d, output_cmp[%d] = %d", i,
output_data[i], i, output_cmp_data[i]);
// if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
// LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
// << ", output_cmp_data[" << i << "] = " <<
// output_cmp_data[i];
// return 1;
// }
// PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3,
// "output[%d] = %d, output_cmp[%d] = %d", i,
// output_data[i], i, output_cmp_data[i]);
if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
<< ", output_cmp_data[" << i
<< "] = " << output_cmp_data[i];
exit(1);
}
}
delete op;
return 0;
......@@ -224,7 +234,8 @@ int main(int argc, char *argv[]) {
if (argc < 5) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-int8-conv-op in_channels in_height in_width out_channels\n"
<< " ./test-int8-conv-op in_channels in_height in_width out_channels "
"[groups]\n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
......@@ -236,72 +247,134 @@ int main(int argc, char *argv[]) {
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
int out_channels = atoi(argv[4]);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 5, stride = 3
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 4
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels, in_height,
in_width, out_channels);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels, in_height,
in_width, out_channels);
int groups = 1;
if (argc == 6) {
groups = atoi(argv[5]);
}
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height,
in_width, out_channels);
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 2, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 5, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=5, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 5, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 2>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 2>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 2, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=2, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 2, 2>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 5, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=5, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 5, 2>(
in_channels, in_height, in_width, out_channels, groups);
// // kernel = 7, pad = 0, stride = 2
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 1, stride = 2
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 3, stride = 2
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 1, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 3, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 5, stride = 3
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 3, stride = 4
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 3, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 3, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
// paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
// // kernel = 3, pad = 1, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 3, pad = 1, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
// paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
// // kernel = 5, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 5, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
// paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
// // kernel = 5, pad = 2, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 5, pad = 2, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
// paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
}
......@@ -249,6 +249,7 @@ if(NOT FOUND_MATCH)
set(SUM_OP ON)
set(QUANT_OP ON)
set(DEQUANT_OP ON)
set(FUSION_DEQUANT_BN_OP ON)
set(FUSION_DEQUANT_ADD_BN_OP ON)
set(FUSION_DEQUANT_BN_RELU_OP ON)
set(FUSION_DEQUANT_ADD_BN_RELU_OP ON)
......@@ -455,6 +456,9 @@ endif()
if (DEQUANT_OP)
add_definitions(-DDEQUANT_OP)
endif()
if (FUSION_DEQUANT_BN_OP)
add_definitions(-DFUSION_DEQUANT_BN_OP)
endif()
if (FUSION_DEQUANT_ADD_BN_OP)
add_definitions(-DFUSION_DEQUANT_ADD_BN_OP)
endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册