Merge branch 'develop' into fusion_conv_add_relu_int8_op

f165d3e9 · Ray Liu · GitHub · 262b72cd · 067eaef4 · f165d3e9
20 changed files
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -72,6 +72,8 @@ const char *G_OP_TYPE_SUM = "sum";

 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
+const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu";
+
 const char *G_OP_TYPE_TANH = "tanh";
 const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
 const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add";
@@ -136,6 +138,7 @@ std::unordered_map<
        {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
        {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
        {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}},
        {G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
        {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}},

--- a/src/common/types.h
+++ b/src/common/types.h
@@ -139,6 +139,7 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;

 extern const char *G_OP_TYPE_QUANTIZE;
 extern const char *G_OP_TYPE_DEQUANTIZE;
+extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU;

 extern const char *G_OP_TYPE_TANH;
 extern const char *G_OP_TYPE_FUSION_DECONV_RELU;

--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -132,11 +132,11 @@ void format_concat_output(framework::Tensor *out, int height, int width,
 }

 int format_conv_data(framework::Tensor *filter_tensor,
-                     framework::Tensor *ofm_tensor, float *bs_ptr, int group) {
+                     framework::Tensor *ofm_tensor, float **bs_ptr, int group) {
  float max_value = fpga::filter_find_max(filter_tensor);
  fpga::format_filter(filter_tensor, max_value, group);
  int aligned_num = get_aligned_filter_num(filter_tensor);
-  fpga::format_bias_scale_array(&bs_ptr,
+  fpga::format_bias_scale_array(bs_ptr,
                                (int)filter_tensor->dims()[0],  // NOLINT
                                aligned_num);
  int aligned_channel = fpga::get_conv_output_channel(filter_tensor);

--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -39,7 +39,7 @@ void format_bias_scale_array(float** bias_scale_array, int filter_num,
 void format_concat_output(framework::Tensor* out, int height, int width,
                          uint32_t out_channel);
 int format_conv_data(framework::Tensor* filter_tensor,
-                     framework::Tensor* ofm_tensor, float* bs_ptr, int group);
+                     framework::Tensor* ofm_tensor, float** bs_ptr, int group);
 int format_fc_data(framework::Tensor* filter_tensor,
                   framework::Tensor* ofm_tensor, float* bs_ptr);
 void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,

--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -233,3 +233,7 @@ LOAD_OP1(quantize, CPU);
 #ifdef DEQUANT_OP
 LOAD_OP1(dequantize, CPU);
 #endif
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+LOAD_OP1(fusion_dequant_add_bn_relu, CPU);
+LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu);
+#endif
--- a/src/operators/fusion_dequant_add_bn_relu_op.cpp
+++ b/src/operators/fusion_dequant_add_bn_relu_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+
+#include "operators/fusion_dequant_add_bn_relu_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Shape inference for the fused dequantize + add + batch-norm + relu op:
+// the output tensor is resized to the dimensions of the input tensor.
+template <typename Dtype, typename T>
+void FusionDequantAddBNReluOp<Dtype, T>::InferShape() const {
+  auto &param = this->param_;
+  param.output_->Resize(param.input_->dims());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+// Associate the matcher class with the fused op name. The macro is defined
+// outside this file.
+REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_relu,
+                        ops::FusionDequantAddBNReluMatcher);
+
+// The operator itself is registered only when PADDLE_MOBILE_CPU is defined.
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_relu,
+                      ops::FusionDequantAddBNReluOp);
+#endif
+
+#endif
--- a/src/operators/fusion_dequant_add_bn_relu_op.h
+++ b/src/operators/fusion_dequant_add_bn_relu_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/dequant_add_bn_relu_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Fusion matcher for the op sequence
+//   dequantize > elementwise_add > batch_norm > relu
+// which is folded into a single G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU node.
+class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
+ public:
+  // Builds the pattern: node_ is the dequantize root and the three following
+  // ops are chained onto it, in order, with Node's operator>.
+  FusionDequantAddBNReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
+        std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  // Asks `node` to fold the matched chain (node_.Depth() levels deep) into
+  // one node of type Type(). The map passed to Folder pairs input names of
+  // the folded ops with names on the fused op: elementwise_add's "Y" stays
+  // "Y", and batch_norm's Scale/Mean/Bias/Variance get a "BN" prefix.
+  // `removed_nodes` is handed through to Folder unchanged.
+  // NOTE(review): presumably Folder appends the absorbed nodes to
+  // removed_nodes - confirm against framework::Node::Folder.
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
+                  {G_OP_TYPE_BATCHNORM,
+                   {{"Scale", "BNScale"},
+                    {"Mean", "BNMean"},
+                    {"Bias", "BNBias"},
+                    {"Variance", "BNVariance"}}}},
+                 removed_nodes);
+  }
+
+  // Name of the fused op type produced by this matcher.
+  std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU; }
+};
+
+// Operator for the fused dequantize + elementwise_add + batch_norm + relu
+// computation. It only plugs the param and kernel types into
+// OperatorWithKernel and declares shape inference.
+template <typename DeviceType, typename T>
+class FusionDequantAddBNReluOp
+    : public framework::OperatorWithKernel<
+          DeviceType, FusionDequantAddBNReluParam<DeviceType>,
+          operators::FusionDequantAddBNReluKernel<DeviceType, T>> {
+  // Shorthand for the (long) base-class type.
+  using BaseOp = framework::OperatorWithKernel<
+      DeviceType, FusionDequantAddBNReluParam<DeviceType>,
+      operators::FusionDequantAddBNReluKernel<DeviceType, T>>;
+
+ public:
+  // Forwards every constructor argument unchanged to the base operator.
+  FusionDequantAddBNReluOp(const std::string &type,
+                           const VariableNameMap &inputs,
+                           const VariableNameMap &outputs,
+                           const framework::AttributeMap &attrs,
+                           std::shared_ptr<framework::Scope> scope)
+      : BaseOp(type, inputs, outputs, attrs, scope) {}
+
+  // inference output shape
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/arm/dequant_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/dequant_add_bn_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+
+#include "operators/kernel/dequant_add_bn_relu_kernel.h"
+#include <algorithm>
+#include <cmath>
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+
+// Folds the elementwise-add bias and the batch-norm statistics into the
+// batch-norm scale/bias tensors, in place, so that Compute() needs a single
+// multiply-add per element:
+//   bn_scale' = bn_scale / sqrt(variance + epsilon)
+//   bn_bias'  = bn_scale' * (add_bias - mean) + bn_bias
+// Always returns true.
+template <>
+bool FusionDequantAddBNReluKernel<CPU, float>::Init(
+    FusionDequantAddBNReluParam<CPU> *param) {
+  // read-only inputs: elementwise-add bias, batch-norm mean and variance
+  const Tensor *add_bias = param->bias_;
+  const Tensor *mean = param->bn_mean_;
+  const Tensor *variance = param->bn_variance_;
+  // batch-norm scale and bias are overwritten with the folded values
+  Tensor *scale = param->bn_scale_;
+  Tensor *shift = param->bn_bias_;
+  const float eps = param->epsilon_;
+
+  const float *add_bias_data = add_bias->data<float>();
+  const float *mean_data = mean->data<float>();
+  const float *variance_data = variance->data<float>();
+  float *scale_data = scale->mutable_data<float>();
+  float *shift_data = shift->mutable_data<float>();
+
+  const auto channel_count = scale->numel();
+  for (int i = 0; i < channel_count; ++i) {
+    const float folded = scale_data[i] / (std::sqrt(variance_data[i] + eps));
+    scale_data[i] = folded;
+    shift_data[i] = folded * (add_bias_data[i] - mean_data[i]) + shift_data[i];
+  }
+  return true;
+}
+
+// Fused dequantize + elementwise_add + batch_norm + relu over an int32 input:
+//   y = max(0, x * (bn_scale[c] * dequant_scale) + bn_bias[c])
+// with dequant_scale = activation_scale / weight_scale. bn_scale / bn_bias
+// are read per channel; Init() has already rewritten them in place to carry
+// the add bias and the batch-norm statistics.
+// The input is indexed as a 4-D tensor: dims()[0] batches, dims()[1]
+// channels, and dims()[2] * dims()[3] contiguous elements per
+// (batch, channel) pair. The output is written with the same layout.
+template <>
+void FusionDequantAddBNReluKernel<CPU, float>::Compute(
+    const FusionDequantAddBNReluParam<CPU> &param) {
+  const int32_t *input = param.input_->data<int32_t>();
+  const float *bn_scale = param.bn_scale_->data<float>();
+  const float *bn_bias = param.bn_bias_->data<float>();
+  // dequantize params
+  const float activation_scale = param.activation_scale_->data<float>()[0];
+  const float weight_scale = param.weight_scale_;
+  const float dequant_scale = activation_scale / weight_scale;
+
+  float *output = param.output_->mutable_data<float>();
+  const int batch_size = param.input_->dims()[0];
+  const int channels = param.input_->dims()[1];
+  const size_t spatial_size =
+      param.input_->dims()[2] * param.input_->dims()[3];
+
+  #pragma omp parallel for collapse(2)
+  for (int batch = 0; batch < batch_size; ++batch) {
+    for (int c = 0; c < channels; ++c) {
+      const float scale = bn_scale[c] * dequant_scale;
+      const float bias = bn_bias[c];
+      // widen before multiplying so the element offset is computed in size_t
+      const size_t offset =
+          (static_cast<size_t>(batch) * channels + c) * spatial_size;
+      const int32_t *x = input + offset;
+      float *y = output + offset;
+      size_t remain = spatial_size;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+      // 16 elements (four q registers) per iteration; the leftover
+      // spatial_size % 16 elements are handled by the scalar loop below.
+      const size_t blocks = spatial_size >> 4;
+      remain = spatial_size & 0xF;
+      const float32x4_t vscale = vdupq_n_f32(scale);
+      const float32x4_t vbias = vdupq_n_f32(bias);
+      const float32x4_t vzero = vdupq_n_f32(0.f);
+
+      for (size_t k = 0; k < blocks; ++k, x += 16, y += 16) {
+        int32x4_t r0 = vld1q_s32(x);
+        int32x4_t r1 = vld1q_s32(x + 4);
+        int32x4_t r2 = vld1q_s32(x + 8);
+        int32x4_t r3 = vld1q_s32(x + 12);
+        float32x4_t f0 = vcvtq_f32_s32(r0);
+        float32x4_t f1 = vcvtq_f32_s32(r1);
+        float32x4_t f2 = vcvtq_f32_s32(r2);
+        float32x4_t f3 = vcvtq_f32_s32(r3);
+        // bias + scale * x
+        f0 = vmlaq_f32(vbias, vscale, f0);
+        f1 = vmlaq_f32(vbias, vscale, f1);
+        f2 = vmlaq_f32(vbias, vscale, f2);
+        f3 = vmlaq_f32(vbias, vscale, f3);
+        // relu
+        f0 = vmaxq_f32(vzero, f0);
+        f1 = vmaxq_f32(vzero, f1);
+        f2 = vmaxq_f32(vzero, f2);
+        f3 = vmaxq_f32(vzero, f3);
+        vst1q_f32(y, f0);
+        vst1q_f32(y + 4, f1);
+        vst1q_f32(y + 8, f2);
+        vst1q_f32(y + 12, f3);
+      }
+#endif  // __ARM_NEON__
+      // scalar path: whole row without NEON, tail elements with NEON
+      for (size_t k = 0; k < remain; ++k) {
+        y[k] = std::max(scale * static_cast<float>(x[k]) + bias, 0.f);
+      }
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_DEQUANT_ADD_BN_RELU_OP
--- a/src/operators/kernel/arm/quantize_kernel.cpp
+++ b/src/operators/kernel/arm/quantize_kernel.cpp
@@ -379,8 +379,8 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
        const float *x3 = input3 + h * input_w;
        int loop = input_w >> 4;
        int remain = input_w & 0xF;
-        int pad_loop = paddings[1] >> 1;
-        int pad_remain = paddings[1] & 0x1;
+        int pad_loop = paddings[1] >> 1;  // (paddings[1] << 1) >> 2
+        int pad_remain = (paddings[1] << 1) & 0x3;
        int remain_steps = remain;
        asm volatile(
            "vdup.f32   q0, %[scale]        \n"
@@ -596,7 +596,7 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,

            "store_pad_2w_%=:               \n"
            "cmp        %[pad_remain], #2   \n"
-            "ble        store_pad_1w_%=     \n"
+            "blt        store_pad_1w_%=     \n"
            "vst1.16    {d0[0]}, [%[y0]]!   \n"
            "vst1.16    {d0[0]}, [%[y1]]!   \n"
            "vst1.16    {d0[0]}, [%[y2]]!   \n"
@@ -605,7 +605,7 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,

            "store_pad_1w_%=:               \n"
            "cmp        %[pad_remain], #1   \n"
-            "ble        end_%=              \n"
+            "blt        end_%=              \n"
            "vst1.8    {d0[0]}, [%[y0]]!    \n"
            "vst1.8    {d0[0]}, [%[y1]]!    \n"
            "vst1.8    {d0[0]}, [%[y2]]!    \n"
@@ -669,8 +669,8 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
        const float *x0 = input0 + h * input_w;
        int loop = input_w >> 4;
        int remain = input_w & 0xF;
-        int pad_loop = paddings[1] >> 1;
-        int pad_remain = paddings[1] & 0x1;
+        int pad_loop = paddings[1] >> 1;  // (paddings[1] << 1) >> 2
+        int pad_remain = (paddings[1] << 1) & 0x3;
        asm volatile(
            "vdup.f32   q0, %[scale]        \n"
            "cmp        %[loop], #0         \n"
@@ -754,14 +754,14 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,

            "pad_remain_%=:                 \n"
            "cmp        %[pad_remain], #2   \n"
-            "ble        store_pad_1w_%=     \n"
+            "blt        store_pad_1w_%=     \n"
            "vst1.16    {d0[0]}, [%[y0]]!   \n"
            "sub        %[pad_remain], #2   \n"

            "store_pad_1w_%=:               \n"
            "cmp        %[pad_remain], #1   \n"
-            "ble        end_%=              \n"
-            "vst1.8    {d0[0]}, [%[y0]]!    \n"
+            "blt        end_%=              \n"
+            "vst1.8     {d0[0]}, [%[y0]]!   \n"
            "end_%=:                        \n"
            : [x0] "+r"(x0), [y0] "+r"(y0), [loop] "+r"(loop),
              [remain] "+r"(remain), [pad_loop] "+r"(pad_loop),
@@ -795,10 +795,10 @@ void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
  // only support int8 currently
  float scale = 127 / max_abs;
  param.online_scale_->mutable_data<float>()[0] = max_abs;
-  //  const auto &paddings = param.paddings_;
-  std::vector<int> paddings = {0, 0};
-  //  const auto padding_val = param.padding_val_;
-  int8_t padding_val = 127;
+  const auto &paddings = param.paddings_;
+  // std::vector<int> paddings = {0, 0};
+  // const auto padding_val = param.padding_val_;
+  int8_t padding_val = 0;
  switch (param.round_type_) {
    case ROUND_NEAREST_TO_EVEN:
      quantize_round_to_even(input, scale, paddings, padding_val, output);

--- a/src/operators/kernel/dequant_add_bn_relu_kernel.h
+++ b/src/operators/kernel/dequant_add_bn_relu_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Kernel interface for the fused dequantize + add + batch-norm + relu op.
+// Only declarations live here; the <CPU, float> specializations are provided
+// in kernel/arm/dequant_add_bn_relu_kernel.cpp.
+template <typename DeviceType, typename T>
+class FusionDequantAddBNReluKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     FusionDequantAddBNReluParam<DeviceType>> {
+ public:
+  // Runs the fused computation on the tensors referenced by `param`.
+  void Compute(const FusionDequantAddBNReluParam<DeviceType> &param);
+  // Setup step that receives a mutable `param`; returns a success flag.
+  bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
@@ -58,7 +58,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());

  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,

--- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
@@ -56,7 +56,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());

  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,

--- a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
@@ -38,7 +38,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
    bs_ptr[i] = bias_ptr[i];
  }

-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());

  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,

--- a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
@@ -38,7 +38,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
    bs_ptr[i] = bias_ptr[i];
  }

-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());

  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,

--- a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
@@ -50,7 +50,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());

  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,

--- a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNRELU_OP

 #include "operators/kernel/conv_bn_relu_kernel.h"
+#include "fpga/V2/filter.h"

 namespace paddle_mobile {
 namespace operators {
@@ -50,7 +51,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());

  fpga::SplitConvArgs conv_arg = {0};
  fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -2555,7 +2555,7 @@ class QuantizeParam : public OpParam {
    output_ = OutFrom<GType>(outputs, scope);
    // online
    // scale = max(abs(x))
-    online_scale_ = GetVarValue<GType>("OutScale", outputs, scope);
+    online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
    // offline
    if (HasAttr("static_scale", attrs)) {
      is_static_ = true;
@@ -2565,6 +2565,11 @@ class QuantizeParam : public OpParam {
    if (HasAttr("round_type", attrs)) {
      round_type_ = GetAttr<RoundType>("round_type", attrs);
    }
+    // get paddings
+    paddings_ = std::vector<int>({0, 0});
+    if (HasAttr("paddings", attrs)) {
+      paddings_ = GetAttr<vector<int>>("paddings", attrs);
+    }
  }

 public:
@@ -2598,7 +2603,7 @@ class DequantizeParam : public OpParam {
                  const AttributeMap &attrs, const Scope &scope) {
    input_ = InputXFrom<GType>(inputs, scope);
    output_ = OutFrom<GType>(outputs, scope);
-    activation_scale_ = GetVarValue<GType>("Scale", inputs, scope);
+    activation_scale_ = OpParam::GetVarValue<GType>("Scale", inputs, scope);
    // dequantization is performed as x = x / static_scale / online_scale
    if (HasAttr("weight_scale", attrs)) {
      weight_scale_ = GetAttr<float>("weight_scale", attrs);
@@ -2617,5 +2622,44 @@ class DequantizeParam : public OpParam {
 };
 #endif

+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+template <typename Dtype>
+class FusionDequantAddBNReluParam : public DequantizeParam<Dtype> {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  FusionDequantAddBNReluParam(const VariableNameMap &inputs,
+                              const VariableNameMap &outputs,
+                              const AttributeMap &attrs, const Scope &scope)
+      : DequantizeParam<Dtype>(inputs, outputs, attrs, scope) {
+    // elementwise-add params: "axis" attribute plus the add operand (bias)
+    axis_ = OpParam::GetAttr<int>("axis", attrs);
+    bias_ = OpParam::InputYFrom<GType>(inputs, scope);
+    // batch-norm params, looked up under the "BN"-prefixed input names
+    bn_mean_ = OpParam::GetVarValue<GType>("BNMean", inputs, scope);
+    bn_variance_ = OpParam::GetVarValue<GType>("BNVariance", inputs, scope);
+    bn_scale_ = OpParam::GetVarValue<GType>("BNScale", inputs, scope);
+    bn_bias_ = OpParam::GetVarValue<GType>("BNBias", inputs, scope);
+    epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
+    // output tensor of the fused op
+    output_ = OpParam::OutFrom<GType>(outputs, scope);
+  }
+
+ public:
+  // elementwise add: "axis" attribute and the tensor that is added
+  int axis_;
+  RType *bias_;
+  // batch norm: statistics, affine parameters and the "epsilon" attribute
+  RType *bn_mean_;
+  RType *bn_variance_;
+  RType *bn_scale_;
+  RType *bn_bias_;
+  float epsilon_;
+  // output; NOTE(review): DequantizeParam's ctor also assigns an output_
+  RType *output_;
+};
+#endif
+
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/quantize_op.cpp
+++ b/src/operators/quantize_op.cpp
@@ -22,7 +22,10 @@ namespace operators {

 template <typename DeviceType, typename T>
 void QuantizeOp<DeviceType, T>::InferShape() const {
-  const auto &input_dims = this->param_.input_->dims();
+  auto input_dims = this->param_.input_->dims();
+  const std::vector<int> &paddings = this->param_.paddings_;
+  input_dims[2] += 2 * paddings[0];
+  input_dims[3] += 2 * paddings[1];
  this->param_.output_->Resize(input_dims);
  auto scale_dims = framework::make_ddim(std::vector<int>{1});
  this->param_.online_scale_->Resize(scale_dims);

--- a/test/operators/test_quantize_op.cpp
+++ b/test/operators/test_quantize_op.cpp
@@ -12,58 +12,131 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/quantize_op.h"

 namespace paddle_mobile {
-
-static float find_abs_max(const Tensor *input) {
-  float max_abs = 0.f;
-  const float *x = input->data<const float>();
-  size_t size = input->numel();
-  for (size_t i = 0; i < size; ++i) {
-    float value = std::abs(x[i]);
-    if (value > max_abs) {
-      max_abs = value;
-    }
-  }
-  return max_abs;
+namespace round {
+enum RoundType {
+  RoundToEven = 0,
+  RoundAwayZero = 1,
+  RoundTowardsZero = 2,
+};
 }

-static void quantize_round_to_even(const Tensor *input, const float scale,
-                                   Tensor *output) {
-  const float *x = input->data<const float>();
-  int8_t *y = output->mutable_data<int8_t>();
-  size_t size = input->numel();
-  for (size_t i = 0; i < size; ++i) {
-    float value = x[i] * scale;
-    float v = round(value);
+template <round::RoundType T>
+struct Round {  // generic case: operator() is only declared, not defined
+  int8_t operator()(float x);
+};
+
+template <>
+struct Round<round::RoundAwayZero> {  // std::round, then narrowed to int8_t
+  int8_t operator()(float x) { return std::round(x); }
+};
+
+template <>
+struct Round<round::RoundTowardsZero> {  // plain float -> int8_t conversion
+  int8_t operator()(float x) { return int8_t(x); }
+};
+
+template <>
+struct Round<round::RoundToEven> {  // exact .5 ties go to the even integer
+  int8_t operator()(float x) {
+    int8_t ret = 0;
+    float v = std::round(x);
     int32_t q = (int32_t)v;
-    if (abs(abs(q - value) - 0.5) > 0) {
-      y[i] = q;
+    if (std::abs(std::abs(q - x) - 0.5) > 0) {  // std:: keeps abs in float
+      ret = q;  // not exactly halfway: keep the std::round result
     } else {
       if (abs(q) % 2 == 0) {
-        y[i] = q;
+        ret = q;
       } else {
-        y[i] = q + ((q > 0) ? -1 : 1);
+        ret = q + ((q > 0) ? -1 : 1);  // odd on a tie: step toward zero
+      }
+    }
+    return ret;
+  }
+};
+
+template <round::RoundType T>  // T selects the Round<T> functor applied below
+static void quantize(const Tensor *input, const float scale, const int pad,
+                     const int8_t pad_val, Tensor *output) {
+  int batch_size = input->dims()[0];
+  int channels = input->dims()[1];
+  int input_h = input->dims()[2];
+  int input_w = input->dims()[3];
+  int output_h = output->dims()[2];
+  int output_w = output->dims()[3];
+  size_t input_spatial = input_h * input_w;
+  size_t output_spatial = output_h * output_w;
+  const float *x = input->data<const float>();
+  int8_t *y = output->mutable_data<int8_t>();
+
+  for (int nc = 0; nc < batch_size * channels; ++nc) {  // one plane per pass
+    const float *xh = x + nc * input_spatial;
+    int8_t *yh = y + nc * output_spatial;
+    // pad top: `pad` full output rows of pad_val
+    for (int h = 0; h < pad; ++h, yh += output_w) {
+      for (int w = 0; w < output_w; ++w) {
+        yh[w] = pad_val;
+      }
+    }
+    for (int h = 0; h < input_h; ++h, yh += output_w, xh += input_w) {
+      // pad left: first `pad` entries of the row
+      for (int w = 0; w < pad; ++w) {
+        yh[w] = pad_val;
+      }
+      for (int w = 0; w < input_w; ++w) {  // scaled + rounded input values
+        yh[w + pad] = Round<T>()(xh[w] * scale);
+      }
+      // pad right: `pad` entries after the copied input row
+      for (int w = 0; w < pad; ++w) {
+        yh[pad + input_w + w] = pad_val;
+      }
+    }
+    // pad bottom: `pad` full output rows of pad_val
+    for (int h = 0; h < pad; ++h, yh += output_w) {
+      for (int w = 0; w < output_w; ++w) {
+        yh[w] = pad_val;
       }
     }
   }
 }

-static void quantize_round_to_nearest(const Tensor *input, const float scale,
-                                      Tensor *output) {
+static float find_abs_max(const Tensor *input) {  // max |x| over all elements
+  float max_abs = 0.f;
   const float *x = input->data<const float>();
-  int8_t *y = output->mutable_data<int8_t>();
   size_t size = input->numel();
   for (size_t i = 0; i < size; ++i) {
-    y[i] = round(x[i] * scale);
+    float value = std::abs(x[i]);  // magnitude of the current element
+    if (value > max_abs) {
+      max_abs = value;
+    }
   }
+  return max_abs;
 }

-int TestQuqntizeOp() {
-  framework::DDim dim = framework::make_ddim({1, 3, 224, 224});
+int TestQuqntizeOp(int argc, char *argv[]) {
+  if (argc < 5) {
+    std::cout
+        << "Usage: ./test-quantize-op batch_size channel height width [pad]"
+        << std::endl;
+    return 1;
+  }
+  int pad = 0;
+  int batch_size = atoi(argv[1]);
+  int channel = atoi(argv[2]);
+  int height = atoi(argv[3]);
+  int width = atoi(argv[4]);
+  if (argc == 6) {
+    pad = atoi(argv[5]);
+  }
+  std::cout << "batch_size: " << batch_size << ", channel: " << channel
+            << ", height: " << height << ", width: " << width << std::endl;
+  framework::DDim dim =
+      framework::make_ddim({batch_size, channel, height, width});

  VariableNameMap inputs;
  VariableNameMap outputs;
@@ -80,6 +153,7 @@ int TestQuqntizeOp() {
  auto output_scale_var = scope.get()->Var("output_scale");

  framework::AttributeMap attrs;
+  attrs["paddings"].Set<vector<int>>(std::vector<int>({pad, pad}));
  auto *op = new operators::QuantizeOp<CPU, float>("quantize", inputs, outputs,
                                                   attrs, scope);
  op->InferShape();
@@ -96,10 +170,11 @@ int TestQuqntizeOp() {
                        output_scale_cmp, output_scale_data[0]);

  framework::Tensor output_cmp;
-  output_cmp.Resize(dim);
+  output_cmp.Resize(output->dims());
  float scale = 127 / output_scale_cmp;
-  // quantize_round_to_even(input, scale, &output_cmp);
-  quantize_round_to_nearest(input, scale, &output_cmp);
+  // quantize<round::RoundToEven>(input, scale, pad, 0, &output_cmp);
+  // quantize<round::RoundAwayZero>(input, scale, pad, 0, &output_cmp);
+  quantize<round::RoundTowardsZero>(input, scale, pad, 0, &output_cmp);
  int8_t *output_cmp_data = output_cmp.data<int8_t>();
  for (int i = 0; i < output->numel(); ++i) {
    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
@@ -113,4 +188,6 @@ int TestQuqntizeOp() {

 }  // namespace paddle_mobile

-int main() { return paddle_mobile::TestQuqntizeOp(); }
+int main(int argc, char *argv[]) {
+  return paddle_mobile::TestQuqntizeOp(argc, argv);  // exit code = test result
+}
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -250,6 +250,7 @@ if(NOT FOUND_MATCH)
  set(SUM_OP ON)
  set(QUANT_OP ON)
  set(DEQUANT_OP ON)
+  set(FUSION_DEQUANT_ADD_BN_RELU ON)
 endif()

  # option(BATCHNORM_OP "" ON)
@@ -454,6 +455,9 @@ endif()
 if (DEQUANT_OP)
  add_definitions(-DDEQUANT_OP)
 endif()
+if (FUSION_DEQUANT_ADD_BN_RELU)
+  add_definitions(-DFUSION_DEQUANT_ADD_BN_RELU_OP)
+endif()

 if (TANH_OP)
  add_definitions(-DTANH_OP)
@@ -466,4 +470,4 @@ if (FUSION_DECONVADD_OP)
 endif()
 if (FUSION_DECONVADDRELU_OP)
  add_definitions(-DFUSION_DECONVADDRELU_OP)
-endif()
\ No newline at end of file
+endif()