add fusion_conv_add_relu_int8_op and unit test.

a80b04b9 · Zhen Wang · ff8141ee · a80b04b9 · a80b04b9 · a80b04b9
17 changed file
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -24,6 +24,7 @@ const char *G_OP_TYPE_CONCAT = "concat";
 const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
 const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
 const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8 = "fusion_conv_add_relu_int8";
 const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
 const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
 const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
@@ -111,6 +112,7 @@ std::unordered_map<
        {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
        {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input"}, {"Output"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},

--- a/src/common/types.h
+++ b/src/common/types.h
@@ -99,6 +99,7 @@ extern const char *G_OP_TYPE_BOX_CODER;
 extern const char *G_OP_TYPE_CONCAT;
 extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
 extern const char *G_OP_TYPE_FC;

--- a/src/framework/op_registry.h
+++ b/src/framework/op_registry.h
@@ -98,6 +98,24 @@ class OpRegistry {
  }
 };

+#define REGISTER_OPERATOR_INT8(op_type, op_class, device_name, device_type) \
+  template class op_class<device_type, int8_t>;                             \
+  template <typename Dtype, typename T>                                     \
+  class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> {    \
+   public:                                                                  \
+    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class);    \
+  };                                                                        \
+  static paddle_mobile::framework::OperatorRegistrar<                       \
+      device_type, _OpClass_##op_type##_##device_name<device_type, int8_t>> \
+      __op_registrar_##op_type##_##device_name(#op_type);                   \
+  int TouchOpRegistrar_##op_type##_##device_name() {                        \
+    __op_registrar_##op_type##_##device_name.Touch();                       \
+    return 0;                                                               \
+  }
+
+#define REGISTER_OPERATOR_CPU_INT8(op_type, op_class) \
+  REGISTER_OPERATOR_INT8(op_type, op_class, cpu, paddle_mobile::CPU);
+
 #define REGISTER_OPERATOR(op_type, op_class, device_name, device_type)     \
  template class op_class<device_type, float>;                             \
  template <typename Dtype, typename T>                                    \

--- a/src/operators/fusion_conv_add_relu_int8_op.cpp
+++ b/src/operators/fusion_conv_add_relu_int8_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_INT8_OP
+
+#include "operators/fusion_conv_add_relu_int8_op.h"
+#include <vector>
+#include "operators/math/conv_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void FusionConvAddReluInt8Op<Dtype, T>::InferShape() const {
+  auto in_dims = this->param_.Input()->dims();
+  auto filter_dims = this->param_.Filter()->dims();
+  const std::vector<int> &strides = this->param_.Strides();
+  std::vector<int> paddings = this->param_.Paddings();
+  int groups = this->param_.Groups();
+  std::vector<int> dilations = this->param_.Dilations();
+
+  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
+                         dilations.size() == paddings.size() &&
+                         paddings.size() == strides.size()),
+                        "ConvParam is not suitable");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    output_shape.push_back(
+        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                             paddings[i], strides[i]));
+  }
+  framework::DDim ddim = framework::make_ddim(output_shape);
+  this->param_.Output()->Resize(ddim);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU_INT8(fusion_conv_add_relu_int8,
+                           ops::FusionConvAddReluInt8Op);
+#endif
+#endif  // FUSION_CONVADDRELU_INT8_OP
--- a/src/operators/fusion_conv_add_relu_int8_op.h
+++ b/src/operators/fusion_conv_add_relu_int8_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_INT8_OP
+#pragma once
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/conv_add_relu_int8_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+template <typename DeviceType, typename T>
+class FusionConvAddReluInt8Op
+    : public framework::OperatorWithKernel<
+          DeviceType, FusionConvAddReluInt8Param<DeviceType>,
+          operators::ConvAddReluInt8Kernel<DeviceType, T>> {
+ public:
+  FusionConvAddReluInt8Op(const string &type, const VariableNameMap &inputs,
+                          const VariableNameMap &outputs,
+                          const framework::AttributeMap &attrs,
+                          std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionConvAddReluInt8Param<DeviceType>,
+            operators::ConvAddReluInt8Kernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+  void InferShape() const override;
+
+ protected:
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif  // FUSION_CONVADDRELU_INT8_OP
--- a/src/operators/kernel/arm/conv_add_relu_int8_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_int8_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_INT8_OP
+
+#include "operators/kernel/conv_add_relu_int8_kernel.h"
+#include "operators/kernel/central-arm-func/conv_add_relu_int8_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvAddReluInt8Kernel<CPU, int8_t>::Init(
+    FusionConvAddReluInt8Param<CPU> *param) {
+  return true;
+}
+
+template <>
+void ConvAddReluInt8Kernel<CPU, int8_t>::Compute(
+    const FusionConvAddReluInt8Param<CPU> &param) {
+  ConvAddReluInt8Compute<int8_t>(param);
+}
+template class ConvAddReluInt8Kernel<CPU, int8_t>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_CONVADDRELU_INT8_OP
--- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
@@ -33,6 +33,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
  int axis = param.Axis();
  Tensor *output = param.Output();
  float *biase_data = bias.data<float>();
+  output->mutable_data<P>();

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();

--- a/src/operators/kernel/central-arm-func/conv_add_relu_int8_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_int8_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_INT8_OP
+
+#pragma once
+#include <vector>
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void ConvAddReluInt8Compute(const FusionConvAddReluInt8Param<CPU> &param) {
+  const Tensor *input = param.Input();
+  Tensor filter = *param.Filter();
+  Tensor bias = *param.Bias();
+  Tensor scale = *param.InputScale();
+  int32_t axis = param.Axis();
+  Tensor *output = param.Output();
+  output->mutable_data<P>();
+
+  int32_t *biase_data = bias.data<int32_t>();
+  float scale_v = scale.data<float>()[0];
+
+  int32_t groups = param.Groups();
+  std::vector<int32_t> strides = param.Strides();
+  std::vector<int32_t> paddings = param.Paddings();
+  std::vector<int32_t> dilations = param.Dilations();
+
+  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
+
+  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = input->dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+  framework::DDim col_matrix_shape =
+      framework::flatten_to_2d(col_shape, data_dim + 1);
+
+  bool is_expand =
+      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
+  Tensor col;
+  Tensor col_matrix;
+  if (is_expand) {
+    col.mutable_data<P>(col_shape);
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+
+  framework::DDim input_shape = framework::slice_ddim(
+      input->dims(), 1, static_cast<int32_t>(input->dims().size()));
+
+  framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                         filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+  framework::DDim output_matrix_shape = {
+      output->dims()[1],
+      output->numel() / (output->dims()[0] * output->dims()[1])};
+
+  // convolution operator: im2col(or vol2col) + gemm
+  int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
+  int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
+
+  math::Vol2ColFunctor<CPU, P> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
+
+  for (int32_t i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+    for (int32_t g = 0; g < groups; g++) {
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+      if (!is_expand) {
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        // im2col
+        im2col(in_slice, dilations, strides,
+               std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
+                                    paddings[1]},
+               &col);
+      } else if (data_dim == 3U) {
+        // vol2col
+        vol2col(in_slice, dilations, strides, paddings, &col);
+      }
+
+      // gemm
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+
+      math::matmul_int8(filter_slice, false, col_matrix, false, scale_v,
+                        &out_slice, static_cast<float>(0), true, biase_data);
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_CONVADDRELU_INT8_OP
--- a/src/operators/kernel/conv_add_relu_int8_kernel.h
+++ b/src/operators/kernel/conv_add_relu_int8_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_INT8_OP
+
+#pragma once
+
+#include <vector>
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::DDim;
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class ConvAddReluInt8Kernel
+    : public OpKernelBase<DeviceType, FusionConvAddReluInt8Param<DeviceType>> {
+ public:
+  void Compute(const FusionConvAddReluInt8Param<DeviceType> &param);
+  bool Init(FusionConvAddReluInt8Param<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_CONVADDRELU_INT8_OP
--- a/src/operators/math/gemm_int8.cpp
+++ b/src/operators/math/gemm_int8.cpp
@@ -243,6 +243,9 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
 #endif  // __ARM_NEON
 }

+// The core idea of AddDot4x2 function is borrowed from the Google's gemmlowp
+// open source library. The address of gemmlowp is
+// https://github.com/google/gemmlowp.
 void Gemm::AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                     int32_t ldc) {
 #if __ARM_NEON

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -437,7 +437,7 @@ class ConvParam : public OpParam {

 #endif

- private:
+ protected:
  RType *input_;
  mutable RType *output_;
  mutable RType *filter_;
@@ -1709,6 +1709,35 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
 };
 #endif

+#ifdef FUSION_CONVADDRELU_INT8_OP
+template <typename Dtype>
+class FusionConvAddReluInt8Param : public ConvParam<Dtype> {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  FusionConvAddReluInt8Param(const VariableNameMap &inputs,
+                             const VariableNameMap &outputs,
+                             const AttributeMap &attrs, const Scope &scope)
+      : ConvParam<Dtype>(inputs, outputs, attrs, scope) {
+    scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
+    bias_ = OpParam::InputYFrom<GType>(inputs, scope);
+    axis_ = OpParam::GetAttr<int>("axis", attrs);
+  }
+
+  const RType *InputScale() const { return scale_; }
+
+  RType *Bias() const { return bias_; }
+
+  const int &Axis() const { return axis_; }
+
+ protected:
+  RType *scale_;
+  RType *bias_;
+  int axis_;
+};
+#endif
+
 #ifdef FUSION_CONVADDPRELU_OP
 template <typename Dtype>
 class FusionConvAddPReluParam : public ConvParam<Dtype> {

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -324,6 +324,10 @@ if (NOT FOUND_MATCH)
    ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-conv-add-relu-op paddle-mobile)

+    # gen test
+    ADD_EXECUTABLE(test-conv-add-relu-int8-op operators/test_fusion_conv_add_relu_int8_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-conv-add-relu-int8-op paddle-mobile)
+
    # gen test
    ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)

--- a/test/common/test_gemm_int8_accuracy.cpp
+++ b/test/common/test_gemm_int8_accuracy.cpp
@@ -65,12 +65,19 @@ int32_t qadd_int32(int32_t l, int32_t r) {
    return static_cast<int32_t>(res);
 }

+// round to zero
+float round2zero(float v) {
+  float res;
+  if (v > 0)
+    res = std::floor(v);
+  else if (v < 0)
+    res = std::ceil(v);
+  return res;
+}
+
 int8_t qscale_int32(int32_t v, float scale) {
  float res = static_cast<float>(v) * scale;
-  if (res > 0)
-    res = std::floor(res);
-  else if (res < 0)
-    res = std::ceil(res);  // round to zero
+  res = round2zero(res);
  if (res > 127)
    return static_cast<int8_t>(127);
  else if (res < -127)
@@ -155,7 +162,7 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
  int lda = k;
  int ldb = n;
  int ldc = n;
-  float scale = 0.00628;
+  float scale = 0.00628f;
  default_random_engine e;
  uniform_int_distribution<int8_t> pixel(-127, 127);
  int8_t *a = static_cast<int8_t *>(

--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -103,13 +103,13 @@ int main() {
  // warm-up 10 times
  for (int j = 0; j < 10; ++j) {
    paddle_mobile::operators::math::matmul_int8(
-        aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int8,
+        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
        static_cast<float>(0), true, &bias_data[0]);
  }
  auto time5 = time();
  for (int j = 0; j < 10; ++j) {
    paddle_mobile::operators::math::matmul_int8(
-        aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int8,
+        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
        static_cast<float>(0), true, &bias_data[0]);
  }
  auto time6 = time();

--- a/test/operators/test_fusion_conv_add_relu_int8_op.cpp
+++ b/test/operators/test_fusion_conv_add_relu_int8_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/fusion_conv_add_relu_int8_op.h"
+
+namespace paddle_mobile {
+int32_t qadd_int32(int32_t l, int32_t r) {
+  int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
+  if (res > INT_MAX)
+    return INT_MAX;
+  else if (res < INT_MIN)
+    return INT_MIN;
+  else
+    return static_cast<int32_t>(res);
+}
+
+// round to zero
+float round2zero(float v) {
+  float res;
+  if (v > 0)
+    res = std::floor(v);
+  else if (v < 0)
+    res = std::ceil(v);
+  return res;
+}
+
+int8_t qscale_int32(int32_t v, float scale) {
+  float res = static_cast<float>(v) * scale;
+  res = round2zero(res);
+  if (res > 127)
+    return static_cast<int8_t>(127);
+  else if (res < -127)
+    return static_cast<int8_t>(-127);
+  else
+    return static_cast<int8_t>(res);
+}
+
+// Reference convolution from Caffe for checking results.
+// accumulate through explicit loops over input, output, and filters.
+template <typename T>
+void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
+            const framework::Tensor *bias, const framework::AttributeMap &attrs,
+            framework::Tensor *output, float scale) {
+  framework::AttrReader attr_reader(attrs);
+  std::vector<int> paddings = attr_reader.Get<std::vector<int>>("paddings");
+  std::vector<int> strides = attr_reader.Get<std::vector<int>>("strides");
+  std::vector<int> dilations = attr_reader.Get<std::vector<int>>("dilations");
+  int groups = attr_reader.Get<int>("groups");
+  int kernel_h = filter->dims()[2];
+  int kernel_w = filter->dims()[3];
+  int pad_h = paddings[0];
+  int pad_w = paddings[1];
+  int stride_h = strides[0];
+  int stride_w = strides[1];
+  int dilation_h = dilations[0];
+  int dilation_w = dilations[1];
+  auto in_shape = input->dims();
+  auto out_shape = output->dims();
+
+  const bool has_depth = 0;
+  int kernel_d, pad_d, stride_d, dilation_d;
+  if (has_depth) {
+    kernel_d = kernel_h;
+    stride_d = stride_h;
+    pad_d = pad_h;
+    dilation_d = dilation_h;
+  } else {
+    kernel_d = stride_d = dilation_d = 1;
+    pad_d = 0;
+  }
+  // Groups
+  int o_g = out_shape[1] / groups;
+  int k_g = in_shape[1] / groups;
+  int o_head, k_head;
+  // Convolution
+  vector<int> weight_offset(4 + has_depth);
+  vector<int> in_offset(4 + has_depth);
+  vector<int> out_offset(4 + has_depth);
+  auto offset = [](const framework::Tensor *input, const vector<int> &indics) {
+    framework::DDim shape = input->dims();
+    size_t count = 0;
+    for (int i = 0; i < indics.size(); ++i) {
+      count *= shape[i];
+      count += indics[i];
+    }
+    return count;
+  };
+
+  const T *in_data = input->data<T>();
+  const T *w_data = filter->data<T>();
+  framework::Tensor output_32;
+  int32_t *out_data_32 = output_32.mutable_data<int32_t>(out_shape);
+  memset(out_data_32, 0, output_32.numel() * sizeof(int32_t));
+  for (int n = 0; n < out_shape[0]; n++) {
+    for (int g = 0; g < groups; g++) {
+      o_head = o_g * g;
+      k_head = k_g * g;
+      for (int o = 0; o < o_g; o++) {
+        for (int k = 0; k < k_g; k++) {
+          for (int z = 0; z < (has_depth ? out_shape[2] : 1); z++) {
+            for (int y = 0; y < out_shape[2 + has_depth]; y++) {
+              for (int x = 0; x < out_shape[3 + has_depth]; x++) {
+                for (int r = 0; r < kernel_d; r++) {
+                  for (int p = 0; p < kernel_h; p++) {
+                    for (int q = 0; q < kernel_w; q++) {
+                      int in_z = z * stride_d - pad_d + r * dilation_d;
+                      int in_y = y * stride_h - pad_h + p * dilation_h;
+                      int in_x = x * stride_w - pad_w + q * dilation_w;
+                      if (in_z >= 0 && in_z < (has_depth ? in_shape[2] : 1) &&
+                          in_y >= 0 && in_y < in_shape[2 + has_depth] &&
+                          in_x >= 0 && in_x < in_shape[3 + has_depth]) {
+                        weight_offset[0] = o + o_head;
+                        weight_offset[1] = k;
+                        if (has_depth) {
+                          weight_offset[2] = r;
+                        }
+                        weight_offset[2 + has_depth] = p;
+                        weight_offset[3 + has_depth] = q;
+                        in_offset[0] = n;
+                        in_offset[1] = k + k_head;
+                        if (has_depth) {
+                          in_offset[2] = in_z;
+                        }
+                        in_offset[2 + has_depth] = in_y;
+                        in_offset[3 + has_depth] = in_x;
+                        out_offset[0] = n;
+                        out_offset[1] = o + o_head;
+                        if (has_depth) {
+                          out_offset[2] = z;
+                        }
+                        out_offset[2 + has_depth] = y;
+                        out_offset[3 + has_depth] = x;
+
+                        out_data_32[offset(output, out_offset)] +=
+                            in_data[offset(input, in_offset)] *
+                            w_data[offset(filter, weight_offset)];
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  T *out_data = output->mutable_data<T>();
+  int32_t n = out_shape[0];
+  int32_t c = out_shape[1];
+  int32_t h = out_shape[2];
+  int32_t w = out_shape[3];
+  const int32_t *bias_data = bias->data<int32_t>();
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < c; ++j) {
+      int32_t bias_v = bias_data[j];
+      for (int k = 0; k < h; ++k) {
+        for (int l = 0; l < w; ++l) {
+          int32_t tmp = out_data_32[i * c * h * w + j * h * w + k * w + l];
+          tmp = qadd_int32(tmp, bias_v);
+          tmp = std::max(0, tmp);
+          out_data[i * c * h * w + j * h * w + k * w + l] =
+              qscale_int32(tmp, scale);
+        }
+      }
+    }
+  }
+}
+
+template <typename T, int Kernel, int Pad, int Stride>
+int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
+  int kernel_h = Kernel;
+  int kernel_w = Kernel;
+  int pad_h = Pad;
+  int pad_w = Pad;
+  int stride_h = Stride;
+  int stride_w = Stride;
+  int dilation_h = 1;
+  int dilation_w = 1;
+
+  int batch_size = 1;
+  int input_c = in_channels;
+  int input_h = in_height;
+  int input_w = in_width;
+  int output_c = out_channels;
+  framework::DDim input_shape =
+      framework::make_ddim({batch_size, input_c, input_h, input_w});
+  framework::DDim filter_shape =
+      framework::make_ddim({output_c, input_c, kernel_h, kernel_w});
+
+  int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+  int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+  int output_h = (input_h + 2 * pad_h - kernel_extent_h) / stride_h + 1;
+  int output_w = (input_w + 2 * pad_w - kernel_extent_w) / stride_w + 1;
+  framework::DDim output_shape = framework::make_ddim(
+      std::vector<int>({batch_size, output_c, output_h, output_w}));
+
+  framework::DDim bias_shape = framework::make_ddim({output_c});
+
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["Input"] = std::vector<std::string>({"input"});
+  inputs["Filter"] = std::vector<std::string>({"filter"});
+  inputs["Scale"] = std::vector<std::string>({"scale"});
+  inputs["Y"] = std::vector<std::string>({"y"});
+  outputs["Output"] = std::vector<std::string>({"output"});
+
+  auto input_var = scope.get()->Var("input");
+  auto input = input_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<T>(input, input_shape, -127, 127);
+
+  auto filter_var = scope.get()->Var("filter");
+  auto filter = filter_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<T>(filter, filter_shape, -127, 127);
+
+  auto scale_var = scope.get()->Var("scale");
+  auto scale = scale_var->template GetMutable<framework::LoDTensor>();
+  scale->Resize(framework::make_ddim({1}));
+  float scale_v = 0.000828f;
+  scale->mutable_data<float>()[0] = scale_v;
+
+  auto bias_var = scope.get()->Var("y");
+  auto bias = bias_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<int32_t>(bias, bias_shape, -127, 127);
+
+  auto output_var = scope.get()->Var("output");
+  framework::AttributeMap attrs;
+  attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
+  attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
+  attrs["dilations"].Set<vector<int>>(
+      std::vector<int>({dilation_h, dilation_w}));
+  attrs["groups"].Set<int>(1);
+  attrs["axis"].Set<int>(0);
+
+  auto *op = new operators::FusionConvAddReluInt8Op<CPU, int8_t>(
+      "fusion_conv_add_relu_int8", inputs, outputs, attrs, scope);
+  op->InferShape();
+  op->Init();
+  op->Run();
+
+  framework::Tensor output_cmp;
+  output_cmp.mutable_data<T>(output_shape);
+  conv2d<T>(input, filter, bias, attrs, &output_cmp, scale_v);
+
+  // compare results
+  int eq = 0;
+  int neq = 0;
+  auto output = output_var->template Get<framework::LoDTensor>();
+  const T *output_data = output->data<T>();
+  T *output_cmp_data = output_cmp.data<T>();
+  for (int i = 0; i < output->numel(); ++i) {
+    PADDLE_MOBILE_ENFORCE(
+        output_data[i] == output_cmp_data[i],
+        "The execution of test_fusion_conv_add_relu_int8_op is failed!");
+    if (output_data[i] == output_cmp_data[i]) {
+      ++eq;
+    } else {
+      ++neq;
+    }
+  }
+  std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
+  delete op;
+  return 0;
+}
+
+}  // namespace paddle_mobile
+
+int main(int argc, char *argv[]) {
+  if (argc < 5) {
+    LOG(paddle_mobile::kLOG_INFO)
+        << "Usage:\n"
+        << "  ./test-conv-add-relu-int8-op in_channels in_height in_width "
+           "out_channels\n"
+        << "  params:\n"
+        << "   -in_channels: int, input image's channels\n"
+        << "   -in_height: int, input image's height\n"
+        << "   -in_width: int, input image's width\n"
+        << "   -out_channels: int, conv output channels\n";
+    return 1;
+  }
+  int in_channels = atoi(argv[1]);
+  int in_height = atoi(argv[2]);
+  int in_width = atoi(argv[3]);
+  int out_channels = atoi(argv[4]);
+  // kernel = 3, pad = 1, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "int8_t, kernel=3, pad=1, stride=1";
+  paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 7, pad = 0, stride = 2
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
+  paddle_mobile::TestConvOp<int8_t, 7, 0, 2>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 7, pad = 1, stride = 2
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
+  paddle_mobile::TestConvOp<int8_t, 7, 1, 2>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 7, pad = 3, stride = 2
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
+  paddle_mobile::TestConvOp<int8_t, 7, 3, 2>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 7, pad = 0, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
+  paddle_mobile::TestConvOp<int8_t, 7, 0, 1>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 7, pad = 1, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
+  paddle_mobile::TestConvOp<int8_t, 7, 1, 1>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 7, pad = 3, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
+  paddle_mobile::TestConvOp<int8_t, 7, 3, 1>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 7, pad = 5, stride = 3
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
+  paddle_mobile::TestConvOp<int8_t, 7, 5, 3>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 7, pad = 3, stride = 4
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
+  paddle_mobile::TestConvOp<int8_t, 7, 3, 4>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 3, pad = 0, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
+  paddle_mobile::TestConvOp<int8_t, 3, 0, 1>(in_channels, in_height, in_width,
+                                             out_channels);
+  // kernel = 3, pad = 1, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
+  paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
+                                             out_channels);
+
+  // kernel = 5, pad = 0, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
+  paddle_mobile::TestConvOp<int8_t, 5, 0, 1>(in_channels, in_height, in_width,
+                                             out_channels);
+
+  // kernel = 5, pad = 2, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
+  paddle_mobile::TestConvOp<int8_t, 5, 2, 1>(in_channels, in_height, in_width,
+                                             out_channels);
+}
--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
@@ -79,14 +79,14 @@ int TestMulOP() {
    PADDLE_MOBILE_ENFORCE(
        output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
        static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
-    if (static_cast<int>(output_data[i] == c[i])) {
+    if (output_data[i] == c[i]) {
      ++eq;
    } else {
      ++neq;
    }
  }
-  DLOG << "mnk=" << m << " " << n << " " << k << "   eq=" << eq
-       << " neq=" << neq;
+  std::cout << "mnk=" << m << " " << n << " " << k << "   eq=" << eq
+            << " neq=" << neq << std::endl;
  delete op;
  return 0;
 }

--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -213,6 +213,7 @@ if(NOT FOUND_MATCH)
  set(FUSION_CONVADD_OP ON)
  set(FUSION_CONVADDPRELU_OP ON)
  set(FUSION_CONVADDRELU_OP ON)
+  set(FUSION_CONVADDRELU_INT8_OP ON)
  set(FUSION_FC_OP ON)
  set(LRN_OP ON)
  set(MUL_OP ON)
@@ -306,6 +307,9 @@ endif()
 if (FUSION_CONVADDRELU_OP)
  add_definitions(-DFUSION_CONVADDRELU_OP)
 endif()
+if (FUSION_CONVADDRELU_INT8_OP)
+  add_definitions(-DFUSION_CONVADDRELU_INT8_OP)
+endif()
 if (FUSION_CONVADDPRELU_OP)
  add_definitions(-DFUSION_CONVADDPRELU_OP)
 endif()