Update the code style according to review suggestions.

f35cd850 · ZhenWang · 36607770 · f35cd850 · f35cd850 · f35cd850
17 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,7 +34,7 @@ endif()
 if(DEBUGING)
    message(STATUS "debugging mode")
-    add_definitions(-DPADDLE_MOBILE_DEBUG)
+#    add_definitions(-DPADDLE_MOBILE_DEBUG)
 else()
 endif()

--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -114,7 +114,7 @@ std::unordered_map<
        {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
        {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
-        {G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input"}, {"Output"}}},
+        {G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -153,7 +153,8 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
  paddle_mobile::operators::math::Gemm gemm;
  auto time1 = paddle_mobile::time();
  gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
-             static_cast<float>(0), c, ldc, false, nullptr);
+             static_cast<float>(0), c, ldc, false,
+             static_cast<float *>(nullptr));
  auto time2 = paddle_mobile::time();
  double cost = paddle_mobile::time_diff(time1, time2);
  paddle_mobile::memory::Free(a);

--- a/src/operators/fusion_conv_add_relu_int8_op.h
+++ b/src/operators/fusion_conv_add_relu_int8_op.h
@@ -16,28 +16,26 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include "framework/operator.h"
-#include "operators/kernel/conv_add_relu_int8_kernel.h"
+#include "operators/kernel/conv_add_relu_kernel.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
-using std::string;
 template <typename DeviceType, typename T>
 class FusionConvAddReluInt8Op
    : public framework::OperatorWithKernel<
-          DeviceType, FusionConvAddReluInt8Param<DeviceType>,
+          DeviceType, FusionConvAddReluParam<DeviceType>,
-          operators::ConvAddReluInt8Kernel<DeviceType, T>> {
+          operators::ConvAddReluKernel<DeviceType, T>> {
 public:
-  FusionConvAddReluInt8Op(const string &type, const VariableNameMap &inputs,
+  FusionConvAddReluInt8Op(const std::string &type,
+                          const VariableNameMap &inputs,
                          const VariableNameMap &outputs,
                          const framework::AttributeMap &attrs,
                          std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
-            DeviceType, FusionConvAddReluInt8Param<DeviceType>,
+            DeviceType, FusionConvAddReluParam<DeviceType>,
-            operators::ConvAddReluInt8Kernel<DeviceType, T>>(
+            operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
-            type, inputs, outputs, attrs, scope) {}
+                                                         attrs, scope) {}
  void InferShape() const override;
- protected:
 };
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/arm/conv_add_relu_int8_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_int8_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDRELU_INT8_OP
-#include "operators/kernel/conv_add_relu_int8_kernel.h"
-#include "operators/kernel/central-arm-func/conv_add_relu_int8_arm_func.h"
-namespace paddle_mobile {
-namespace operators {
-template <>
-bool ConvAddReluInt8Kernel<CPU, int8_t>::Init(
-    FusionConvAddReluInt8Param<CPU> *param) {
-  return true;
-}
-template <>
-void ConvAddReluInt8Kernel<CPU, int8_t>::Compute(
-    const FusionConvAddReluInt8Param<CPU> &param) {
-  ConvAddReluInt8Compute<int8_t>(param);
-}
-template class ConvAddReluInt8Kernel<CPU, int8_t>;
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif  // FUSION_CONVADDRELU_INT8_OP
--- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
@@ -28,10 +28,24 @@ bool ConvAddReluKernel<CPU, float>::Init(FusionConvAddReluParam<CPU> *param) {
 template <>
 void ConvAddReluKernel<CPU, float>::Compute(
    const FusionConvAddReluParam<CPU> &param) {
-  ConvAddReluCompute<float>(param);
+  ConvAddReluCompute<float, float>(param);
 }
 template class ConvAddReluKernel<CPU, float>;
+#ifdef FUSION_CONVADDRELU_INT8_OP
+template <>
+bool ConvAddReluKernel<CPU, int8_t>::Init(FusionConvAddReluParam<CPU> *param) {
+  return true;
+}
+template <>
+void ConvAddReluKernel<CPU, int8_t>::Compute(
+    const FusionConvAddReluParam<CPU> &param) {
+  ConvAddReluCompute<int8_t, int32_t>(param);
+}
+template class ConvAddReluKernel<CPU, int8_t>;
+#endif
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
@@ -25,22 +25,31 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
-template <typename P>
+template <typename P, typename S>
 void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
-  int axis = param.Axis();
+  int32_t axis = param.Axis();
+  S *bias_data = bias.data<S>();
  Tensor *output = param.Output();
-  float *biase_data = bias.data<float>();
  output->mutable_data<P>();
-  int groups = param.Groups();
+  float alpha = 1.0f;
-  std::vector<int> strides = param.Strides();
+  float beta = 1.0f;
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  const int batch_size = static_cast<int>(input->dims()[0]);
+#ifdef FUSION_CONVADDRELU_INT8_OP
+  Tensor scale = *param.InputScale();
+  alpha = scale.data<float>()[0];
+  beta = 0.0f;
+#endif
+  int32_t groups = param.Groups();
+  std::vector<int32_t> strides = param.Strides();
+  std::vector<int32_t> paddings = param.Paddings();
+  std::vector<int32_t> dilations = param.Dilations();
+  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
@@ -62,13 +71,13 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
-    col.mutable_data<float>(col_shape);
+    col.mutable_data<P>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }
  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
+      input->dims(), 1, static_cast<int32_t>(input->dims().size()));
  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
@@ -78,17 +87,17 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
      output->numel() / (output->dims()[0] * output->dims()[1])};
  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
+  int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
+  int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Vol2ColFunctor<CPU, P> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
-  for (int i = 0; i < batch_size; i++) {
+  for (int32_t i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
+    for (int32_t g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
      if (!is_expand) {
@@ -98,8 +107,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
+               std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
+                                    paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
@@ -109,9 +118,9 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
+      math::matmul(filter_slice, false, col_matrix, false, alpha, &out_slice,
-                          static_cast<float>(1), true, biase_data);
+                   beta, true, bias_data);
    }
  }
 }

--- a/src/operators/kernel/central-arm-func/conv_add_relu_int8_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_int8_arm_func.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDRELU_INT8_OP
-#pragma once
-#include <vector>
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-namespace paddle_mobile {
-namespace operators {
-template <typename P>
-void ConvAddReluInt8Compute(const FusionConvAddReluInt8Param<CPU> &param) {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  Tensor scale = *param.InputScale();
-  int32_t axis = param.Axis();
-  Tensor *output = param.Output();
-  output->mutable_data<P>();
-  int32_t *bias_data = bias.data<int32_t>();
-  float scale_v = scale.data<float>()[0];
-  int32_t groups = param.Groups();
-  std::vector<int32_t> strides = param.Strides();
-  std::vector<int32_t> paddings = param.Paddings();
-  std::vector<int32_t> dilations = param.Dilations();
-  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<P>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int32_t>(input->dims().size()));
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-  // convolution operator: im2col(or vol2col) + gemm
-  int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
-  int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, P> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
-  for (int32_t i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int32_t g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
-                                    paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul(filter_slice, false, col_matrix, false, scale_v, &out_slice,
-                   static_cast<float>(0), true, bias_data);
-    }
-  }
-}
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif  // FUSION_CONVADDRELU_INT8_OP
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -106,16 +106,9 @@ inline void GemmConv(const ConvParam<CPU> &param) {
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      math::matmul(filter_slice, false, col_matrix, false,
-      if (param.Input()->type() == typeid(int8_t)) {
+                   static_cast<float>(1), &out_slice, static_cast<float>(0),
-        math::matmul(filter_slice, false, col_matrix, false,
+                   false, static_cast<Otype *>(nullptr));
-                     static_cast<float>(1), &out_slice, static_cast<float>(0),
-                     false, static_cast<int32_t *>(nullptr));
-      } else {
-        math::matmul(filter_slice, false, col_matrix, false,
-                     static_cast<float>(1), &out_slice, static_cast<float>(0),
-                     false, static_cast<float *>(nullptr));
-      }
    }
  }
 }

--- a/src/operators/kernel/conv_add_relu_int8_kernel.h
+++ b/src/operators/kernel/conv_add_relu_int8_kernel.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADDRELU_INT8_OP
-#pragma once
-#include <vector>
-#include "framework/ddim.h"
-#include "framework/operator.h"
-#include "operators/math/conv_func.h"
-#include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
-#include "operators/math/vol2col.h"
-#include "operators/op_param.h"
-namespace paddle_mobile {
-namespace operators {
-using framework::DDim;
-using framework::OpKernelBase;
-template <typename DeviceType, typename T>
-class ConvAddReluInt8Kernel
-    : public OpKernelBase<DeviceType, FusionConvAddReluInt8Param<DeviceType>> {
- public:
-  void Compute(const FusionConvAddReluInt8Param<DeviceType> &param);
-  bool Init(FusionConvAddReluInt8Param<DeviceType> *param);
-};
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif  // FUSION_CONVADDRELU_INT8_OP
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -2924,6 +2924,7 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
 #endif  // __ARM_NEON
 // 32位 float 矩阵乘法
+template <>
 void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
                 bool relu, float *bias) {

--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include "common/log.h"
+#include "memory/t_malloc.h"
 // 矩阵取值运算宏，假设矩阵按行存储
 #define A(i, j) A[(i)*lda + (j)]
@@ -163,11 +164,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                          float *new_bias);
  */
-  // 32位 float 矩阵乘法
-  void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-             const float *B, int ldb, float beta, float *C, int ldc, bool relu,
-             float *bias);
  // 32位 float 矩阵乘法, 并对结果进行 batchnrom
  void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                   const float *B, int ldb, float beta, float *C, int ldc,
@@ -201,11 +197,13 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                 int32_t ldc);
  // 8 bits int inner product
+  template <typename Otype>
  void InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
-                   const int8_t *b, float beta, int32_t *c, int32_t *C,
+                   const int8_t *b, float beta, int32_t *c, Otype *C,
                   int32_t ldc, bool relu);
+  template <typename Otype>
  void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a,
-                           const int8_t *b, float beta, int32_t *c, int8_t *C,
+                           const int8_t *b, float beta, int32_t *c, Otype *C,
                           int32_t ldc, bool relu, int32_t *bias);
  // 8 bits int pack function
@@ -229,12 +227,15 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                             const int8_t *B, int32_t ldb, int8_t *buffer);
  // 8 bits int matrix product
+  template <typename Itype, typename Btype, typename Otype>
+  void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
+             int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
+             int32_t ldc, bool relu, Btype *bias);
+  template <typename Otype>
  void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
-             int32_t lda, const int8_t *B, int32_t ldb, float beta, int32_t *C,
+             int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C,
-             int32_t ldc, bool relu, int32_t *bias);
-  void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
-             int32_t lda, const int8_t *B, int32_t ldb, float beta, int8_t *C,
             int32_t ldc, bool relu, int32_t *bias);
  void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
                 int32_t lda, const int8_t *B, int32_t ldb, float beta,
                 int32_t *C, int32_t ldc, bool relu, int32_t *bias);
@@ -266,6 +267,71 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
  int8_t *zero_int8;
 };
+// 8 bits int matrix product (m*k x k*n)
+template <typename Otype>
+void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
+                 int32_t lda, const int8_t *B, int32_t ldb, float beta,
+                 Otype *C, int32_t ldc, bool relu, int32_t *bias) {
+  // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
+  // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
+  int32_t L1 = 32 * 1024;
+  int32_t L2 = 512 * 1024;
+  const int32_t k_complete = (k + 15) - ((k + 15) & 15);
+  KC = k_complete;
+  MC = L1 / (KC * sizeof(int8_t));
+  NC = L2 / (KC * sizeof(int8_t));
+  // make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
+  if (MC == 0) {
+    MC = MR_INT8;
+  } else {
+    int32_t mblock_num = (m + MC - 1) / MC;
+    MC = (m + mblock_num - 1) / mblock_num;
+    MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+  }
+  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+  if (NC == 0) {
+    NC = NR_INT8;
+  } else {
+    int32_t nblock_num = (n + NC - 1) / NC;
+    NC = (n + nblock_num - 1) / nblock_num;
+    NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
+  }
+  //  DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+  packedA_int8 = static_cast<int8_t *>(
+      paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
+  packedB_int8 = static_cast<int8_t *>(
+      paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
+  packedC_int32 = static_cast<int32_t *>(
+      paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
+  zero_int8 =
+      static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
+  memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
+  int32_t mc, nc;
+  for (int32_t j = 0; j < n; j += NC) {
+    nc = s_min(n - j, NC);
+    PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
+    for (int32_t i = 0; i < m; i += MC) {
+      mc = s_min(m - i, MC);
+      PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
+      if (bias == nullptr) {
+        InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
+                    packedC_int32, &C(i, j), ldc, relu);
+      } else {
+        InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
+                            packedC_int32, &C(i, j), ldc, relu, bias + i);
+      }
+    }
+  }
+  paddle_mobile::memory::Free(packedA_int8);
+  paddle_mobile::memory::Free(packedB_int8);
+  paddle_mobile::memory::Free(packedC_int32);
+  paddle_mobile::memory::Free(zero_int8);
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/math/gemm_int8.cpp
+++ b/src/operators/math/gemm_int8.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 #include <string.h>
 #include "common/log.h"
-#include "memory/t_malloc.h"
 #include "operators/math/gemm.h"
 #if __ARM_NEON
 #include <arm_neon.h>
@@ -670,6 +669,11 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
 }
 // 8 bits int inner product
+template <>
+void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
+                       const int8_t *b, float beta, int32_t *c, int8_t *C,
+                       int32_t ldc, bool relu) {}
+template <>
 void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
                       const int8_t *b, float beta, int32_t *c, int32_t *C,
                       int32_t ldc, bool relu) {
@@ -691,6 +695,7 @@ void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
  }
 }
+template <>
 void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
                               const int8_t *a, const int8_t *b, float beta,
                               int32_t *c, int8_t *C, int32_t ldc, bool relu,
@@ -715,6 +720,12 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
  }
 }
+template <>
+void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
+                               const int8_t *a, const int8_t *b, float beta,
+                               int32_t *c, int32_t *C, int32_t ldc, bool relu,
+                               int32_t *bias) {}
 // 8 bits int PackMatrixA_4r
 void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail,
                             const int8_t *A, int32_t lda, int8_t *buffer) {
@@ -1083,128 +1094,6 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
  }
 }
-// 8 bits int matrix product (m*k x k*n)
-void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
-                 int32_t lda, const int8_t *B, int32_t ldb, float beta,
-                 int32_t *C, int32_t ldc, bool relu, int32_t *bias) {
-  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
-  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int32_t L1 = 32 * 1024;
-  int32_t L2 = 512 * 1024;
-  const int32_t k_complete = (k + 15) - ((k + 15) & 15);
-  KC = k_complete;
-  MC = L1 / (KC * sizeof(int8_t));
-  NC = L2 / (KC * sizeof(int8_t));
-  // make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
-  if (MC == 0) {
-    MC = MR_INT8;
-  } else {
-    int32_t mblock_num = (m + MC - 1) / MC;
-    MC = (m + mblock_num - 1) / mblock_num;
-    MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
-  }
-  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
-  if (NC == 0) {
-    NC = NR_INT8;
-  } else {
-    int32_t nblock_num = (n + NC - 1) / NC;
-    NC = (n + nblock_num - 1) / nblock_num;
-    NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
-  }
-  //  DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
-  packedA_int8 = static_cast<int8_t *>(
-      paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
-  packedB_int8 = static_cast<int8_t *>(
-      paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
-  packedC_int32 = static_cast<int32_t *>(
-      paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
-  zero_int8 =
-      static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
-  memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
-  int32_t mc, nc;
-  for (int32_t j = 0; j < n; j += NC) {
-    nc = s_min(n - j, NC);
-    PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
-    for (int32_t i = 0; i < m; i += MC) {
-      mc = s_min(m - i, MC);
-      PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
-      if (bias == nullptr) {
-        InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
-                    packedC_int32, &C(i, j), ldc, relu);
-      }
-    }
-  }
-  paddle_mobile::memory::Free(packedA_int8);
-  paddle_mobile::memory::Free(packedB_int8);
-  paddle_mobile::memory::Free(packedC_int32);
-  paddle_mobile::memory::Free(zero_int8);
-}
-// 8 bits int matrix product (m*k x k*n)
-void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
-                 int32_t lda, const int8_t *B, int32_t ldb, float beta,
-                 int8_t *C, int32_t ldc, bool relu, int32_t *bias) {
-  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
-  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int32_t L1 = 32 * 1024;
-  int32_t L2 = 512 * 1024;
-  const int32_t k_complete = (k + 15) - ((k + 15) & 15);
-  KC = k_complete;
-  MC = L1 / (KC * sizeof(int8_t));
-  NC = L2 / (KC * sizeof(int8_t));
-  // make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
-  if (MC == 0) {
-    MC = MR_INT8;
-  } else {
-    int32_t mblock_num = (m + MC - 1) / MC;
-    MC = (m + mblock_num - 1) / mblock_num;
-    MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
-  }
-  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
-  if (NC == 0) {
-    NC = NR_INT8;
-  } else {
-    int32_t nblock_num = (n + NC - 1) / NC;
-    NC = (n + nblock_num - 1) / nblock_num;
-    NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
-  }
-  //  DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
-  packedA_int8 = static_cast<int8_t *>(
-      paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
-  packedB_int8 = static_cast<int8_t *>(
-      paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
-  packedC_int32 = static_cast<int32_t *>(
-      paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
-  zero_int8 =
-      static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
-  memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
-  int32_t mc, nc;
-  for (int32_t j = 0; j < n; j += NC) {
-    nc = s_min(n - j, NC);
-    PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
-    for (int32_t i = 0; i < m; i += MC) {
-      mc = s_min(m - i, MC);
-      PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
-      if (bias != nullptr) {
-        InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
-                            packedC_int32, &C(i, j), ldc, relu, bias + i);
-      }
-    }
-  }
-  paddle_mobile::memory::Free(packedA_int8);
-  paddle_mobile::memory::Free(packedB_int8);
-  paddle_mobile::memory::Free(packedC_int32);
-  paddle_mobile::memory::Free(zero_int8);
-}
 //  8 bits int write back
 // C = A * B
 void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1705,36 +1705,19 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
  FusionConvAddReluParam(const VariableNameMap &inputs,
                         const VariableNameMap &outputs,
                         const AttributeMap &attrs, const Scope &scope)
-      : FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {}
+      : FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {
-};
-#endif
 #ifdef FUSION_CONVADDRELU_INT8_OP
-template <typename Dtype>
-class FusionConvAddReluInt8Param : public ConvParam<Dtype> {
-  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
-  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
- public:
-  FusionConvAddReluInt8Param(const VariableNameMap &inputs,
-                             const VariableNameMap &outputs,
-                             const AttributeMap &attrs, const Scope &scope)
-      : ConvParam<Dtype>(inputs, outputs, attrs, scope) {
    scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
-    bias_ = OpParam::InputYFrom<GType>(inputs, scope);
+#endif
-    axis_ = OpParam::GetAttr<int>("axis", attrs);
  }
+#ifdef FUSION_CONVADDRELU_INT8_OP
+  typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
+  typedef typename DtypeTensorTrait<DeviceType>::rtype RType;
  const RType *InputScale() const { return scale_; }
-  RType *Bias() const { return bias_; }
-  const int &Axis() const { return axis_; }
 protected:
  RType *scale_;
-  RType *bias_;
+#endif
-  int axis_;
 };
 #endif

--- a/test/common/test_gemm_int8_accuracy.cpp
+++ b/test/common/test_gemm_int8_accuracy.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <climits>
 #include <cstdlib>
 #include <ctime>
 #include <iostream>
+#include <limits>
 #include <random>
 #include "../test_helper.h"
 #include "common/log.h"
@@ -57,10 +57,10 @@ void print_matirx(int m, int n, int ldc, int8_t *c) {
 int32_t qadd_int32(int32_t l, int32_t r) {
  int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
-  if (res > INT_MAX)
+  if (res > std::numeric_limits<int32_t>::max())
-    return INT_MAX;
+    return std::numeric_limits<int32_t>::max();
-  else if (res < INT_MIN)
+  else if (res < std::numeric_limits<int32_t>::min())
-    return INT_MIN;
+    return std::numeric_limits<int32_t>::min();
  else
    return static_cast<int32_t>(res);
 }

--- a/test/operators/test_fusion_conv_add_relu_int8_op.cpp
+++ b/test/operators/test_fusion_conv_add_relu_int8_op.cpp
@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef FUSION_CONVADDRELU_INT8_OP
+#include <limits>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/fusion_conv_add_relu_int8_op.h"
@@ -19,10 +23,10 @@ limitations under the License. */
 namespace paddle_mobile {
 int32_t qadd_int32(int32_t l, int32_t r) {
  int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
-  if (res > INT_MAX)
+  if (res > std::numeric_limits<int32_t>::max())
-    return INT_MAX;
+    return std::numeric_limits<int32_t>::max();
-  else if (res < INT_MIN)
+  else if (res < std::numeric_limits<int32_t>::min())
-    return INT_MIN;
+    return std::numeric_limits<int32_t>::min();
  else
    return static_cast<int32_t>(res);
 }
@@ -217,8 +221,8 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
  inputs["Input"] = std::vector<std::string>({"input"});
  inputs["Filter"] = std::vector<std::string>({"filter"});
  inputs["Scale"] = std::vector<std::string>({"scale"});
-  inputs["Y"] = std::vector<std::string>({"y"});
+  inputs["Y"] = std::vector<std::string>({"bias"});
-  outputs["Output"] = std::vector<std::string>({"output"});
+  outputs["Out"] = std::vector<std::string>({"output"});
  auto input_var = scope.get()->Var("input");
  auto input = input_var->template GetMutable<framework::LoDTensor>();
@@ -234,7 +238,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
  float scale_v = 0.000828f;
  scale->mutable_data<float>()[0] = scale_v;
-  auto bias_var = scope.get()->Var("y");
+  auto bias_var = scope.get()->Var("bias");
  auto bias = bias_var->template GetMutable<framework::LoDTensor>();
  SetupTensor<int32_t>(bias, bias_shape, -127, 127);
@@ -352,3 +356,5 @@ int main(int argc, char *argv[]) {
  paddle_mobile::TestConvOp<int8_t, 5, 2, 1>(in_channels, in_height, in_width,
                                             out_channels);
 }
+#endif
--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/mul_op.h"