Merge pull request #1336 from wzzju/add_fusion_fc_int8_op

add fusion fc int8_t op and its UT.

Merge pull request #1336 from wzzju/add_fusion_fc_int8_op
add fusion fc int8_t op and its UT.
6ce11736 · Jiaying Zhao · GitHub · ce969f68 · 3dbf9966 · 6ce11736
20 changed file
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -32,6 +32,7 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
 const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
 const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
 const char *G_OP_TYPE_FC = "fusion_fc";
+const char *G_OP_TYPE_FC_INT8 = "fusion_fc_int8";
 const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
 const char *G_OP_TYPE_LRN = "lrn";
 const char *G_OP_TYPE_MUL = "mul";
@@ -111,12 +112,13 @@ std::unordered_map<
        {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
        {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
        {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
+        {G_OP_TYPE_FC_INT8, {{"X", "Y", "Z", "Scale"}, {"Out"}}},
        {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
        {G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}},
        {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
        {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
-        {G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input", "Scale"}, {"Out"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},

--- a/src/common/types.h
+++ b/src/common/types.h
@@ -103,6 +103,7 @@ extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
 extern const char *G_OP_TYPE_FC;
+extern const char *G_OP_TYPE_FC_INT8;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD;
 extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;

--- a/src/operators/fusion_conv_add_relu_int8_op.h
+++ b/src/operators/fusion_conv_add_relu_int8_op.h
@@ -22,19 +22,19 @@ namespace paddle_mobile {
 namespace operators {
 template <typename DeviceType, typename T>
 class FusionConvAddReluInt8Op
-    : public framework::OperatorWithKernel<
-          DeviceType, FusionConvAddReluParam<DeviceType>,
-          operators::ConvAddReluKernel<DeviceType, T>> {
+    : public framework::OperatorWithKernel<DeviceType,
+                                           FusionConvAddReluParam<DeviceType>,
+                                           ConvAddReluKernel<DeviceType, T>> {
 public:
  FusionConvAddReluInt8Op(const std::string &type,
                          const VariableNameMap &inputs,
                          const VariableNameMap &outputs,
                          const framework::AttributeMap &attrs,
                          std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<
-            DeviceType, FusionConvAddReluParam<DeviceType>,
-            operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
-                                                         attrs, scope) {}
+      : framework::OperatorWithKernel<DeviceType,
+                                      FusionConvAddReluParam<DeviceType>,
+                                      ConvAddReluKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
  void InferShape() const override;
 };
 }  // namespace operators

--- a/src/operators/fusion_fc_int8_op.cpp
+++ b/src/operators/fusion_fc_int8_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_FC_INT8_OP
+
+#include "operators/fusion_fc_int8_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void FusionFcInt8Op<Dtype, T>::InferShape() const {
+  auto x_dims = this->param_.InputX()->dims();
+  auto y_dims = this->param_.InputY()->dims();
+  int x_num_col_dims = this->param_.XNumColDims();
+  int y_num_col_dims = this->param_.YNumColDims();
+
+  assert(x_dims.size() > x_num_col_dims);
+  assert(y_dims.size() > y_num_col_dims);
+
+  /// (1,2,3,4) , x_num_col_dims = 2  -> (2,12)
+  auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
+  auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
+
+  assert(x_mat_dims[1] == y_mat_dims[0]);
+
+  std::vector<int64_t> output_dims;
+  output_dims.reserve(
+      static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
+
+  for (int i = 0; i < x_num_col_dims; ++i) {
+    output_dims.push_back(x_dims[i]);
+  }
+
+  for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
+    output_dims.push_back(y_dims[i]);
+  }
+
+  framework::DDim ddim = framework::make_ddim(output_dims);
+  this->param_.Out()->Resize(ddim);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op);
+#endif
+#endif  // FUSION_FC_INT8_OP
--- a/src/operators/fusion_fc_int8_op.h
+++ b/src/operators/fusion_fc_int8_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_FC_INT8_OP
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/fusion_fc_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class FusionFcInt8Op
+    : public framework::OperatorWithKernel<DeviceType,
+                                           FusionFcParam<DeviceType>,
+                                           FusionFcKernel<DeviceType, T>> {
+ public:
+  FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs,
+                 const VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs,
+                 std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
+                                      FusionFcKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_FC_INT8_OP
--- a/src/operators/kernel/arm/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp
@@ -27,10 +27,27 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam<CPU> *param) {

 template <>
 void FusionFcKernel<CPU, float>::Compute(const FusionFcParam<CPU> &param) {
-  FusionFcCompute<float>(param);
+  FusionFcCompute<float, float>(param);
  param.Out()->set_lod(param.InputX()->lod());
 }

+template class FusionFcKernel<CPU, float>;
+
+#ifdef FUSION_FC_INT8_OP
+template <>
+bool FusionFcKernel<CPU, int8_t>::Init(FusionFcParam<CPU> *param) {
+  return true;
+}
+
+template <>
+void FusionFcKernel<CPU, int8_t>::Compute(const FusionFcParam<CPU> &param) {
+  FusionFcCompute<int8_t, int32_t>(param);
+  param.Out()->set_lod(param.InputX()->lod());
+}
+
+template class FusionFcKernel<CPU, int8_t>;
+#endif
+
 }  // namespace operators
 }  // namespace paddle_mobile


--- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
@@ -39,8 +39,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
  float beta = 1.0f;

 #ifdef FUSION_CONVADDRELU_INT8_OP
-  Tensor scale = *param.InputScale();
-  alpha = scale.data<float>()[0];
+  alpha = param.InputScale()->data<float>()[0];
  beta = 0.0f;
 #endif


--- a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
+++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
@@ -15,23 +15,29 @@ limitations under the License. */
 #ifdef FUSION_FC_OP

 #pragma once
+
+#include <type_traits>
 #include "operators/math/math_function.h"
 #include "operators/op_param.h"

 namespace paddle_mobile {
 namespace operators {

-template <typename P>
+template <typename P, typename S>
 void FusionFcCompute(const FusionFcParam<CPU> &param) {
  const Tensor *input_x = param.InputX();
  const Tensor *input_y = param.InputY();
-  const Tensor *input_z = param.InputZ();
-  auto *input_z_data = input_z->data<float>();
+  Tensor *input_z = param.InputZ();
+  S *input_z_data = input_z->data<S>();
  int axis = param.Axis();
  Tensor *out = param.Out();
  //  int m = out->dims()[0];
  //  int n = out->dims()[1];
-  auto *out_data = out->mutable_data<float>();
+  auto *out_data = out->mutable_data<P>();
+
+  float alpha = 1.0f;
+  float beta = 1.0f;
+
  const Tensor x_matrix =
      input_x->dims().size() > 2
          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
@@ -51,17 +57,24 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");

+  if (std::is_same<P, int8_t>::value) {
+#ifdef FUSION_FC_INT8_OP
+    alpha = param.InputScale()->data<float>()[0];
+    beta = 0.0f;
+    math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, false,
+                 input_z_data, true);
+#endif
+  } else {
+    // bias_data的维度和out的第二个维度一致
    int64_t classes = input_z->numel();
    for (int i = 0; i < out_dim[0]; i++) {
-    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
+      memory::Copy(out_data + i * classes, input_z_data,
+                   sizeof(float) * classes);
    }

-  //  for (int i = 0; i < out->numel(); i++) {
-  //    DLOG << out_data[i];
-  //  }
-  // bias_data的维度和out的维度一致
-  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                      out, static_cast<float>(1), false);
+    math::matmul<float>(x_matrix, false, y_matrix, false, alpha, out, beta,
+                        false);
+  }
  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
  //  if (out_dim.size() != 2) {
  //      out->Resize(out_dim);

--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -2924,7 +2924,6 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
 #endif  // __ARM_NEON

 // 32位 float 矩阵乘法
-template <>
 void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
                 bool relu, float *bias) {
@@ -3147,7 +3146,6 @@ void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
 }

 // 32位 float 矩阵乘法
-template <>
 void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
                     const float *B, int ldb, float beta, float *C, int ldc,
                     bool relu, float *bias) {

--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -167,14 +167,25 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                          float *new_bias);
  */

+  // 32位 float 矩阵乘法
+  void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+             const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+             float *bias);
+
  // 32位 float 矩阵乘法, 并对结果进行 batchnrom
  void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                   const float *B, int ldb, float beta, float *C, int ldc,
                   bool relu, float *new_scale, float *new_bias, float *bias);
+
  void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
                      const float *B, int ldb, float *C, int ldc, float *p,
                      std::string mode, float *bias, float *bias1);

+  // 32位 float 矩阵乘法（openmp 多线程版本）
+  void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *bias);
+
  // 32位 float 矩阵乘法, 并对结果进行 batchnrom（openmp 多线程版本）
  void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
                       int lda, const float *B, int ldb, float beta, float *C,
@@ -202,7 +213,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
  template <typename Otype>
  void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a,
                           const int8_t *b, float beta, int32_t *c, Otype *C,
-                           int32_t ldc, bool relu, int32_t *bias);
+                           int32_t ldc, bool relu, int32_t *bias,
+                           bool addOnRow = false);

  // 8 bits int pack function
  void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
@@ -228,28 +240,32 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
  template <typename Itype, typename Btype, typename Otype>
  void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
                 int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
-                 int32_t ldc, bool relu, Btype *bias);
+                 int32_t ldc, bool relu, Btype *bias, bool addOnRow = false);
  template <typename Otype>
  void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
                 int32_t lda, const int8_t *B, int32_t ldb, float beta,
-                 Otype *C, int32_t ldc, bool relu, int32_t *bias);
+                 Otype *C, int32_t ldc, bool relu, int32_t *bias,
+                 bool addOnRow = false);
  template <typename Itype, typename Btype, typename Otype>
  void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
             int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
-             int32_t ldc, bool relu, Btype *bias);
+             int32_t ldc, bool relu, Btype *bias, bool addOnRow = false);
  template <typename Otype>
  void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
             int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C,
-             int32_t ldc, bool relu, int32_t *bias);
+             int32_t ldc, bool relu, int32_t *bias, bool addOnRow = false);
  // 8 bits int write back
  // C = A * B
  void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
  // C = A * B + bias, scale * relu(C)
  void WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
                             int32_t ldc, int32_t *bias, float scale);
-  // C = A * B + bias, scale * C
+  // C = A * B + bias, scale * C, bias is added on column
  void WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
                         int32_t ldc, int32_t *bias, float scale);
+  // C = A * B + bias, scale * C, bias is added on row
+  void WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
+                          int32_t ldc, int32_t *bias, float scale);

 private:
  int MC = 0;
@@ -273,7 +289,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 template <typename Otype>
 void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
                 int32_t lda, const int8_t *B, int32_t ldb, float beta,
-                 Otype *C, int32_t ldc, bool relu, int32_t *bias) {
+                 Otype *C, int32_t ldc, bool relu, int32_t *bias,
+                 bool addOnRow) {
  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
  int32_t L1 = 32 * 1024;
@@ -322,8 +339,15 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
        InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
                    packedC_int32, &C(i, j), ldc, relu);
      } else {
+        if (addOnRow) {
          InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
-                            packedC_int32, &C(i, j), ldc, relu, bias + i);
+                              packedC_int32, &C(i, j), ldc, relu, bias + j,
+                              addOnRow);
+        } else {
+          InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
+                              packedC_int32, &C(i, j), ldc, relu, bias + i,
+                              addOnRow);
+        }
      }
    }
  }
@@ -339,7 +363,7 @@ template <typename Otype>
 void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
                     const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
                     float beta, Otype *C, int32_t ldc, bool relu,
-                     int32_t *bias) {
+                     int32_t *bias, bool addOnRow) {
 #ifdef _OPENMP
  int32_t max_threads = omp_get_max_threads();
 #else
@@ -422,8 +446,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
        InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C,
                    &C(i, 0), ldc, relu);
      } else {
-        InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
-                            &C(i, 0), ldc, relu, bias + i);
+        if (addOnRow) {
+          InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta,
+                              local_C, &C(i, 0), ldc, relu, bias, addOnRow);
+        } else {
+          InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta,
+                              local_C, &C(i, 0), ldc, relu, bias + i, addOnRow);
+        }
      }
    }
  } else {
@@ -447,8 +476,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
        InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C,
                    &C(0, j), ldc, relu);
      } else {
-        InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
-                            &C(0, j), ldc, relu, bias);
+        if (addOnRow) {
+          InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta,
+                              local_C, &C(0, j), ldc, relu, bias + j, addOnRow);
+        } else {
+          InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta,
+                              local_C, &C(0, j), ldc, relu, bias, addOnRow);
+        }
      }
    }
  }

--- a/src/operators/math/gemm_int8.cpp
+++ b/src/operators/math/gemm_int8.cpp
@@ -699,7 +699,7 @@ template <>
 void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
                               const int8_t *a, const int8_t *b, float beta,
                               int32_t *c, int8_t *C, int32_t ldc, bool relu,
-                               int32_t *bias) {
+                               int32_t *bias, bool addOnRow) {
 #pragma omp parallel for
  for (int32_t j = 0; j < nc; j += NR_INT8) {
    for (int32_t i = 0; i < mc; i += MR_INT8) {
@@ -715,16 +715,20 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
  if (relu) {
    WriteWithAddReluScale(mc, nc, c, C, ldc, bias, alpha);
    return;
+  } else {
+    if (addOnRow) {
+      WriteWithAddScaleT(mc, nc, c, C, ldc, bias, alpha);
    } else {
      WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha);
    }
+  }
 }

 template <>
 void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
                               const int8_t *a, const int8_t *b, float beta,
                               int32_t *c, int32_t *C, int32_t ldc, bool relu,
-                               int32_t *bias) {}
+                               int32_t *bias, bool addOnRow) {}

 // 8 bits int PackMatrixA_4r
 void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail,
@@ -1159,14 +1163,13 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
 #endif  // __ARM_NEON
 }

-// C = A * B + bias, scale * C
+// C = A * B + bias, scale * C, bias is added on column
 void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
                             int32_t ldc, int32_t *bias, float scale) {
 #if __ARM_NEON
 #if __aarch64__
 // TODO
 #else
-  int32_t zero = 0;
  int8_t narrow = -128;
  int32_t nc1 = nc >> 3;
  int32_t _nc1 = nc & 7;
@@ -1184,7 +1187,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
        "subs       %[mc], %[mc], #1        \n\t"
        "blt        end_mc_%=               \n\t"
        "vdup.32    q15,  %[scale]          \n\t"
-        "vdup.32    q14,  %[zero]           \n\t"
        "vdup.8     d24,  %[narrow]         \n\t"
        "loop_mc_%=:                        \n\t"
        "vld1.32    {d26[0]}, [%[bias_ptr]]!\n\t"
@@ -1222,9 +1224,9 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
        :
        : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
          [step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
-          [scale] "r"(scale), [zero] "r"(zero), [narrow] "r"(narrow)
+          [scale] "r"(scale), [narrow] "r"(narrow)
        : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
-          "q7", "q12", "q13", "q14", "q15");
+          "q7", "q12", "q13", "q15");
  }

  int32_t nc_left;
@@ -1239,7 +1241,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
      nc_left = _nc1;
      asm volatile(
          "vdup.32    q15,  %[scale]          \n\t"
-          "vdup.32    q14,  %[zero]           \n\t"
          "vdup.8     d24,  %[narrow]         \n\t"
          "vdup.32    q13,  %[bias_v]         \n\t"
          "cmp        %[_nc1], #4             \n\t"
@@ -1260,7 +1261,7 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
          "subs       %[_nc1], %[_nc1], #4    \n\t"
          "beq        process_over_%=         \n\t"
          "less_four_%=:                      \n\t"
-          "vld1.32    {q0}, [%[c0]]!          \n\t"
+          "vld1.32    {q0}, [%[c0]]          \n\t"
          "vqadd.s32  q0, q0, q13             \n\t"
          "vcvt.f32.s32 q1, q0                \n\t"
          "vmul.f32   q1, q1, q15             \n\t"
@@ -1277,17 +1278,138 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
          "process_over_%=:                   \n\t"
          :
          : [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
-            [bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero),
-            [narrow] "r"(narrow)
-          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
-            "q15");
+            [bias_v] "r"(bias_v), [scale] "r"(scale), [narrow] "r"(narrow)
+          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15");
+    }
+  }
+#endif  // __aarch64__
+#endif  // __ARM_NEON
+}
+
+// C = A * B + bias, scale * C, bias is added on row
+void Gemm::WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
+                              int32_t ldc, int32_t *bias, float scale) {
+#if __ARM_NEON
+#if __aarch64__
+// TODO
+#else
+  int8_t narrow = -128;
+  int32_t nc1 = nc >> 3;
+  int32_t _nc1 = nc & 7;
+  int32_t step = sizeof(int8_t) * ldc;
+  int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
+  int32_t volatile m = mc;
+  int32_t volatile n = nc1;
+  int32_t *volatile c_ptr, *volatile bias_ptr;
+  int8_t *volatile C_ptr;
+  c_ptr = c;
+  C_ptr = C;
+  bias_ptr = bias;
+  if (nc1 > 0) {
+    asm volatile(
+        "subs       %[mc], %[mc], #1        \n\t"
+        "blt        end_mc_%=               \n\t"
+        "vdup.32    q15,  %[scale]          \n\t"
+        "vdup.8     d24,  %[narrow]         \n\t"
+        "loop_mc_%=:                        \n\t"
+        "mov        r4,   %[bias_ptr]       \n\t"
+        "mov        r6,   %[C_ptr]          \n\t"
+        "mov        r5,   %[nc1]            \n\t"
+        "subs       r5,   r5,   #1          \n\t"
+        "blt        end_nc1_%=              \n\t"
+        "loop_nc1_%=:                       \n\t"
+        "vld1.32    {q13, q14}, [r4]!        \n\t"
+        "vld1.32    {q0, q1}, [%[c_ptr]]!   \n\t"
+        "vqadd.s32  q0, q0, q13             \n\t"
+        "vqadd.s32  q1, q1, q14             \n\t"
+        "vcvt.f32.s32 q2, q0                \n\t"
+        "vcvt.f32.s32 q3, q1                \n\t"
+        "vmul.f32   q2, q2, q15             \n\t"
+        "vmul.f32   q3, q3, q15             \n\t"
+        "vcvt.s32.f32 q4, q2                \n\t"
+        "vcvt.s32.f32 q5, q3                \n\t"
+        "vqmovn.s32 d12, q4                 \n\t"
+        "vqmovn.s32 d13, q5                 \n\t"
+        "vqmovn.s16 d14, q6                 \n\t"
+        "vceq.s8    d15, d14, d24           \n\t"
+        "vsub.s8    d14, d14, d15           \n\t"
+        "vst1.8     {d14}, [r6]!            \n\t"
+        "subs       r5,   r5,   #1          \n\t"
+        "bge        loop_nc1_%=             \n\t"
+        "end_nc1_%=:                        \n\t"
+
+        "add        %[C_ptr], %[C_ptr], %[step]  \n\t"
+        "add        %[c_ptr], %[c_ptr], %[step1] \n\t"
+        "subs       %[mc], %[mc], #1        \n\t"
+        "bge        loop_mc_%=              \n\t"
+        "end_mc_%=:                         \n\t"
+
+        :
+        : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
+          [step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
+          [scale] "r"(scale), [narrow] "r"(narrow)
+        : "cc", "memory", "r4", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5",
+          "q6", "q7", "q12", "q13", "q15");
+  }
+
+  int32_t nc_left;
+  int32_t *c0;
+  int8_t *C0;
+  int32_t *volatile bias0 = bias_ptr + nc1 * 8;
+  if (_nc1 != 0) {
+    for (int32_t i = 0; i < mc; i++) {
+      C0 = C_ptr + nc1 * 8 + i * ldc;
+      c0 = c_ptr + nc1 * 8 + i * NC;
+      nc_left = _nc1;
+      asm volatile(
+          "vdup.32    q15,  %[scale]          \n\t"
+          "vdup.8     d24,  %[narrow]         \n\t"
+          "cmp        %[_nc1], #4             \n\t"
+          "blt        less_four_%=            \n\t"
+          "vld1.32    {q0}, [%[c0]]!          \n\t"
+          "vld1.32    {q13}, [%[bias0]]!      \n\t"
+          "vqadd.s32  q0, q0, q13             \n\t"
+          "vcvt.f32.s32 q1, q0                \n\t"
+          "vmul.f32   q1, q1, q15             \n\t"
+          "vcvt.s32.f32 q2, q1                \n\t"
+          "vqmovn.s32 d6, q2                  \n\t"
+          "vqmovn.s16 d8, q3                  \n\t"
+          "vceq.s8    d9, d8, d24             \n\t"
+          "vsub.s8    d8, d8, d9              \n\t"
+          "vst1.8     {d8[0]}, [%[C0]]!       \n\t"
+          "vst1.8     {d8[1]}, [%[C0]]!       \n\t"
+          "vst1.8     {d8[2]}, [%[C0]]!       \n\t"
+          "vst1.8     {d8[3]}, [%[C0]]!       \n\t"
+          "subs       %[_nc1], %[_nc1], #4    \n\t"
+          "beq        process_over_%=         \n\t"
+          "less_four_%=:                      \n\t"
+          "vld1.32    {q0}, [%[c0]]           \n\t"
+          "vld1.32    {q13}, [%[bias0]]       \n\t"
+          "vqadd.s32  q0, q0, q13             \n\t"
+          "vcvt.f32.s32 q1, q0                \n\t"
+          "vmul.f32   q1, q1, q15             \n\t"
+          "vcvt.s32.f32 q2, q1                \n\t"
+          "vqmovn.s32 d6, q2                  \n\t"
+          "vqmovn.s16 d8, q3                  \n\t"
+          "vceq.s8    d9, d8, d24             \n\t"
+          "vsub.s8    d8, d8, d9              \n\t"
+          "loop_save_%=:                      \n\t"
+          "vst1.8     {d8[0]}, [%[C0]]!       \n\t"
+          "vext.8 d8, d8, d8, #1              \n\t"
+          "subs       %[_nc1], %[_nc1], #1    \n\t"
+          "bgt        loop_save_%=            \n\t"
+          "process_over_%=:                   \n\t"
+          :
+          : [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), [bias0] "r"(bias0),
+            [scale] "r"(scale), [narrow] "r"(narrow)
+          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15");
    }
  }
 #endif  // __aarch64__
 #endif  // __ARM_NEON
 }

-// C = A * B + bias, scale * relu(C)
+// C = A * B + bias, scale * relu(C), bias is added on column
 void Gemm::WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
                                 int32_t ldc, int32_t *bias, float scale) {
 #if __ARM_NEON

--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -34,7 +34,7 @@ template <typename T, typename S>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
            const framework::Tensor &matrix_b, bool trans_b, T alpha,
            framework::Tensor *matrix_out, T beta, bool relu = false,
-            S *bias = nullptr);
+            S *bias = nullptr, bool addOnRow = false);

 template <typename T>
 void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,

--- a/src/operators/math/math_function_int8.cpp
+++ b/src/operators/math/math_function_int8.cpp
@@ -24,8 +24,8 @@ namespace math {
 template <>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
            const framework::Tensor &matrix_b, bool trans_b, float alpha,
-            framework::Tensor *matrix_out, float beta, bool relu,
-            int32_t *bias) {
+            framework::Tensor *matrix_out, float beta, bool relu, int32_t *bias,
+            bool addOnRow) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -55,18 +55,18 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
 #ifdef _OPENMP
    if (bias != nullptr) {
      gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
-                     matrix_out->data<int8_t>(), N, relu, bias);
+                     matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
    } else {
      gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
-                     matrix_out->data<int32_t>(), N, relu, bias);
+                     matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
    }
 #else
    if (bias != nullptr) {
      gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
-                 matrix_out->data<int8_t>(), N, relu, bias);
+                 matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
    } else {
      gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
-                 matrix_out->data<int32_t>(), N, relu, bias);
+                 matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
    }
 #endif
  } else {
@@ -74,21 +74,21 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
    if (bias != nullptr) {
      gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
                     matrix_b.data<int8_t>(), N, beta,
-                     matrix_out->data<int8_t>(), N, relu, bias);
+                     matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
    } else {
      gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
                     matrix_b.data<int8_t>(), N, beta,
-                     matrix_out->data<int32_t>(), N, relu, bias);
+                     matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
    }
 #else
    if (bias != nullptr) {
      gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
                 matrix_b.data<int8_t>(), N, beta, matrix_out->data<int8_t>(),
-                 N, relu, bias);
+                 N, relu, bias, addOnRow);
    } else {
      gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
                 matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(),
-                 N, relu, bias);
+                 N, relu, bias, addOnRow);
    }
 #endif
  }

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1632,6 +1632,10 @@ class FusionFcParam : public OpParam {
    x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
    y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
    axis_ = GetAttr<int>("axis", attrs);
+
+#ifdef FUSION_FC_INT8_OP
+    scale_ = InputScaleFrom<GType>(inputs, scope);
+#endif
  }
  GType *InputX() const { return input_x_; }

@@ -1655,8 +1659,16 @@ class FusionFcParam : public OpParam {
  int x_num_col_dims_;
  int y_num_col_dims_;
  int axis_;
-#ifdef PADDLE_MOBILE_FPGA
+
+#ifdef FUSION_FC_INT8_OP
+ public:
+  const RType *InputScale() const { return scale_; }

+ private:
+  RType *scale_;
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
 private:
  fpga::SplitConvArgs fpga_conv_args;

@@ -1717,7 +1729,7 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
  typedef typename DtypeTensorTrait<DeviceType>::rtype RType;
  const RType *InputScale() const { return scale_; }

- protected:
+ private:
  RType *scale_;
 #endif
 };

--- a/test/common/test_gemm_accuracy.cpp
+++ b/test/common/test_gemm_accuracy.cpp
@@ -25,7 +25,7 @@ limitations under the License. */
 #define c(i, j) c[(i)*ldc + (j)]
 #define c1(i, j) c1[(i)*ldc + (j)]

-void print_matirx(int m, int n, int ldc, float *c) {
+void print_matrix(int m, int n, int ldc, float *c) {
  for (int i = 0; i < m; ++i) {
    std::cout << c(i, 0);
    for (int j = 1; j < n; ++j) {
@@ -98,18 +98,20 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {

  if (pr > 0) {
    std::cout << "A:" << std::endl;
-    print_matirx(m, k, lda, a);
+    print_matrix(m, k, lda, a);
    std::cout << "B:" << std::endl;
-    print_matirx(k, n, ldb, b);
+    print_matrix(k, n, ldb, b);
    std::cout << "C:" << std::endl;
-    print_matirx(m, n, ldc, c);
+    print_matrix(m, n, ldc, c);
    std::cout << "C1:" << std::endl;
-    print_matirx(m, n, ldc, c1);
+    print_matrix(m, n, ldc, c1);
  }

  std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
            << "   eq=" << eq << " neq=" << neq << std::endl;

+  PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm is failed!");
+
  paddle_mobile::memory::Free(a);
  paddle_mobile::memory::Free(b);
  paddle_mobile::memory::Free(c);

--- a/test/common/test_gemm_int8_accuracy.cpp
+++ b/test/common/test_gemm_int8_accuracy.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <iostream>
 #include <limits>
 #include <random>
+#include <type_traits>
 #include "../test_helper.h"
 #include "common/log.h"
 #include "memory/t_malloc.h"
@@ -33,24 +34,32 @@ limitations under the License. */
 using std::default_random_engine;
 using std::uniform_int_distribution;

-void print_matirx(int m, int n, int ldc, int32_t *c) {
+template <typename T>
+void print_matrix(int m, int n, int ldc, T *c) {
  for (int i = 0; i < m; ++i) {
+    if (std::is_same<T, int8_t>::value) {
+      std::cout.setf(std::ios::left);
+      std::cout.width(4);
+      std::cout << static_cast<int32_t>(c(i, 0));
+    } else {
+      std::cout.setf(std::ios::left);
+      std::cout.width(6);
      std::cout << c(i, 0);
-    for (int j = 1; j < n; ++j) {
-      std::cout << " | " << c(i, j);
-    }
-    std::cout << std::endl;
    }
-  std::cout << std::endl;
-}
-
-void print_matirx(int m, int n, int ldc, int8_t *c) {
-  for (int i = 0; i < m; ++i) {
-    std::cout << static_cast<int32_t>(c(i, 0));
    for (int j = 1; j < n; ++j) {
-      std::cout << " | " << static_cast<int32_t>(c(i, j));
+      if (std::is_same<T, int8_t>::value) {
+        std::cout << " | ";
+        std::cout.setf(std::ios::left);
+        std::cout.width(4);
+        std::cout << static_cast<int32_t>(c(i, j));
+      } else {
+        std::cout << " | ";
+        std::cout.setf(std::ios::left);
+        std::cout.width(6);
+        std::cout << c(i, j);
      }
-    std::cout << std::endl;
+    }
+    std::cout << "\n";
  }
  std::cout << std::endl;
 }
@@ -138,18 +147,20 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {

  if (pr > 0) {
    std::cout << "A:" << std::endl;
-    print_matirx(m, k, lda, a);
+    print_matrix(m, k, lda, a);
    std::cout << "B:" << std::endl;
-    print_matirx(k, n, ldb, b);
+    print_matrix(k, n, ldb, b);
    std::cout << "C:" << std::endl;
-    print_matirx(m, n, ldc, c);
+    print_matrix(m, n, ldc, c);
    std::cout << "C1:" << std::endl;
-    print_matirx(m, n, ldc, c1);
+    print_matrix(m, n, ldc, c1);
  }

  std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
            << "   eq=" << eq << " neq=" << neq << std::endl;

+  PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm is failed!");
+
  paddle_mobile::memory::Free(a);
  paddle_mobile::memory::Free(b);
  paddle_mobile::memory::Free(c);
@@ -158,7 +169,8 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
  return 0;
 }

-int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
+int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr,
+                       bool addOnRow = false) {
  int lda = k;
  int ldb = n;
  int ldc = n;
@@ -174,8 +186,14 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
  int8_t *c1 = static_cast<int8_t *>(
      paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n));

-  int32_t *bias =
-      static_cast<int32_t *>(paddle_mobile::memory::Alloc(sizeof(int32_t) * m));
+  int32_t *bias = nullptr;
+  if (addOnRow) {
+    bias = static_cast<int32_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int32_t) * n));
+  } else {
+    bias = static_cast<int32_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int32_t) * m));
+  }

  for (int i = 0; i < m * k; ++i) {
    a[i] = pixel(e);
@@ -183,6 +201,24 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
  for (int i = 0; i < k * n; ++i) {
    b[i] = pixel(e);
  }
+
+  if (addOnRow) {
+    for (int i = 0; i < n; ++i) {
+      bias[i] = static_cast<int32_t>(pixel(e));
+    }
+    for (int i = 0; i < m; ++i) {
+      for (int j = 0; j < n; ++j) {
+        int32_t bias_v = bias[j];
+        int32_t r = 0;
+        for (int p = 0; p < k; p++) {
+          r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
+        }
+        r = qadd_int32(r, bias_v);
+        if (relu) r = std::max(0, r);
+        c1(i, j) = qscale_int32(r, scale);
+      }
+    }
+  } else {
    for (int i = 0; i < m; ++i) {
      bias[i] = static_cast<int32_t>(pixel(e));
    }
@@ -198,14 +234,15 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
        c1(i, j) = qscale_int32(r, scale);
      }
    }
+  }

  paddle_mobile::operators::math::Gemm gemm;
 #ifdef _OPENMP
  gemm.Sgemm_omp(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
-                 relu, bias);
+                 relu, bias, addOnRow);
 #else
  gemm.Sgemm(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
-             relu, bias);
+             relu, bias, addOnRow);
 #endif
  int eq = 0;
  int neq = 0;
@@ -219,20 +256,27 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {

  if (pr > 0) {
    std::cout << "A:" << std::endl;
-    print_matirx(m, k, lda, a);
+    print_matrix(m, k, lda, a);
    std::cout << "B:" << std::endl;
-    print_matirx(k, n, ldb, b);
+    print_matrix(k, n, ldb, b);
    std::cout << "Bias:" << std::endl;
-    print_matirx(m, 1, 1, bias);
+    if (addOnRow) {
+      print_matrix(1, n, n, bias);
+    } else {
+      print_matrix(m, 1, 1, bias);
+    }
    std::cout << "C:" << std::endl;
-    print_matirx(m, n, ldc, c);
+    print_matrix(m, n, ldc, c);
    std::cout << "C1:" << std::endl;
-    print_matirx(m, n, ldc, c1);
+    print_matrix(m, n, ldc, c1);
  }

  std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
            << "   eq=" << eq << " neq=" << neq << std::endl;

+  PADDLE_MOBILE_ENFORCE(neq == 0,
+                        "The execution of do_sgemm_with_bias is failed!");
+
  paddle_mobile::memory::Free(a);
  paddle_mobile::memory::Free(b);
  paddle_mobile::memory::Free(c);
@@ -261,7 +305,7 @@ int main() {

  std::cout << "\n\n******************************************************\n\n"
            << std::endl;
-  std::cout << "Test gemm with bias:" << std::endl;
+  std::cout << "Test gemm with bias(bias is added on column):" << std::endl;
  do_sgemm_with_bias(9, 9, 9, false, 1);
  do_sgemm_with_bias(10, 6, 12, false, 0);
  do_sgemm_with_bias(512, 256, 384, false, 0);
@@ -272,6 +316,19 @@ int main() {
  do_sgemm_with_bias(333, 797, 939, false, 0);
  do_sgemm_with_bias(1024, 1024, 1024, false, 0);

+  std::cout << "\n\n******************************************************\n\n"
+            << std::endl;
+  std::cout << "Test gemm with bias(bias is added on row):" << std::endl;
+  do_sgemm_with_bias(9, 9, 9, false, 1, true);
+  do_sgemm_with_bias(10, 6, 12, false, 0, true);
+  do_sgemm_with_bias(512, 256, 384, false, 0, true);
+  do_sgemm_with_bias(1366, 768, 256, false, 0, true);
+  do_sgemm_with_bias(1255, 755, 333, false, 0, true);
+  do_sgemm_with_bias(599, 1133, 393, false, 0, true);
+  do_sgemm_with_bias(777, 555, 999, false, 0, true);
+  do_sgemm_with_bias(333, 797, 939, false, 0, true);
+  do_sgemm_with_bias(1024, 1024, 1024, false, 0, true);
+
  std::cout << "\n\n******************************************************\n\n"
            << std::endl;
  std::cout << "Test gemm with relu and bias:" << std::endl;

--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -49,7 +49,8 @@ int main() {
  auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
  auto ccptr_int32 = cc_int32.mutable_data<int32_t>({m, n});
  auto ccptr_int8 = cc_int8.mutable_data<int8_t>({m, n});
-  int32_t* bias_data = new int32_t[m];
+  int32_t* bias_data_col = new int32_t[m];
+  int32_t* bias_data_row = new int32_t[n];

  for (int i = 0; i < m * k; ++i) {
    aaptr_int8[i] = static_cast<int8_t>(2);
@@ -62,7 +63,11 @@ int main() {
  }

  for (int i = 0; i < m; ++i) {
-    bias_data[i] = 2;
+    bias_data_col[i] = 2;
+  }
+
+  for (int i = 0; i < n; ++i) {
+    bias_data_row[i] = 2;
  }

  // float
@@ -73,14 +78,15 @@ int main() {
        false, nullptr);
  }

-  auto time1 = time();
+  auto time_start0 = time();
  for (int j = 0; j < 10; ++j) {
    paddle_mobile::operators::math::matmul<float>(
        aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
        false, nullptr);
  }
-  auto time2 = time();
-  std::cout << "float gemm  cost :" << time_diff(time1, time2) / 10 << "ms\n";
+  auto time_end0 = time();
+  std::cout << "float gemm  cost :" << time_diff(time_start0, time_end0) / 10
+            << "ms\n";

  // int8_t without bias
  // warm-up 10 times
@@ -90,33 +96,69 @@ int main() {
        static_cast<float>(0));
  }

-  auto time3 = time();
+  auto time_start1 = time();
  for (int j = 0; j < 10; ++j) {
    paddle_mobile::operators::math::matmul<float, int32_t>(
        aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
        static_cast<float>(0));
  }
-  auto time4 = time();
-  std::cout << "int8_t gemm  cost :" << time_diff(time3, time4) / 10 << "ms\n";
+  auto time_end1 = time();
+  std::cout << "int8_t gemm  cost :" << time_diff(time_start1, time_end1) / 10
+            << "ms\n";
+
+  // int8_t with bias, column element wise add
+  // warm-up 10 times
+  for (int j = 0; j < 10; ++j) {
+    paddle_mobile::operators::math::matmul(
+        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
+        static_cast<float>(0), false, bias_data_col, false);
+  }
+  auto time_start2 = time();
+  for (int j = 0; j < 10; ++j) {
+    paddle_mobile::operators::math::matmul(
+        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
+        static_cast<float>(0), false, bias_data_col, false);
+  }
+  auto time_end2 = time();
+  std::cout << "int8_t gemm_with_bias(column add) cost :"
+            << time_diff(time_start2, time_end2) / 10 << "ms\n";
+
+  // int8_t with bias, row element wise add
+  // warm-up 10 times
+  for (int j = 0; j < 10; ++j) {
+    paddle_mobile::operators::math::matmul(
+        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
+        static_cast<float>(0), false, bias_data_row, true);
+  }
+  auto time_start3 = time();
+  for (int j = 0; j < 10; ++j) {
+    paddle_mobile::operators::math::matmul(
+        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
+        static_cast<float>(0), false, bias_data_row, true);
+  }
+  auto time_end3 = time();
+  std::cout << "int8_t gemm_with_bias(row add) cost :"
+            << time_diff(time_start3, time_end3) / 10 << "ms\n";

  // int8_t with bias&relu
  // warm-up 10 times
  for (int j = 0; j < 10; ++j) {
    paddle_mobile::operators::math::matmul(
        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
-        static_cast<float>(0), true, bias_data);
+        static_cast<float>(0), true, bias_data_col, false);
  }
-  auto time5 = time();
+  auto time_start4 = time();
  for (int j = 0; j < 10; ++j) {
    paddle_mobile::operators::math::matmul(
        aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
-        static_cast<float>(0), true, bias_data);
+        static_cast<float>(0), true, bias_data_col, false);
  }
-  auto time6 = time();
+  auto time_end4 = time();
  std::cout << "int8_t gemm_with_bias_relu cost :"
-            << time_diff(time5, time6) / 10 << "ms\n";
+            << time_diff(time_start4, time_end4) / 10 << "ms\n";

-  delete[] bias_data;
+  delete[] bias_data_row;
+  delete[] bias_data_col;

  return 0;
 }
--- a/test/operators/test_fusion_conv_add_relu_int8_op.cpp
+++ b/test/operators/test_fusion_conv_add_relu_int8_op.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef FUSION_CONVADDRELU_INT8_OP
-
 #include <iostream>
+
+#ifdef FUSION_CONVADDRELU_INT8_OP
 #include <limits>
 #include "../test_helper.h"
 #include "../test_include.h"
@@ -356,5 +356,9 @@ int main(int argc, char *argv[]) {
  paddle_mobile::TestConvOp<int8_t, 5, 2, 1>(in_channels, in_height, in_width,
                                             out_channels);
 }
-
+#else
+int main() {
+  std::cout << "FUSION_CONVADDRELU_INT8_OP is not defined!" << std::endl;
+  return 0;
+}
 #endif
--- a/test/operators/test_fusion_fc_op.cpp
+++ b/test/operators/test_fusion_fc_op.cpp
@@ -12,147 +12,163 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <framework/program/program-optimize/program_optimize.h>
+#include <iostream>
+#include <type_traits>
+#include "../test_helper.h"
 #include "../test_include.h"
+#include "framework/operator.h"
+#include "operators/fusion_fc_int8_op.h"
 #include "operators/fusion_fc_op.h"

+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c(i, j) c[(i)*ldc + (j)]
+
 namespace paddle_mobile {
-namespace framework {
+using framework::AttributeMap;
+using framework::DDim;
+using framework::Scope;
+using framework::make_ddim;
+
+int32_t qadd_int32(int32_t l, int32_t r) {
+  int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
+  if (res > std::numeric_limits<int32_t>::max())
+    return std::numeric_limits<int32_t>::max();
+  else if (res < std::numeric_limits<int32_t>::min())
+    return std::numeric_limits<int32_t>::min();
+  else
+    return static_cast<int32_t>(res);
+}

-template <typename Dtype>
-class TestFcOp {
- public:
-  explicit TestFcOp(const Program<Dtype> p) : program_(p) {
-    use_optimize_ = true;
-    if (use_optimize_) {
-      to_predict_program_ = program_.optimizeProgram;
+// round to zero
+float round2zero(float v) {
+  float res;
+  if (v > 0)
+    res = std::floor(v);
+  else if (v < 0)
+    res = std::ceil(v);
+  return res;
+}
+
+int8_t qscale_int32(int32_t v, float scale) {
+  float res = static_cast<float>(v) * scale;
+  res = round2zero(res);
+  if (res > 127)
+    return static_cast<int8_t>(127);
+  else if (res < -127)
+    return static_cast<int8_t>(-127);
+  else
+    return static_cast<int8_t>(res);
+}
+
+template <typename T, typename S>
+int TestFcOP() {
+  int32_t m = 377;
+  int32_t n = 1363;
+  int32_t k = 577;
+  int32_t lda = k;
+  int32_t ldb = n;
+  int32_t ldc = n;
+  DDim inputA_shape = make_ddim({m, k});
+  DDim inputB_shape = make_ddim({k, n});
+  DDim bias_shape = make_ddim({n});
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<Scope>();
+  inputs["X"] = std::vector<std::string>({"inputA"});
+  inputs["Y"] = std::vector<std::string>({"inputB"});
+  inputs["Z"] = std::vector<std::string>({"bias"});
+  inputs["Scale"] = std::vector<std::string>({"scale"});
+  outputs["Out"] = std::vector<std::string>({"output"});
+
+  auto inputA_var = scope.get()->Var("inputA");
+  auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<T>(inputA, inputA_shape, -127, 127);
+  auto inputB_var = scope.get()->Var("inputB");
+  auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<T>(inputB, inputB_shape, -127, 127);
+  auto bias_var = scope.get()->Var("bias");
+  auto bias = bias_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<S>(bias, bias_shape, -127, 127);
+
+  auto scale_var = scope.get()->Var("scale");
+  auto scale = scale_var->template GetMutable<framework::LoDTensor>();
+  scale->Resize(framework::make_ddim({1}));
+  float scale_v = 0.000828f;
+  scale->mutable_data<float>()[0] = scale_v;
+
+  auto output_var = scope.get()->Var("output");
+  AttributeMap attrs;
+  attrs["x_num_col_dims"].Set<int>(1);
+  attrs["y_num_col_dims"].Set<int>(1);
+  attrs["axis"].Set<int>(1);
+  operators::OperatorBase<CPU> *op = nullptr;
+#ifdef FUSION_FC_INT8_OP
+  if (std::is_same<T, int8_t>::value) {
+    op = new operators::FusionFcInt8Op<CPU, T>("fusion_fc_int8", inputs,
+                                               outputs, attrs, scope);
  } else {
-      to_predict_program_ = program_.originProgram;
+    op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
+                                           scope);
  }
-
-    const std::vector<std::shared_ptr<BlockDesc>> blocks =
-        to_predict_program_->Blocks();
-    //  DLOG << " **block size " << blocks.size();
-    for (int i = 0; i < blocks.size(); ++i) {
-      std::shared_ptr<BlockDesc> block_desc = blocks[i];
-      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      //    DLOG << " ops " << ops.size();
-      for (int j = 0; j < ops.size(); ++j) {
-        std::shared_ptr<OpDesc> op = ops[j];
-        if (op->Type() == "fc" && op->Input("X")[0] == "pool2d_13.tmp_0") {
-          DLOG << " fc attr size: " << op->GetAttrMap().size();
-          DLOG << " inputs size: " << op->GetInputs().size();
-          DLOG << " outputs size: " << op->GetOutputs().size();
-          DLOG << " Input X is : " << op->Input("X")[0];
-          DLOG << " Input Y is : " << op->Input("Y")[0];
-          DLOG << " Input Y is : " << op->Input("Z")[0];
-          DLOG << " Output Out is : " << op->Output("Out")[0];
-          std::shared_ptr<operators::FusionFcOp<Dtype, float>> testOp =
-              std::make_shared<operators::FusionFcOp<Dtype, float>>(
-                  op->Type(), op->GetInputs(), op->GetOutputs(),
-                  op->GetAttrMap(), program_.scope);
-          ops_of_block_[*block_desc.get()].push_back(testOp);
+#else
+  op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
+                                         scope);
+#endif
+  op->InferShape();
+  op->Run();
+  auto output = output_var->template Get<framework::LoDTensor>();
+  const T *output_data = output->data<T>();
+  // compare
+  T *c = static_cast<T *>(memory::Alloc(sizeof(T) * m * n));
+  T *a = inputA->data<T>();
+  T *b = inputB->data<T>();
+  S *bias_data = bias->data<S>();
+  for (int32_t i = 0; i < m; ++i) {
+    for (int32_t j = 0; j < n; ++j) {
+      S bias_v = bias_data[j];
+      if (std::is_same<T, int8_t>::value) {
+        int32_t r = 0;
+        for (int32_t p = 0; p < k; p++) {
+          r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
        }
+        r = qadd_int32(r, bias_v);
+        c(i, j) = qscale_int32(r, scale_v);
+      } else {
+        T r = 0;
+        for (int32_t p = 0; p < k; p++) {
+          r += a(i, p) * b(p, j);
        }
+        r += bias_v;
+        c(i, j) = r;
      }
    }
-
-  std::shared_ptr<Tensor> predict(const Tensor &t1, const Tensor &t2,
-                                  const Tensor &t3) {
-    // feed
-    auto scope = program_.scope;
-    Variable *x_feed_value = scope->Var("pool2d_13.tmp_0");
-    auto tensor_x = x_feed_value->GetMutable<LoDTensor>();
-    tensor_x->ShareDataWith(t1);
-
-    Variable *y_feed_value = scope->Var("loss3_classifier-loc_weights");
-    auto tensor_y = y_feed_value->GetMutable<LoDTensor>();
-    tensor_y->ShareDataWith(t2);
-
-    Variable *z_feed_value = scope->Var("loss3_classifier-loc_biases");
-    auto tensor_z = z_feed_value->GetMutable<LoDTensor>();
-    tensor_z->ShareDataWith(t3);
-
-    Variable *con_output = scope->Var("loss3_classifier-loc.tmp_1");
-    auto *output_tensor = con_output->GetMutable<LoDTensor>();
-    output_tensor->mutable_data<float>({3, 10});
-    //  DLOG << typeid(output_tensor).name();
-    //  DLOG << "output_tensor dims: " << output_tensor->dims();
-
-    std::shared_ptr<LoDTensor> out_tensor = std::make_shared<LoDTensor>();
-    out_tensor.reset(output_tensor);
-
-    predict(t1, t2, t3, 0);
-    return out_tensor;
  }

- private:
-  const framework::Program<Dtype> program_;
-  std::shared_ptr<ProgramDesc> to_predict_program_;
-  std::map<framework::BlockDesc,
-           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
-      ops_of_block_;
-  bool use_optimize_ = false;
-
-  void predict(const Tensor &t1, const Tensor &t2, const Tensor &t3,
-               int block_id) {
-    std::shared_ptr<BlockDesc> to_predict_block =
-        to_predict_program_->Block(block_id);
-    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
-      auto op = ops_of_block_[*to_predict_block.get()][j];
-      DLOG << "op -> run()";
-      op->Run();
-    }
-  }
-};
-
-template class TestFcOp<CPU>;
-}  // namespace framework
-}  // namespace paddle_mobile
-int main() {
-  DLOG << "----------**********----------";
-  DLOG << "begin to run Fc Test";
-  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
-  //    "../../../test/models/googlenet"
-  auto program = loader.Load(g_googlenet);
-  paddle_mobile::framework::ProgramOptimize optimize;
-  //  program.originProgram->Description("origin");
-  auto optimize_program = optimize.FusionOptimize(program.originProgram);
-
-  program.optimizeProgram = optimize_program;
-
-  if (optimize_program != nullptr) {
-    optimize_program->Description("optimize");
+  int32_t eq = 0;
+  int32_t neq = 0;
+  for (int32_t i = 0; i < m * n; ++i) {
+    PADDLE_MOBILE_ENFORCE(output_data[i] == c[i],
+                          "The execution of test_fusion_fc_op is failed!");
+    if (output_data[i] == c[i]) {
+      ++eq;
    } else {
-    LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null";
+      ++neq;
    }
-
-  /// input x (1,3,224,224)
-  paddle_mobile::framework::LoDTensor inputx;
-  SetupTensor<float>(&inputx, {3, 64, 1, 1}, static_cast<float>(1),
-                     static_cast<float>(1));
-  auto *inputx_ptr = inputx.data<float>();
-  /// input y (224,)
-  paddle_mobile::framework::LoDTensor inputy;
-  SetupTensor<float>(&inputy, {64, 10}, static_cast<float>(1.5),
-                     static_cast<float>(1.5));
-  auto *inputy_ptr = inputy.data<float>();
-
-  paddle_mobile::framework::LoDTensor inputz;
-  SetupTensor<float>(&inputz, {10}, static_cast<float>(0),
-                     static_cast<float>(1));
-  auto *inputz_ptr = inputz.data<float>();
-
-  paddle_mobile::framework::TestFcOp<paddle_mobile::CPU> testFcOp(program);
-
-  auto output = testFcOp.predict(inputx, inputy, inputz);
-  auto *output_ptr = output->data<float>();
-  for (int j = 0; j < output->numel(); ++j) {
-    DLOG << "value of output: " << output_ptr[j];
  }
+  std::cout << "mnk=" << m << " " << n << " " << k << "   eq=" << eq
+            << " neq=" << neq << std::endl;
+  delete op;
+  return 0;
+}
+}  // namespace paddle_mobile

-  DLOG << "1 (3,64) * 2 (64,10) = 96(3,10)";
-  DLOG << "output : 96(3,10) + bias(10)";
-
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+#ifdef FUSION_FC_INT8_OP
+  paddle_mobile::TestFcOP<int8_t, int32_t>();
+#endif
+  paddle_mobile::TestFcOP<float, float>();
  return 0;
 }
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -214,6 +214,7 @@ if(NOT FOUND_MATCH)
  set(FUSION_CONVADDPRELU_OP ON)
  set(FUSION_CONVADDRELU_OP ON)
  set(FUSION_CONVADDRELU_INT8_OP ON)
+  set(FUSION_FC_INT8_OP ON)
  set(FUSION_FC_OP ON)
  set(LRN_OP ON)
  set(MUL_OP ON)
@@ -322,6 +323,9 @@ endif()
 if (FUSION_FC_OP)
  add_definitions(-DFUSION_FC_OP)
 endif()
+if(FUSION_FC_INT8_OP)
+  add_definitions(-DFUSION_FC_INT8_OP)
+endif()
 if (LRN_OP)
  add_definitions(-DLRN_OP)
 endif()