Commit 19998c90 authored by lijiancheng0614

merge develop

......@@ -202,6 +202,12 @@ LOAD_OP1(multiclass_nms, CPU);
#ifdef POLYGONBOXTRANSFORM_OP
LOAD_OP1(polygon_box_transform, CPU);
#endif
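// LOAD_OP1/LOAD_OP2 pull in an operator's registration for one or two device
// types; each entry stays behind its op's #ifdef so disabled ops are not linked.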
#ifdef SUM_OP
LOAD_OP1(sum, CPU);
#endif
#ifdef ELEMENTWISEMUL_OP
LOAD_OP1(elementwise_mul, CPU);
#endif
#ifdef SLICE_OP
LOAD_OP2(slice, CPU, MALI_GPU);
#endif
......@@ -209,5 +215,8 @@ LOAD_OP2(slice, CPU, MALI_GPU);
LOAD_OP2(fusion_conv_bn, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn);
#endif
#ifdef ELEMENTWISESUB_OP
LOAD_OP1(elementwise_sub, CPU);
#endif
LOAD_OP1(quantize, CPU);
LOAD_OP1(dequantize, CPU);
......@@ -32,7 +32,7 @@ template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetInputKeys() const {
auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) {
- DLOG << type_ << " has no outputs";
+ DLOG << type_ << " has no inputs";
return {};
}
return it->second.first;
......
......@@ -338,10 +338,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
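// The printer samples the tensor at a fixed stride and dispatches on the stored
// element type; int8_t is widened below so values print as numbers, not characters.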
for (int i = 0; i < tensor.numel(); i += stride) {
if (tensor.type() == typeid(float)) {
printer << tensor.data<float>()[i] << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) {
- printer << tensor.data<int8_t>()[i] << " ";
+ printer << static_cast<int32_t>(tensor.data<int8_t>()[i]) << " ";
}
}
#endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#include "operators/elementwise_sub_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ElementwiseSubOp<Dtype, T>::InferShape() const {
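// The output keeps X's shape: elementwise_sub broadcasts Y against X at compute time.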
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(elementwise_sub, ops::ElementwiseSubOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "kernel/elementwise_sub_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ElementwiseSubOp : public framework::OperatorWithKernel<
DeviceType, ElementwiseSubParam<DeviceType>,
operators::ElementwiseSubKernel<DeviceType, T>> {
public:
ElementwiseSubOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseSubParam<DeviceType>,
operators::ElementwiseSubKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ElementwiseSubParam<DeviceType>,
operators::ElementwiseSubKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#include "operators/kernel/elementwise_sub_kernel.h"
#include "operators/kernel/central-arm-func/elementwise_sub_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ElementwiseSubKernel<CPU, float>::Init(ElementwiseSubParam<CPU> *param) {
return true;
}
template <>
void ElementwiseSubKernel<CPU, float>::Compute(
const ElementwiseSubParam<CPU> &param) const {
ElementwiseSubCompute<float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const {
param.Out()->set_lod(param.InputX()->lod());
}
template class MulKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#pragma once
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename T>
struct SubFunctor {
inline T operator()(T a, T b) const { return a - b; }
};
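// ElementwiseComputeEx applies the functor element-by-element, broadcasting Y over X
// starting at the given axis - the same helper the other elementwise ops use.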
template <typename P>
void ElementwiseSubCompute(const ElementwiseSubParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
int axis = param.Axis();
ElementwiseComputeEx<SubFunctor<float>, float>(input_x, input_y, axis,
SubFunctor<float>(), Out);
}
template class ElementwiseSubKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -58,7 +58,7 @@ void MulCompute(const MulParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *out = param.Out();
- out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
......@@ -71,15 +71,21 @@ void MulCompute(const MulParam<CPU> &param) {
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
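// int8 inputs accumulate into an int32 output so products cannot overflow the
// element type; the float path keeps a float output.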
- math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
- out, static_cast<float>(0));
+ if (param.InputX()->type() == typeid(int8_t)) {
+ out->mutable_data<int32_t>();
+ math::matmul<int8_t>(x_matrix, false, y_matrix, false,
+ static_cast<int8_t>(1), out, static_cast<int8_t>(0));
+ } else {
+ out->mutable_data<float>();
+ math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+ out, static_cast<float>(0));
+ }
if (out_dim.size() != 2) {
out->Resize(out_dim);
}
}
template class MulKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -27,13 +27,11 @@ void SumCompute(const SumParam<CPU> &param) {
auto *outvar = param.OutVar();
bool in_place = outvar == inputsvars[0];
DLOG << "11:";
if (outvar->IsType<framework::LoDTensor>()) {
auto *out = outvar->GetMutable<LoDTensor>();
if (!in_place) {
out->mutable_data<float>();
}
DLOG << "1:";
auto *outptr = out->data<float>();
// auto result = Flatten(*out);
......@@ -62,7 +60,6 @@ void SumCompute(const SumParam<CPU> &param) {
}
} else if (outvar->IsType<framework::SelectedRows>()) {
DLOG << "2:";
std::unique_ptr<framework::SelectedRows> in0;
if (in_place) {
// If is in_place, we store the input[0] to in0
......@@ -119,12 +116,12 @@ void SumCompute(const SumParam<CPU> &param) {
if (sel_row.rows().size() == 0) {
continue;
}
- PADDLE_MOBILE_ENFORCE(out->height() == sel_row.height());
+ PADDLE_MOBILE_ENFORCE(out->height() == sel_row.height(),
+ "selected rows height != out height");
functor(sel_row, offset, out);
offset += sel_row.value().numel();
}
} else if (outvar->IsType<LoDTensorArray>()) {
DLOG << "3:";
auto &out_array = *outvar->GetMutable<LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < inputsvars.size(); ++i) {
PADDLE_MOBILE_ENFORCE(inputsvars[i]->IsType<LoDTensorArray>(),
......@@ -140,7 +137,8 @@ void SumCompute(const SumParam<CPU> &param) {
framework::TensorCopy((*in_array)[i], &out_array[i]);
out_array[i].set_lod((*in_array)[i].lod());
} else {
- PADDLE_MOBILE_ENFORCE(out_array[i].lod() == (*in_array)[i].lod());
+ PADDLE_MOBILE_ENFORCE(out_array[i].lod() == (*in_array)[i].lod(),
+ "out LoD != in LoD");
auto *inptr = (*in_array)[i].data<float>();
auto *outptr = out_array[i].data<float>();
......@@ -152,9 +150,7 @@ void SumCompute(const SumParam<CPU> &param) {
}
}
} else {
DLOG << "2:";
if (outvar->IsType<framework::Tensor>()) {
DLOG << "3: ";
}
PADDLE_MOBILE_THROW_EXCEPTION(
"Unexpected branch, output variable type is %s", outvar->Type().name());
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#pragma once
#include "framework/operator.h"
#include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class ElementwiseSubKernel
: public framework::OpKernelBase<DeviceType,
ElementwiseSubParam<DeviceType>> {
public:
void Compute(const ElementwiseSubParam<DeviceType> &param) const;
bool Init(ElementwiseSubParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -3662,7 +3662,7 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
b_ptr = b;
int kc1 = k / 8;
int kc2 = k % 8;
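// step is consumed by the assembly as a byte stride between output rows of C:
// ldc elements per row times sizeof(float) bytes per element.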
- int step = 4 * ldc;
+ int step = sizeof(float) * ldc;
asm volatile(
"pld [%[a_ptr]] \n\t"
"pld [%[a_ptr], #64] \n\t"
......@@ -3866,11 +3866,10 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [step] "r"(step)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else
#endif // __ARM_NEON
}
......
......@@ -96,6 +96,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
/*
// vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
......@@ -139,6 +140,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1);
/*
// write back the vector-matrix multiplication result
// C = A * B
......@@ -185,15 +187,63 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
// 8 bits function cluster begins
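// Rough data flow suggested by these declarations: PackMatrixA_6r / PackMatrixB_8c
// repack A and B into 6-row / 8-column int8 panels, Sgemm drives the blocked
// multiply, AddDot6x8 produces one 6x8 int32 micro-tile, and the Write* variants
// copy tiles back with the chosen epilogue (add, bias, relu).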
// 8 bits int small block inner product
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
// 8 bits int inner product
void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
const int8_t *a, const int8_t *b, int8_t beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int8_t *bias);
// 8 bits int pack function
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
// 8 bits int matrix product
void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
int32_t ldc, bool relu, int8_t *bias);
// 8 bits int write back
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + C
void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias
void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias, relu(C)
void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
private:
int MC = 0;
int KC = 0;
int NC = 0;
// 32-bit float
float *packedA;
float *packedB;
float *packedC;
float *zero;
// 8 bits int
int8_t *packedA_int8;
int8_t *packedB_int8;
int32_t *packedC_int8;
int8_t *zero_int8;
};
} // namespace math
......
This diff is collapsed.
......@@ -135,7 +135,7 @@ template <typename T>
struct ClearTensor<CPU, T> {
void operator()(framework::Tensor *tensor) {
auto size = tensor->numel();
- auto *tensor_data = tensor->data<float>();
+ auto *tensor_data = tensor->data<T>();
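// Reading through data<T>() keeps the element size correct for non-float tensors (e.g. int8).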
memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT
}
};
......@@ -151,9 +151,9 @@ struct RowwiseAdd<CPU, T> {
PADDLE_MOBILE_ENFORCE((output->dims() == in_dims),
"output->dims() must be equal to in_dims.");
- auto *input_data = input.data<float>();
- auto *out_data = output->data<float>();
- auto *vec_data = vector.data<float>();
+ auto *input_data = input.data<T>();
+ auto *out_data = output->data<T>();
+ auto *vec_data = vector.data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) {
out_data[i * size + j] = input_data[i * size + j] + vec_data[j];
......
......@@ -25,7 +25,7 @@ template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
- float *bias = nullptr);
+ T *bias = nullptr);
template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstring>
#include <string>
#include "operators/math/gemm.h"
#include "operators/math/math_function.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <>
void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
int8_t alpha, framework::Tensor *matrix_out, int8_t beta,
bool relu, int8_t *bias) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
int32_t M = dim_out[0];
int32_t N = dim_out[1];
int32_t K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemm gemm;
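// This int8 Sgemm takes no transpose flags, so a transposed A is first copied
// into plain row-major order before the multiply.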
if (trans_a) {
int32_t numel = matrix_a.numel();
int32_t m = matrix_a.dims()[0];
int32_t n = matrix_a.dims()[1];
int8_t *tmp = (int8_t *)(matrix_a.data<int8_t>()); // NOLINT
int8_t *a = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * numel));
int32_t index = 0;
for (int32_t j = 0; j < n; j++) {
for (int32_t i = 0; i < m; i++) {
a[index++] = tmp[i * n + j];
}
}
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
} else {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
relu, bias);
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -47,7 +47,7 @@ struct SelectedRowsAddTo {
const int64_t input2_offset,
framework::SelectedRows* input2) {
auto in1_height = input1.height();
- PADDLE_MOBILE_ENFORCE(in1_height == input2->height());
+ PADDLE_MOBILE_ENFORCE(in1_height == input2->height(), "height error");
auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows());
......@@ -77,13 +77,14 @@ struct SelectedRowsAddToTensor {
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
- PADDLE_MOBILE_ENFORCE(in1_height == in2_dims[0]);
+ PADDLE_MOBILE_ENFORCE(in1_height == in2_dims[0], "height != dims[0]");
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
- PADDLE_MOBILE_ENFORCE(in1_row_numel == input2->numel() / in1_height);
+ PADDLE_MOBILE_ENFORCE(in1_row_numel == input2->numel() / in1_height,
+ "row_numel error");
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
......
......@@ -32,7 +32,8 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
LOG(kLOG_ERROR) << "Predict bboxes must be equal";
}
// pre size, will change in Compute.
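// Each detection row holds the box coordinates plus a class label and a score,
// hence box dim + 2 columns in the pre-sized output.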
- this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6}));
+ this->param_.Out()->Resize(
+ framework::make_ddim({input_bboxes_dims[1], input_bboxes_dims[2] + 2}));
}
} // namespace operators
......
......@@ -471,15 +471,6 @@ class ElementwiseMulParam : OpParam {
GType *input_y_;
GType *out_;
int axis_;
- #ifdef PADDLE_MOBILE_FPGA
- private:
- fpga::EWMulArgs fpga_EW_mul_args;
- public:
- const fpga::EWMulArgs &FpgaArgs() const { return fpga_EW_mul_args; }
- void SetFpgaArgs(const fpga::EWMulArgs &args) { fpga_EW_mul_args = args; }
- #endif
};
#endif
......@@ -488,6 +479,38 @@ template <typename Dtype>
using ElementwiseAddReluParam = ElementwiseAddParam<Dtype>;
#endif
#ifdef ELEMENTWISESUB_OP
template <typename Dtype>
class ElementwiseSubParam : OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
ElementwiseSubParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
input_y_ = InputYFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
axis_ = GetAttr<int>("axis", attrs);
}
const GType *InputX() const { return input_x_; }
const GType *InputY() const { return input_y_; }
GType *Out() const { return out_; }
const int &Axis() const { return axis_; }
private:
GType *input_x_;
GType *input_y_;
GType *out_;
int axis_;
};
#endif
#ifdef MUL_OP
template <typename Dtype>
class MulParam : OpParam {
......@@ -596,15 +619,6 @@ class SumParam : public OpParam {
Variable *out_var_;
vector<GType *> inputs_;
GType *out_;
- #ifdef PADDLE_MOBILE_FPGA
- private:
- fpga::SumArgs fpga_sum_args;
- public:
- const fpga::SumArgs &FpgaArgs() const { return fpga_sum_args; }
- void SetFpgaArgs(const fpga::SumArgs &args) { fpga_sum_args = args; }
- #endif
};
#endif
......
......@@ -173,6 +173,14 @@ if (NOT FOUND_MATCH)
target_link_libraries(test-elementwiseadd-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-elementwisesub-op operators/test_elementwise_sub_op.cpp test_helper.h test_include.h)
target_link_libraries(test-elementwisesub-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-im2sequence-op operators/test_im2sequence_op.cpp test_helper.h test_include.h)
target_link_libraries(test-im2sequence-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
target_link_libraries(test-concat-op paddle-mobile)
......@@ -262,6 +270,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
target_link_libraries(test-gemm-accuracy paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-int8-accuracy common/test_gemm_int8_accuracy.cpp)
target_link_libraries(test-gemm-int8-accuracy paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp)
target_link_libraries(test-gemm-perf paddle-mobile)
......
......@@ -84,7 +84,7 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
}
paddle_mobile::operators::math::Gemm gemm;
- gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
+ gemm.SgemmWithBn(m, n, k, 1, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
nullptr);
int eq = 0;
int neq = 0;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
using std::default_random_engine;
using std::uniform_int_distribution;
void print_matirx(int m, int n, int ldc, int32_t *c) {
for (int i = 0; i < m; ++i) {
std::cout << c(i, 0);
for (int j = 1; j < n; ++j) {
std::cout << " | " << c(i, j);
}
std::cout << std::endl;
}
std::cout << std::endl;
}
void print_matirx(int m, int n, int ldc, int8_t *c) {
for (int i = 0; i < m; ++i) {
std::cout << static_cast<int32_t>(c(i, 0));
for (int j = 1; j < n; ++j) {
std::cout << " | " << static_cast<int32_t>(c(i, j));
}
std::cout << std::endl;
}
std::cout << std::endl;
}
int do_sgemm(int m, int n, int k, bool relu, int pr) {
int lda = k;
int ldb = n;
int ldc = n;
default_random_engine e;
uniform_int_distribution<int8_t> pixel(-127, 127);
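// Note: the C++ standard does not list char types among the allowed IntTypes for
// uniform_int_distribution; int8_t works on common toolchains but is not portable.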
int8_t *a = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * k));
int8_t *b = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * k * n));
int32_t *c = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n));
int32_t *c1 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n));
for (int i = 0; i < m * k; ++i) {
a[i] = pixel(e);
}
for (int i = 0; i < k * n; ++i) {
b[i] = pixel(e);
}
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
c1(i, j) = r;
}
}
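// c1 now holds a naive int32 reference product to check the NEON kernel's output against.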
paddle_mobile::operators::math::Gemm gemm;
gemm.Sgemm(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
static_cast<int8_t>(0), c, ldc, relu, nullptr);
int eq = 0;
int neq = 0;
for (int i = 0; i < m * n; ++i) {
if (c[i] == c1[i]) {
++eq;
} else {
++neq;
}
}
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
return 0;
}
int main() {
do_sgemm(9, 9, 9, false, 10);
do_sgemm(10, 6, 12, false, 0);
do_sgemm(512, 256, 384, false, 0);
do_sgemm(1366, 768, 256, false, 0);
do_sgemm(1255, 755, 333, false, 0);
do_sgemm(555, 777, 999, false, 0);
do_sgemm(1024, 1024, 1024, false, 0);
return 0;
}
......@@ -28,13 +28,11 @@ limitations under the License. */
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
- paddle_mobile.SetThreadNum(4);
- Tensor aa, bb, cc, scale, bias;
+ paddle_mobile.SetThreadNum(1);
+ Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n});
auto ccptr = cc.mutable_data<float>({m, n});
- auto scaleptr = scale.mutable_data<float>({m});
- auto biasptr = bias.mutable_data<float>({m});
for (int i = 0; i < m * k; ++i) {
aaptr[i] = 2;
......@@ -45,23 +43,55 @@ int main() {
for (int i = 0; i < m * n; ++i) {
ccptr[i] = 2;
}
- for (int i = 0; i < m; ++i) {
- scaleptr[i] = 1;
- biasptr[i] = 0;
+ Tensor aa_int8, bb_int8, cc_int8;
+ auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
+ auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
+ auto ccptr_int8 = cc_int8.mutable_data<int32_t>({m, n});
+ for (int i = 0; i < m * k; ++i) {
+ aaptr_int8[i] = static_cast<int8_t>(2);
+ }
+ for (int i = 0; i < k * n; ++i) {
+ bbptr_int8[i] = static_cast<int8_t>(2);
+ }
+ for (int i = 0; i < m * n; ++i) {
+ ccptr_int8[i] = static_cast<int32_t>(2);
+ }
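// The int8 matmul accumulates into 32-bit values, so cc_int8 is allocated as int32_t.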
- auto time1 = time();
+ // float
+ // warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
- false, biasptr);
+ false, nullptr);
}
- // paddle_mobile::operators::math::matmulWithBn<float>(
- //     aa, false, bb, false, static_cast<float>(1), &cc,
- //     static_cast<float>(0), true, &scale, &bias, 0);
+ auto time1 = time();
+ for (int j = 0; j < 10; ++j) {
+ paddle_mobile::operators::math::matmul<float>(
+ aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
+ false, nullptr);
+ }
auto time2 = time();
- std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
+ std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
+ // int8_t
+ // warm-up 10 times
+ for (int j = 0; j < 10; ++j) {
+ paddle_mobile::operators::math::matmul<int8_t>(
+ aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
+ static_cast<int8_t>(0), false, nullptr);
+ }
+ auto time3 = time();
+ for (int j = 0; j < 10; ++j) {
+ paddle_mobile::operators::math::matmul<int8_t>(
+ aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
+ static_cast<int8_t>(0), false, nullptr);
+ }
+ auto time4 = time();
+ std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/elementwise_sub_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestElementwiseSubOp {
public:
explicit TestElementwiseSubOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "elementwise_sub" &&
op->Input("X")[0] == "sigmoid_1.tmp_0") {
DLOG << " elementwise_sub attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
std::shared_ptr<operators::ElementwiseSubOp<Dtype, float>> lrn =
std::make_shared<operators::ElementwiseSubOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(lrn);
}
}
}
}
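// predict_bn feeds the two inputs into the scope, pre-allocates the output
// tensor, and then runs every matched elementwise_sub op in block 0.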
std::shared_ptr<Tensor> predict_bn(const Tensor &t1, const Tensor &t2) {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *x2_feed_value = scope->Var("sigmoid_1.tmp_0");
auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *output = scope->Var("tmp_1");
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({1, 1, 6, 6});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict_bn(t1, t2, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict_bn(const Tensor &t1, const Tensor &t2, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestElementwiseSubOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run ElementwiseSub Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_ocr) + "/model",
std::string(g_ocr) + "/params");
/// input x1 (1,1,6,6)
paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {1, 1, 6, 6}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>();
/// input x2 (1,1,6,6)
paddle_mobile::framework::Tensor inputx2;
SetupTensor<float>(&inputx2, {1, 1, 6, 6}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx2_ptr = inputx2.data<float>();
paddle_mobile::framework::TestElementwiseSubOp<paddle_mobile::CPU>
testElementwiseSubOp(program);
auto output_op = testElementwiseSubOp.predict_bn(inputx1, inputx2);
auto *output_op_ptr = output_op->data<float>();
auto inputx1_dim = inputx1.numel() / inputx1.dims()[0];
DLOG << " input1 : ";
for (int i = 0; i < inputx1.dims()[0]; ++i) {
for (int j = 0; j < inputx1_dim; ++j) {
DLOGF("%f ", inputx1_ptr[i * inputx1_dim + j]);
}
DLOGF("\n");
}
auto inputx2_dim = inputx2.numel() / inputx2.dims()[0];
DLOG << " input2 : ";
for (int i = 0; i < inputx2.dims()[0]; ++i) {
for (int j = 0; j < inputx2_dim; ++j) {
DLOGF("%f ", inputx2_ptr[i * inputx2_dim + j]);
}
DLOGF("\n");
}
auto output_dim = output_op->numel() / output_op->dims()[0];
DLOG << " output : ";
for (int i = 0; i < output_op->dims()[0]; ++i) {
for (int j = 0; j < output_dim; ++j) {
DLOGF("%f ", output_op_ptr[i * output_dim + j]);
}
DLOGF("\n");
}
return 0;
}
......@@ -12,51 +12,129 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../executor_for_test.h"
#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/im2sequence_op.h"
- int main() {
- paddle_mobile::Loader<paddle_mobile::CPU> loader;
- auto program = loader.Load(g_ocr_recg);
- PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
- "program file read fail");
+ namespace paddle_mobile {
+ namespace framework {
- Executor4Test<paddle_mobile::CPU,
- paddle_mobile::operators::ReluOp<paddle_mobile::CPU, float>>
- executor(program, "im2sequence");
+ template <typename Dtype>
+ class TestIm2SequenceOp {
+ public:
+ explicit TestIm2SequenceOp(const Program<Dtype> p) : program_(p) {
+ if (use_optimize_) {
+ to_predict_program_ = program_.optimizeProgram;
+ } else {
+ to_predict_program_ = program_.originProgram;
+ }
- // 1. input_tensors;
- vector<Tensor> input_tensors;
+ const std::vector<std::shared_ptr<BlockDesc>> blocks =
+ to_predict_program_->Blocks();
+ // DLOG << " **block size " << blocks.size();
+ for (int i = 0; i < blocks.size(); ++i) {
+ std::shared_ptr<BlockDesc> block_desc = blocks[i];
+ std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+ // DLOG << " ops " << ops.size();
+ for (int j = 0; j < ops.size(); ++j) {
+ std::shared_ptr<OpDesc> op = ops[j];
+ if (op->Type() == "im2sequence" &&
+ op->Input("X")[0] == "conv2d_19.tmp_1") {
+ DLOG << " im2sequence attr size: " << op->GetAttrMap().size();
+ DLOG << " inputs size: " << op->GetInputs().size();
+ DLOG << " outputs size: " << op->GetOutputs().size();
- Tensor input1;
- auto input1_data = CreateInput<float>(&input1, {2, 2, 3, 3}, -1, 1);
- input_tensors.push_back(input1);
+ std::shared_ptr<operators::Im2SequenceOp<Dtype, float>> lrn =
+ std::make_shared<operators::Im2SequenceOp<Dtype, float>>(
+ op->Type(), op->GetInputs(), op->GetOutputs(),
+ op->GetAttrMap(), program_.scope);
+ ops_of_block_[*block_desc.get()].push_back(lrn);
+ }
+ }
+ }
+ }
- // 2. input_names
- vector<string> input_names({
- "conv2d_19.tmp_1",
- });
+ std::shared_ptr<Tensor> predict_bn(const Tensor &t1) {
+ // feed
+ auto scope = program_.scope;
+ Variable *x1_feed_value = scope->Var("conv2d_19.tmp_1");
+ auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
+ tensor_x1->ShareDataWith(t1);
- // 3. output_names
- vector<string> output_names({"im2sequence_0.tmp_0"});
+ Variable *output = scope->Var("im2sequence_0.tmp_0");
+ auto *output_tensor = output->GetMutable<LoDTensor>();
+ output_tensor->mutable_data<float>({2, 12});
+ // DLOG << typeid(output_tensor).name();
+ // DLOG << "output_tensor dims: " << output_tensor->dims();
- // 4. out_dims;
- vector<DDim> out_ddims;
- auto out_ddim = paddle_mobile::framework::make_ddim({8, 9});
- out_ddims.push_back(out_ddim);
+ std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+ out_tensor.reset(output_tensor);
- auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
- output_names, out_ddims);
+ predict_bn(t1, 0);
+ return out_tensor;
+ }
- auto output0_data = output[0]->data<float>();
+ private:
+ const framework::Program<Dtype> program_;
+ std::shared_ptr<ProgramDesc> to_predict_program_;
+ std::map<framework::BlockDesc,
+ std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+ ops_of_block_;
+ bool use_optimize_ = false;
- for (int j = 0; j < input_tensors[0].numel(); ++j) {
- DLOG << " value of input: " << input1_data[j];
+ void predict_bn(const Tensor &t1, int block_id) {
+ std::shared_ptr<BlockDesc> to_predict_block =
+ to_predict_program_->Block(block_id);
+ for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
+ auto op = ops_of_block_[*to_predict_block.get()][j];
+ DLOG << "op -> run()";
+ op->Run();
+ }
+ }
+ };
+ template class TestIm2SequenceOp<CPU>;
+ } // namespace framework
+ } // namespace paddle_mobile
- for (int j = 0; j < output[0]->numel(); ++j) {
- DLOG << " value of output: " << output0_data[j];
+ int main() {
+ DLOG << "----------**********----------";
+ DLOG << "begin to run Im2Sequence Test";
+ paddle_mobile::Loader<paddle_mobile::CPU> loader;
+ auto program = loader.Load(std::string(g_eng) + "/model",
+ std::string(g_eng) + "/params");
+ /// input x (1, 2, 6, 2)
+ paddle_mobile::framework::Tensor inputx;
+ SetupTensor<float>(&inputx, {1, 2, 6, 2}, static_cast<float>(0),
+ static_cast<float>(1));
+ auto *inputx_ptr = inputx.data<float>();
+ paddle_mobile::framework::TestIm2SequenceOp<paddle_mobile::CPU>
+ testIm2SequenceOp(program);
+ auto output_op = testIm2SequenceOp.predict_bn(inputx);
+ auto *output_op_ptr = output_op->data<float>();
+ auto input_dim = inputx.numel() / inputx.dims()[0];
+ DLOG << " input : ";
+ for (int i = 0; i < inputx.dims()[0]; ++i) {
+ for (int j = 0; j < input_dim; ++j) {
+ DLOGF("%f ", inputx_ptr[i * input_dim + j]);
+ }
+ DLOGF("\n");
+ }
+ auto output_dim = output_op->numel() / output_op->dims()[0];
+ DLOG << " output : ";
+ for (int i = 0; i < output_op->dims()[0]; ++i) {
+ for (int j = 0; j < output_dim; ++j) {
+ DLOGF("%f ", output_op_ptr[i * output_dim + j]);
+ }
+ DLOGF("\n");
+ }
+ return 0;
+ }
......@@ -12,80 +12,89 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
- #include <stdint-gcc.h>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
- int main() {
- paddle_mobile::Loader<paddle_mobile::CPU> loader;
- auto program = loader.Load(g_resnet);
- PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
- "program file read fail");
- Executor4Test<paddle_mobile::CPU,
- paddle_mobile::operators::MulOp<paddle_mobile::CPU, float>>
- executor(program, "mul");
- // 1. input_tensors;
- vector<Tensor> input_tensors;
- Tensor input1;
- auto input1_data = CreateInput<float>(&input1, {3, 2, 1, 1}, 0, 1);
- input_tensors.push_back(input1);
- Tensor input2;
- auto input2_data = CreateInput<float>(&input2, {2, 3}, 0, 1);
- input_tensors.push_back(input2);
- // 2. input_names
- vector<string> input_names({
- "pool2d_0.tmp_0",
- "fc_0.w_0",
- });
- // 3. output_names
- vector<string> output_names({"fc_0.tmp_0"});
- // 4. out_dims;
- vector<DDim> out_ddims;
- auto out_ddim = paddle_mobile::framework::make_ddim({3, 3});
- out_ddims.push_back(out_ddim);
- auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
- output_names, out_ddims);
- auto output0_data = output[0]->data<float>();
- auto dim_1 = input1.numel() / input1.dims()[0];
- DLOG << " input1 : ";
- for (int i = 0; i < input1.dims()[0]; ++i) {
- for (int j = 0; j < dim_1; ++j) {
- DLOGF("%f ", input1_data[i * dim_1 + j]);
- }
- DLOGF("\n");
- }
- auto dim_2 = input2.numel() / input2.dims()[0];
- DLOG << " input2 : ";
- for (int i = 0; i < input2.dims()[0]; ++i) {
- for (int j = 0; j < dim_2; ++j) {
- DLOGF("%f ", input2_data[i * dim_2 + j]);
+ #define a(i, j) a[(i)*lda + (j)]
+ #define b(i, j) b[(i)*ldb + (j)]
+ #define c(i, j) c[(i)*ldc + (j)]
+ namespace paddle_mobile {
+ using framework::AttributeMap;
+ using framework::DDim;
+ using framework::Scope;
+ using framework::make_ddim;
+ template <typename I, typename O>
+ int TestMulOP() {
+ int32_t m = 1024;
+ int32_t n = 1024;
+ int32_t k = 1024;
+ int32_t lda = k;
+ int32_t ldb = n;
+ int32_t ldc = n;
+ DDim inputA_shape = make_ddim({m, k});
+ DDim inputB_shape = make_ddim({k, n});
+ VariableNameMap inputs;
+ VariableNameMap outputs;
+ auto scope = std::make_shared<Scope>();
+ inputs["X"] = std::vector<std::string>({"inputA"});
+ inputs["Y"] = std::vector<std::string>({"inputB"});
+ outputs["Out"] = std::vector<std::string>({"output"});
+ auto inputA_var = scope.get()->Var("inputA");
+ auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
+ SetupTensor<I>(inputA, inputA_shape, -127, 127);
+ auto inputB_var = scope.get()->Var("inputB");
+ auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
+ SetupTensor<I>(inputB, inputB_shape, -127, 127);
+ auto output_var = scope.get()->Var("output");
+ AttributeMap attrs;
+ attrs["x_num_col_dims"].Set<int>(1);
+ attrs["y_num_col_dims"].Set<int>(1);
+ auto *op =
+ new operators::MulOp<CPU, float>("mul", inputs, outputs, attrs, scope);
+ op->InferShape();
+ op->Run();
+ auto output = output_var->template Get<framework::LoDTensor>();
+ const O *output_data = output->data<O>();
+ // compare
+ O *c = static_cast<O *>(memory::Alloc(sizeof(O) * m * n));
+ I *a = inputA->data<I>();
+ I *b = inputB->data<I>();
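// Naive O(m*n*k) reference multiply; the op's output is then checked element-for-element.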
+ for (int32_t i = 0; i < m; ++i) {
+ for (int32_t j = 0; j < n; ++j) {
+ O r = 0;
+ for (int32_t p = 0; p < k; p++) {
+ r += static_cast<O>(a(i, p)) * static_cast<O>(b(p, j));
+ }
+ c(i, j) = r;
+ }
- DLOGF("\n");
}
- auto dim_output0 = output[0]->numel() / output[0]->dims()[0];
- DLOG << " output : ";
- for (int i = 0; i < output[0]->dims()[0]; ++i) {
- for (int j = 0; j < dim_output0; ++j) {
- DLOGF("%f ", output0_data[i * dim_2 + j]);
+ int32_t eq = 0;
+ int32_t neq = 0;
+ for (int32_t i = 0; i < m * n; ++i) {
+ PADDLE_MOBILE_ENFORCE(
+ output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
+ static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
+ if (static_cast<int>(output_data[i] == c[i])) {
+ ++eq;
+ } else {
+ ++neq;
+ }
- DLOGF("\n");
}
+ DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq
+ << " neq=" << neq;
+ delete op;
+ return 0;
+ }
+ } // namespace paddle_mobile
- /// output (3,3)
- DLOG << "output memory size : " << output[0]->memory_size();
- DLOG << "output numel : " << output[0]->numel();
- DLOG << input1_data[0] << " x " << input2_data[0] << " + " << input1_data[1]
- << " x " << input2_data[0 + 3] << " = " << output0_data[0];
+ int main() {
+ paddle_mobile::TestMulOP<int8_t, int32_t>();
+ paddle_mobile::TestMulOP<float, float>();
+ return 0;
}
......@@ -189,6 +189,8 @@ if(NOT FOUND_MATCH)
set(CONV_OP ON)
set(DEPTHWISECONV_OP ON)
set(ELEMENTWISEADD_OP ON)
set(ELEMENTWISESUB_OP ON)
set(IM2SEQUENCE_OP ON)
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADDPRELU_OP ON)
set(FUSION_CONVADDRELU_OP ON)
......@@ -266,6 +268,9 @@ endif()
if (ELEMENTWISEADD_OP)
add_definitions(-DELEMENTWISEADD_OP)
endif()
if (ELEMENTWISESUB_OP)
add_definitions(-DELEMENTWISESUB_OP)
endif()
if (FUSION_CONVADD_OP)
add_definitions(-DFUSION_CONVADD_OP)
endif()
......