test=develop add matmul_op

c0d3605f · jiweibo · ca334444 · c0d3605f · c0d3605f · c0d3605f
11 changed file
--- a/lite/api/paddle_use_kernels.h
+++ b/lite/api/paddle_use_kernels.h
@@ -31,6 +31,7 @@ USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def);
 #ifdef LITE_WITH_ARM
 USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(lrn, kARM, kFloat, kNCHW, def);

--- a/lite/api/paddle_use_ops.h
+++ b/lite/api/paddle_use_ops.h
@@ -19,6 +19,7 @@
 #include "paddle_lite_factory_helper.h"  // NOLINT
 USE_LITE_OP(mul);
+USE_LITE_OP(matmul);
 USE_LITE_OP(fc);
 USE_LITE_OP(relu);
 USE_LITE_OP(scale);

--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -7,6 +7,7 @@ message(STATUS "compile with lite ARM kernels")
 lite_cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
 lite_cc_library(activation_compute_arm SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm)
 lite_cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(matmul_compute_arm SRCS matmul_compute.cc DEPS ${lite_kernel_deps} math_arm)
 lite_cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
 lite_cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
 lite_cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -84,6 +85,7 @@ set(arm_kernels
    fc_compute_arm
    activation_compute_arm
    mul_compute_arm
+    matmul_compute_arm
    scale_compute_arm
    softmax_compute_arm
    conv_compute_arm

--- a/lite/kernels/arm/matmul_compute.cc
+++ b/lite/kernels/arm/matmul_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/kernels/arm/matmul_compute.h"
+#include <vector>
+#include "lite/arm/math/funcs.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+// Writes the transpose of the row-major m x n matrix `src` into `dst`, which
+// is then laid out as a row-major n x m matrix. Uses plain scalar loops; an
+// in-place call (src == dst) would overwrite elements before they are read.
+static void NaiveTranspose(int m, int n, const float* src, float* dst) {
+  for (int row = 0; row < m; ++row) {
+    // Source row `row` becomes destination column `row`.
+    const float* src_row = src + row * n;
+    float* dst_col = dst + row;
+    for (int col = 0; col < n; ++col) {
+      dst_col[col * m] = src_row[col];
+    }
+  }
+}
+// Pre-run hook. Nothing is cached here; all shape handling happens in Run().
+void MatMulCompute::PrepareForRun() {
+  // The returned reference is not used.
+  // NOTE(review): As<ARMContext>() may do more than a lookup (e.g. set up the
+  // context) -- confirm before removing this call.
+  this->ctx_->template As<ARMContext>();
+}
+// Computes Out = alpha * op(X) * op(Y) in float, where op() optionally swaps
+// the last two dimensions (param.transpose_X / param.transpose_Y).
+//
+// Supported rank combinations:
+//   * rank(X) > 2,  rank(Y) >= 2: one GEMM/GEMV per slice of X's leading
+//     dimensions; Y is either sliced the same way (rank > 2) or shared by
+//     every slice (rank == 2).
+//   * rank(X) == 2, rank(Y) == 2: a single GEMM/GEMV.
+//   * rank(X) > 2,  rank(Y) == 1: dot product of every row of X with Y.
+//   * rank(X) == 1, rank(Y) == 1: dot product (no transposes) or outer
+//     product (both transposes).
+// Everything else is reported through LOG(FATAL).
+//
+// A transposed X is first materialised with NaiveTranspose into a scratch
+// buffer owned by this call, because the sgemv/prepackA calls below are made
+// with their transpose flag fixed to false.
+void MatMulCompute::Run() {
+  auto& param = Param<param_t>();
+  const auto* x_data = param.X->data<float>();
+  const auto* y_data = param.Y->data<float>();
+  auto* o_data = param.Out->mutable_data<float>();
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  auto o_dims = param.Out->dims();
+  bool x_transpose = param.transpose_X;
+  bool y_transpose = param.transpose_Y;
+  float alpha = param.alpha;
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  if (x_dims.size() > 2 && y_dims.size() >= 2) {
+    // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N]
+    // x: [B, M, K], y: [K, N], out: [B, M, N]
+    if (!x_transpose && !y_transpose) {
+      CHECK_EQ(x_dims[x_dims.size() - 1], y_dims[y_dims.size() - 2])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    } else if (!x_transpose && y_transpose) {
+      CHECK_EQ(x_dims[x_dims.size() - 1], y_dims[y_dims.size() - 1])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    } else if (x_transpose && !y_transpose) {
+      CHECK_EQ(x_dims[x_dims.size() - 2], y_dims[y_dims.size() - 2])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    } else {
+      CHECK_EQ(x_dims[x_dims.size() - 2], y_dims[y_dims.size() - 1])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    }
+    if (!x_transpose) {
+      m_ = x_dims[x_dims.size() - 2];
+      k_ = x_dims[x_dims.size() - 1];
+    } else {
+      m_ = x_dims[x_dims.size() - 1];
+      k_ = x_dims[x_dims.size() - 2];
+    }
+    if (!y_transpose) {
+      n_ = y_dims[y_dims.size() - 1];
+    } else {
+      n_ = y_dims[y_dims.size() - 2];
+    }
+    // Reserve room for the packed copy of one X slice, with M rounded up to
+    // a multiple of the block height reported by get_hblock().
+    int hblock = lite::arm::math::get_hblock(ctx.arch());
+    int m_round = 0;
+    m_round = hblock * ((m_ + hblock - 1) / hblock);
+    ctx.ExtendWorkspace(m_round * k_ * sizeof(float));
+    // Element counts of one trailing 2-D slice of each tensor.
+    int x_inner = x_dims[x_dims.size() - 2] * x_dims[x_dims.size() - 1];
+    int y_inner = y_dims[y_dims.size() - 2] * y_dims[y_dims.size() - 1];
+    int out_inner = o_dims[o_dims.size() - 2] * o_dims[o_dims.size() - 1];
+    // Scratch for the transposed X slice; released automatically on return.
+    std::vector<float> x_trans_buf;
+    if (x_transpose) {
+      x_trans_buf.resize(x_inner);
+    }
+    float* x_data_trans = x_trans_buf.data();
+    // Y advances per slice only when it carries its own leading dimensions;
+    // a rank-2 Y is reused for every slice of X.
+    // NOTE(review): Y's leading dimensions are not checked against X's.
+    const int y_stride = y_dims.size() > 2 ? y_inner : 0;
+    const auto batch = x_dims.count(0, x_dims.size() - 2);
+    if (n_ == 1) {
+      // Matrix * vector per slice.
+      for (size_t i = 0; i < batch; ++i) {
+        const float* x_slice = x_data + i * x_inner;
+        if (x_transpose) {
+          NaiveTranspose(x_dims[x_dims.size() - 2],
+                         x_dims[x_dims.size() - 1],
+                         x_slice,
+                         x_data_trans);
+          x_slice = x_data_trans;
+        }
+        lite::arm::math::sgemv(x_slice,
+                               y_data + i * y_stride,
+                               o_data + i * out_inner,
+                               false,
+                               m_,
+                               k_,
+                               false,
+                               nullptr,
+                               false);
+      }
+      // The sgemv call above takes no alpha, so scale the result afterwards.
+      if (fabsf(param.alpha - 1.f) > 1e-8f) {
+        for (size_t i = 0; i < param.Out->dims().production(); ++i) {
+          o_data[i] *= param.alpha;
+        }
+      }
+    } else {
+      // Packed X lives in the workspace, llc_size() bytes past its start.
+      float* packed_x = static_cast<float*>(ctx.workspace_data<float>()) +
+                        ctx.llc_size() / sizeof(float);
+      const int ldb = y_transpose ? k_ : n_;
+      for (size_t i = 0; i < batch; ++i) {
+        const float* x_slice = x_data + i * x_inner;
+        if (x_transpose) {
+          NaiveTranspose(x_dims[x_dims.size() - 2],
+                         x_dims[x_dims.size() - 1],
+                         x_slice,
+                         x_data_trans);
+          x_slice = x_data_trans;
+        }
+        // alpha is handed to prepackA, so no post-scaling is done here.
+        lite::arm::math::prepackA(
+            packed_x, x_slice, alpha, k_, 0, m_, 0, k_, false, &ctx);
+        lite::arm::math::sgemm_prepack(y_transpose,
+                                       m_,
+                                       n_,
+                                       k_,
+                                       packed_x,
+                                       y_data + i * y_stride,
+                                       ldb,
+                                       0.f,
+                                       o_data + i * out_inner,
+                                       n_,
+                                       nullptr,
+                                       false,
+                                       false,
+                                       &ctx);
+      }
+    }
+  } else if (x_dims.size() == 2 && y_dims.size() == 2) {
+    // x: [M, K], y: [K, N], out: [M, N]
+    if (!x_transpose && !y_transpose) {
+      CHECK_EQ(x_dims[1], y_dims[0])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    } else if (!x_transpose && y_transpose) {
+      CHECK_EQ(x_dims[1], y_dims[1])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    } else if (x_transpose && !y_transpose) {
+      CHECK_EQ(x_dims[0], y_dims[0])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    } else {
+      CHECK_EQ(x_dims[0], y_dims[1])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    }
+    if (!x_transpose) {
+      m_ = x_dims[0];
+      k_ = x_dims[1];
+    } else {
+      m_ = x_dims[1];
+      k_ = x_dims[0];
+    }
+    if (!y_transpose) {
+      n_ = y_dims[1];
+    } else {
+      n_ = y_dims[0];
+    }
+    int hblock = lite::arm::math::get_hblock(ctx.arch());
+    int m_round = 0;
+    m_round = hblock * ((m_ + hblock - 1) / hblock);
+    ctx.ExtendWorkspace(m_round * k_ * sizeof(float));
+    // sgemv/prepackA are called without transposition, so a transposed X is
+    // materialised first. The buffer is a std::vector so it is released when
+    // this branch ends (the previous malloc'ed buffer was never freed).
+    std::vector<float> x_trans_buf;
+    const float* x_mat = x_data;
+    if (x_transpose) {
+      x_trans_buf.resize(x_dims[0] * x_dims[1]);
+      NaiveTranspose(x_dims[0], x_dims[1], x_data, x_trans_buf.data());
+      x_mat = x_trans_buf.data();
+    }
+    if (n_ == 1) {
+      lite::arm::math::sgemv(
+          x_mat, y_data, o_data, false, m_, k_, false, nullptr, false);
+      // The sgemv call above takes no alpha, so scale the result afterwards.
+      if (fabsf(param.alpha - 1.f) > 1e-8f) {
+        for (size_t i = 0; i < param.Out->dims().production(); ++i) {
+          o_data[i] *= param.alpha;
+        }
+      }
+    } else {
+      float* packed_x = static_cast<float*>(ctx.workspace_data<float>()) +
+                        ctx.llc_size() / sizeof(float);
+      lite::arm::math::prepackA(
+          packed_x, x_mat, alpha, k_, 0, m_, 0, k_, false, &ctx);
+      int ldb = n_;
+      if (y_transpose) {
+        ldb = k_;
+      }
+      lite::arm::math::sgemm_prepack(y_transpose,
+                                     m_,
+                                     n_,
+                                     k_,
+                                     packed_x,
+                                     y_data,
+                                     ldb,
+                                     0.f,
+                                     o_data,
+                                     n_,
+                                     nullptr,
+                                     false,
+                                     false,
+                                     &ctx);
+    }
+  } else if (x_dims.size() > 2 && y_dims.size() == 1) {
+    // x: [B, M, K], y: [K], out: [B, M]
+    // NOTE(review): the transpose flags are not consulted in this branch.
+    CHECK_EQ(x_dims[x_dims.size() - 1], y_dims[0])
+        << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+        << ")";
+    for (size_t i = 0; i < x_dims.count(0, x_dims.size() - 1); ++i) {
+      o_data[i] = 0;
+      for (size_t j = 0; j < y_dims[0]; ++j) {
+        o_data[i] += x_data[i * y_dims[0] + j] * y_data[j] * alpha;
+      }
+    }
+  } else if (x_dims.size() == 1 && y_dims.size() == 1) {
+    if (x_dims[0] == y_dims[0] && x_transpose == false &&
+        y_transpose == false) {
+      // x: [K], y: [K], out: [1]
+      o_data[0] = 0.;
+      for (size_t i = 0; i < x_dims[0]; ++i) {
+        o_data[0] += x_data[i] * y_data[i] * alpha;
+      }
+    } else if (x_transpose == true && y_transpose == true) {
+      // x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N]
+      m_ = x_dims[0];
+      k_ = 1;
+      n_ = y_dims[0];
+      if (n_ == 1) {
+        lite::arm::math::sgemv(
+            x_data, y_data, o_data, false, m_, k_, false, nullptr, false);
+        if (fabsf(alpha - 1.f) > 1e-8f) {
+          for (size_t i = 0; i < param.Out->dims().production(); ++i) {
+            o_data[i] *= alpha;
+          }
+        }
+      } else {
+        // Size the workspace for the packed X exactly as the other GEMM
+        // branches do before writing through packed_x.
+        int hblock = lite::arm::math::get_hblock(ctx.arch());
+        int m_round = hblock * ((m_ + hblock - 1) / hblock);
+        ctx.ExtendWorkspace(m_round * k_ * sizeof(float));
+        float* packed_x = static_cast<float*>(ctx.workspace_data<float>()) +
+                          ctx.llc_size() / sizeof(float);
+        lite::arm::math::prepackA(
+            packed_x, x_data, alpha, k_, 0, m_, 0, k_, false, &ctx);
+        int ldb = n_;
+        lite::arm::math::sgemm_prepack(false,
+                                       m_,
+                                       n_,
+                                       k_,
+                                       packed_x,
+                                       y_data,
+                                       ldb,
+                                       0.f,
+                                       o_data,
+                                       n_,
+                                       nullptr,
+                                       false,
+                                       false,
+                                       &ctx);
+      }
+    } else {
+      // Previously this combination fell through and left Out unwritten.
+      LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims("
+                 << y_dims << "), x_transpose is " << x_transpose
+                 << ", y_transpose is " << y_transpose;
+    }
+  } else {
+    LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+               << ")";
+  }
+}
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+// Registers MatMulCompute under the op name `matmul` for kARM / kFloat /
+// kNCHW with alias `def`; inputs X and Y and output Out are bound as ARM
+// tensors.
+REGISTER_LITE_KERNEL(
+    matmul, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::MatMulCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
--- a/lite/kernels/arm/matmul_compute.h
+++ b/lite/kernels/arm/matmul_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/types.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+// ARM float kernel for the `matmul` operator. The computation itself is
+// implemented in Run(); parameters are read from operators::MatMulParam.
+class MatMulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MatMulParam;
+  // Hook invoked before running the kernel; see the definition for details.
+  void PrepareForRun() override;
+  // Multiplies X by Y according to the attached parameters and writes Out.
+  void Run() override;
+  virtual ~MatMulCompute() = default;
+ private:
+  // Effective GEMM sizes (out is m_ x n_, contraction length k_). Run()
+  // assigns them before use; they start at zero so they never hold
+  // indeterminate values.
+  int m_{0};
+  int n_{0};
+  int k_{0};
+};
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -5,6 +5,7 @@ lite_cc_library(pool_op SRCS pool_op.cc DEPS ${op_DEPS})
 lite_cc_library(fc_op SRCS fc_op.cc DEPS ${op_DEPS})
 lite_cc_library(relu_op SRCS relu_op.cc DEPS ${op_DEPS})
 lite_cc_library(mul_op SRCS mul_op.cc DEPS ${op_DEPS})
+lite_cc_library(matmul_op SRCS matmul_op.cc DEPS ${op_DEPS})
 lite_cc_library(scale_op SRCS scale_op.cc DEPS ${op_DEPS})
 lite_cc_library(softmax_op SRCS softmax_op.cc DEPS ${op_DEPS})
 lite_cc_library(reshape_op SRCS reshape_op.cc DEPS ${op_DEPS} )
@@ -89,6 +90,7 @@ set(ops
        fc_op
        relu_op
        mul_op
+        matmul_op
        scale_op
        softmax_op
        reshape_op

--- a/lite/operators/matmul_op.cc
+++ b/lite/operators/matmul_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/operators/matmul_op.h"
+#include "lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+// Verifies that the X, Y and Out tensors have been attached to the op.
+// Dimension compatibility is not checked here; InferShape() does that.
+// NOTE(review): CHECK_OR_FALSE presumably returns false when its argument is
+// null/false -- confirm against the macro definition.
+bool MatMulOpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Y);
+  CHECK_OR_FALSE(param_.Out);
+  return true;
+}
+// Checks that the contraction dimensions of X and Y agree for the supported
+// rank combinations and resizes Out to the resulting shape.
+//
+//   rank(X) > 2,  rank(Y) >= 2 : Out = X's leading dims + [M, N]
+//   rank(X) == 2, rank(Y) == 2 : Out = [M, N]
+//   rank(X) > 2,  rank(Y) == 1 : Out = X's dims without the last one
+//   rank(X) == 1, rank(Y) == 1 : Out = [1] (no transposes, equal lengths)
+//                                or [M, N] (both transposes)
+//
+// M and N are taken from the last two dims of X and Y after applying
+// transpose_X / transpose_Y. Unsupported inputs are reported through
+// CHECK_EQ / LOG(FATAL). Always returns true otherwise.
+bool MatMulOpLite::InferShape() const {
+  const auto x_dims = param_.X->dims();
+  const auto y_dims = param_.Y->dims();
+  bool x_transpose = param_.transpose_X;
+  bool y_transpose = param_.transpose_Y;
+  std::vector<int64_t> dim_out_vec;
+  if (x_dims.size() > 2 && y_dims.size() >= 2) {
+    // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N]
+    // x: [B, M, K], y: [K, N], out: [B, M, N]
+    if (!x_transpose && !y_transpose) {
+      CHECK_EQ(x_dims[x_dims.size() - 1], y_dims[y_dims.size() - 2])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << ")";
+    } else if (!x_transpose && y_transpose) {
+      CHECK_EQ(x_dims[x_dims.size() - 1], y_dims[y_dims.size() - 1])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << ")";
+    } else if (x_transpose && !y_transpose) {
+      CHECK_EQ(x_dims[x_dims.size() - 2], y_dims[y_dims.size() - 2])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << ")";
+    } else {
+      CHECK_EQ(x_dims[x_dims.size() - 2], y_dims[y_dims.size() - 1])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << ")";
+    }
+    // Leading dimensions are copied from X unchanged.
+    dim_out_vec.resize(x_dims.size());
+    for (size_t i = 0; i < x_dims.size() - 2; ++i) {
+      dim_out_vec[i] = x_dims[i];
+    }
+    // M comes from X and N from Y, each picked according to its transpose
+    // flag.
+    dim_out_vec[x_dims.size() - 2] = x_transpose ? x_dims[x_dims.size() - 1]
+                                                 : x_dims[x_dims.size() - 2];
+    dim_out_vec[x_dims.size() - 1] = y_transpose ? y_dims[y_dims.size() - 2]
+                                                 : y_dims[y_dims.size() - 1];
+  } else if (x_dims.size() == 2 && y_dims.size() == 2) {
+    // x: [M, K], y: [K, N], out: [M, N]
+    if (!x_transpose && !y_transpose) {
+      CHECK_EQ(x_dims[1], y_dims[0])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    } else if (!x_transpose && y_transpose) {
+      CHECK_EQ(x_dims[1], y_dims[1])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    } else if (x_transpose && !y_transpose) {
+      CHECK_EQ(x_dims[0], y_dims[0])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    } else {
+      CHECK_EQ(x_dims[0], y_dims[1])
+          << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+          << "), x_transpose is " << x_transpose << ", y_transpose is "
+          << y_transpose;
+    }
+    dim_out_vec.resize(x_dims.size());
+    dim_out_vec[0] = x_transpose ? x_dims[1] : x_dims[0];
+    dim_out_vec[1] = y_transpose ? y_dims[0] : y_dims[1];
+  } else if (x_dims.size() > 2 && y_dims.size() == 1) {
+    // x: [B, M, K], y: [K], out: [B, M]
+    CHECK_EQ(x_dims[x_dims.size() - 1], y_dims[0])
+        << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+        << ")";
+    dim_out_vec.resize(x_dims.size() - 1);
+    for (size_t i = 0; i < dim_out_vec.size(); ++i) {
+      dim_out_vec[i] = x_dims[i];
+    }
+  } else if (x_dims.size() == 1 && y_dims.size() == 1) {
+    if (x_dims[0] == y_dims[0] && x_transpose == false &&
+        y_transpose == false) {
+      // x: [K], y: [K], out: [1]
+      dim_out_vec.resize(1);
+      dim_out_vec[0] = 1;
+    } else if (x_transpose == true && y_transpose == true) {
+      // x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N]
+      dim_out_vec.resize(2);
+      dim_out_vec[0] = x_dims[0];
+      dim_out_vec[1] = y_dims[0];
+    } else {
+      // Previously this fell through and resized Out to an empty shape.
+      LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims("
+                 << y_dims << "), x_transpose is " << x_transpose
+                 << ", y_transpose is " << y_transpose;
+    }
+  } else {
+    LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims
+               << ")";
+  }
+  DDim dim_out(dim_out_vec);
+  param_.Out->Resize(dim_out);
+  return true;
+}
+// Binds the op description to this operator: looks up the X, Y and Out
+// tensors in `scope` by the first argument name of each slot and copies the
+// transpose_X, transpose_Y and alpha attributes into param_.
+// Always returns true; a missing slot trips one of the CHECKs.
+bool MatMulOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+  // Every slot must carry at least one variable name.
+  CHECK(!op_desc.Input("X").empty());
+  CHECK(!op_desc.Input("Y").empty());
+  CHECK(!op_desc.Output("Out").empty());
+
+  // Only the first name of each slot is used.
+  auto x_var_name = op_desc.Input("X").front();
+  auto y_var_name = op_desc.Input("Y").front();
+  auto out_var_name = op_desc.Output("Out").front();
+
+  param_.X = GetVar<lite::Tensor>(scope, x_var_name);
+  param_.Y = GetVar<lite::Tensor>(scope, y_var_name);
+  param_.Out = GetMutableVar<lite::Tensor>(scope, out_var_name);
+
+  param_.transpose_X = op_desc.GetAttr<bool>("transpose_X");
+  param_.transpose_Y = op_desc.GetAttr<bool>("transpose_Y");
+  param_.alpha = op_desc.GetAttr<float>("alpha");
+  return true;
+}
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+// Registers MatMulOpLite under the op type name `matmul`.
+REGISTER_LITE_OP(matmul, paddle::lite::operators::MatMulOpLite);
--- a/lite/operators/matmul_op.h
+++ b/lite/operators/matmul_op.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/all.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+// Operator definition for `matmul`: holds a MatMulParam, fills it from the
+// op description, infers the output shape and hands the parameters to the
+// attached kernel.
+class MatMulOpLite : public OpLite {
+ public:
+  MatMulOpLite() {}
+  explicit MatMulOpLite(const std::string &type) : OpLite(type) {}
+  // Checks that the input/output tensors are present.
+  bool CheckShape() const override;
+  // Derives the shape of Out from X, Y and the transpose flags.
+  bool InferShape() const override;
+  // Passes the current parameters to `kernel`.
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  // Resolves tensors and attributes from `op_desc` within `scope`.
+  bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
+  std::string DebugString() const override { return "matmul"; }
+ private:
+  // mutable: param_ is accessed from the const member functions above.
+  mutable MatMulParam param_;
+};
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -694,6 +694,16 @@ struct SliceParam {
  std::vector<int> ends{};
  std::vector<int> decrease_axis{};
 };
+/// ----------------------- matmul operators ----------------------
+// Parameters of the matmul operator.
+struct MatMulParam {
+  // Left operand (read-only).
+  const lite::Tensor* X{};
+  // Right operand (read-only).
+  const lite::Tensor* Y{};
+  // Result tensor.
+  lite::Tensor* Out{};
+  // Whether the last two dimensions of X are swapped before multiplying.
+  bool transpose_X{false};
+  // Whether the last two dimensions of Y are swapped before multiplying.
+  bool transpose_Y{false};
+  // Scalar factor applied to the product.
+  float alpha{1.0f};
+};
 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -34,4 +34,5 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
    lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
--- a/lite/tests/kernels/matmul_compute_test.cc
+++ b/lite/tests/kernels/matmul_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+namespace paddle {
+namespace lite {
+// Naive reference GEMM for the baseline: out = alpha * (x * y).
+//   x   : row-major [m_, k_]
+//   y   : row-major [k_, n_]
+//   out : row-major [m_, n_]; every element is overwritten.
+void matrix_mul(int m_,
+                int k_,
+                int n_,
+                float alpha,
+                const float* x,
+                const float* y,
+                float* out) {
+  for (int row = 0; row < m_; ++row) {
+    const float* x_row = x + row * k_;
+    float* out_row = out + row * n_;
+    for (int col = 0; col < n_; ++col) {
+      out_row[col] = 0;
+      // alpha is folded into every term while the sum is accumulated.
+      for (int inner = 0; inner < k_; ++inner) {
+        out_row[col] += x_row[inner] * y[inner * n_ + col] * alpha;
+      }
+    }
+  }
+}
+// Writes the transpose of the row-major [m, n] matrix `src` into `dst`,
+// which is laid out as row-major [n, m].
+void transpose(int m, int n, const float* src, float* dst) {
+  for (int r = 0; r < m; ++r) {
+    const float* src_row = src + r * n;
+    for (int c = 0; c < n; ++c) {
+      dst[c * m + r] = src_row[c];
+    }
+  }
+}
+// Reference 2-D matmul with optional operand transposition:
+//   out = alpha * op(x) * op(y), where op(a) is a^T when the matching
+//   transpose flag is set and a otherwise.
+// x_dims_ / y_dims_ are the stored (pre-transpose) 2-D shapes of x_data and
+// y_data; out_data must have room for the [M, N] result. Aborts through
+// CHECK_EQ when the inner dimensions of op(x) and op(y) disagree.
+// Transposed operands are materialised into std::vector scratch buffers, so
+// nothing needs to be freed by hand.
+void mul_low_efficiency(DDim x_dims_,
+                        DDim y_dims_,
+                        bool x_transpose_,
+                        bool y_transpose_,
+                        float alpha_,
+                        const float* x_data,
+                        const float* y_data,
+                        float* out_data) {
+  // Logical shapes after applying the transpose flags:
+  // op(x) is [m, k] and op(y) is [k_of_y, n].
+  const int64_t m = x_transpose_ ? x_dims_[1] : x_dims_[0];
+  const int64_t k = x_transpose_ ? x_dims_[0] : x_dims_[1];
+  const int64_t k_of_y = y_transpose_ ? y_dims_[1] : y_dims_[0];
+  const int64_t n = y_transpose_ ? y_dims_[0] : y_dims_[1];
+  CHECK_EQ(k, k_of_y)
+      << "not supported x_dims(" << x_dims_ << ") and y_dims(" << y_dims_
+      << "), x_transpose is " << x_transpose_ << ", y_transpose is "
+      << y_transpose_;
+  std::vector<float> x_trans;
+  std::vector<float> y_trans;
+  const float* lhs = x_data;
+  const float* rhs = y_data;
+  if (x_transpose_) {
+    x_trans.resize(static_cast<size_t>(x_dims_[0] * x_dims_[1]));
+    transpose(x_dims_[0], x_dims_[1], x_data, x_trans.data());
+    lhs = x_trans.data();
+  }
+  if (y_transpose_) {
+    y_trans.resize(static_cast<size_t>(y_dims_[0] * y_dims_[1]));
+    transpose(y_dims_[0], y_dims_[1], y_data, y_trans.data());
+    rhs = y_trans.data();
+  }
+  matrix_mul(m, k, n, alpha_, lhs, rhs, out_data);
+}
+// Arena test case for the `matmul` op. It feeds two all-ones float tensors
+// of the requested shapes and computes the expected output with the naive
+// helpers above. Supported rank combinations for the baseline:
+//   x rank > 2,  y rank >= 2 : batched [.., M, K] x [.., K, N] (a rank-2 y is
+//                              shared by every batch)
+//   x rank == 2, y rank == 2 : plain matrix product
+//   x rank > 2,  y rank == 1 : [.., M, K] x [K] -> [.., M]
+//   x rank == 1, y rank == 1 : dot product (no transpose) or outer product
+//                              (both transposed)
+// Anything else aborts with LOG(FATAL).
+class MatMulComputeTester : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string x_ = "X";
+  std::string y_ = "Y";
+  std::string out_ = "Out";
+  DDim x_dims_;
+  DDim y_dims_;
+  bool x_transpose_;
+  bool y_transpose_;
+  float alpha_;
+ public:
+  // The initializer list follows the member declaration order, which is the
+  // order the members are actually constructed in.
+  MatMulComputeTester(const Place& place,
+                      const std::string& alias,
+                      bool x_transpose,
+                      bool y_transpose,
+                      float alpha,
+                      const DDim& x_dims,
+                      const DDim& y_dims)
+      : TestCase(place, alias),
+        x_dims_(x_dims),
+        y_dims_(y_dims),
+        x_transpose_(x_transpose),
+        y_transpose_(y_transpose),
+        alpha_(alpha) {}
+  // Computes the reference result into a new tensor named out_ in `scope`.
+  void RunBaseline(Scope* scope) override {
+    auto* x = scope->FindTensor(x_);
+    auto* y = scope->FindTensor(y_);
+    CHECK(x);
+    CHECK(y);
+    const auto* x_data = x->data<float>();
+    const auto* y_data = y->data<float>();
+    auto* out = scope->NewTensor(out_);
+    CHECK(out);
+    const size_t x_rank = x_dims_.size();
+    const size_t y_rank = y_dims_.size();
+    std::vector<int64_t> dim_out_vec;
+    if (x_rank > 2 && y_rank >= 2) {
+      // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N]
+      // x: [B, M, K], y: [K, N], out: [B, M, N]
+      dim_out_vec.resize(x_rank);
+      for (size_t i = 0; i < x_rank - 2; ++i) {
+        dim_out_vec[i] = x_dims_[i];
+      }
+      dim_out_vec[x_rank - 2] =
+          x_transpose_ ? x_dims_[x_rank - 1] : x_dims_[x_rank - 2];
+      dim_out_vec[x_rank - 1] =
+          y_transpose_ ? y_dims_[y_rank - 2] : y_dims_[y_rank - 1];
+      out->Resize(dim_out_vec);
+      auto* out_data = out->mutable_data<float>();
+      // Per-batch element counts, kept in 64 bits to avoid narrowing.
+      const int64_t x_inner = x_dims_[x_rank - 2] * x_dims_[x_rank - 1];
+      const int64_t y_inner = y_dims_[y_rank - 2] * y_dims_[y_rank - 1];
+      const int64_t o_inner =
+          dim_out_vec[x_rank - 2] * dim_out_vec[x_rank - 1];
+      // A rank-2 y is reused for every batch, hence a stride of zero.
+      const int64_t y_stride = y_rank > 2 ? y_inner : 0;
+      const int64_t batch = x_dims_.count(0, x_rank - 2);
+      for (int64_t i = 0; i < batch; ++i) {
+        mul_low_efficiency(
+            DDim({x_dims_[x_rank - 2], x_dims_[x_rank - 1]}),
+            DDim({y_dims_[y_rank - 2], y_dims_[y_rank - 1]}),
+            x_transpose_,
+            y_transpose_,
+            alpha_,
+            x_data + i * x_inner,
+            y_data + i * y_stride,
+            out_data + i * o_inner);
+      }
+    } else if (x_rank == 2 && y_rank == 2) {
+      // x: [M, K], y: [K, N], out: [M, N]
+      dim_out_vec.resize(2);
+      dim_out_vec[0] = x_transpose_ ? x_dims_[1] : x_dims_[0];
+      dim_out_vec[1] = y_transpose_ ? y_dims_[0] : y_dims_[1];
+      out->Resize(dim_out_vec);
+      auto* out_data = out->mutable_data<float>();
+      mul_low_efficiency(x_dims_,
+                         y_dims_,
+                         x_transpose_,
+                         y_transpose_,
+                         alpha_,
+                         x_data,
+                         y_data,
+                         out_data);
+    } else if (x_rank > 2 && y_rank == 1) {
+      // x: [B, M, K], y: [K], out: [B, M]
+      CHECK_EQ(x_dims_[x_rank - 1], y_dims_[0])
+          << "not supported x_dims(" << x_dims_ << ") and y_dims(" << y_dims_
+          << ")";
+      dim_out_vec.resize(x_rank - 1);
+      for (size_t i = 0; i < dim_out_vec.size(); ++i) {
+        dim_out_vec[i] = x_dims_[i];
+      }
+      out->Resize(dim_out_vec);
+      auto* out_data = out->mutable_data<float>();
+      const int64_t rows = x_dims_.count(0, x_rank - 1);
+      const int64_t k = y_dims_[0];
+      for (int64_t i = 0; i < rows; ++i) {
+        out_data[i] = 0;
+        for (int64_t j = 0; j < k; ++j) {
+          out_data[i] += x_data[i * k + j] * y_data[j] * alpha_;
+        }
+      }
+    } else if (x_rank == 1 && y_rank == 1) {
+      if (x_dims_[0] == y_dims_[0] && !x_transpose_ && !y_transpose_) {
+        // x: [K], y: [K], out: [1]
+        dim_out_vec.resize(1);
+        dim_out_vec[0] = 1;
+        out->Resize(dim_out_vec);
+        auto* out_data = out->mutable_data<float>();
+        out_data[0] = 0.f;
+        const int64_t k = x_dims_[0];
+        for (int64_t i = 0; i < k; ++i) {
+          out_data[0] += x_data[i] * y_data[i] * alpha_;
+        }
+      } else if (x_transpose_ && y_transpose_) {
+        // x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N]
+        dim_out_vec.resize(2);
+        dim_out_vec[0] = x_dims_[0];
+        dim_out_vec[1] = y_dims_[0];
+        out->Resize(dim_out_vec);
+        auto* out_data = out->mutable_data<float>();
+        mul_low_efficiency(DDim({x_dims_[0], 1}),
+                           DDim({1, y_dims_[0]}),
+                           false,
+                           false,
+                           alpha_,
+                           x_data,
+                           y_data,
+                           out_data);
+      } else {
+        // Fail loudly instead of leaving Out unsized for combinations the
+        // baseline does not implement.
+        LOG(FATAL) << "not supported x_dims(" << x_dims_ << ") and y_dims("
+                   << y_dims_ << "), x_transpose is " << x_transpose_
+                   << ", y_transpose is " << y_transpose_;
+      }
+    } else {
+      LOG(FATAL) << "not supported x_dims(" << x_dims_ << ") and y_dims("
+                 << y_dims_ << ")";
+    }
+  }
+  // Describes the op under test: inputs X/Y, output Out and the three attrs.
+  void PrepareOpDesc(cpp::OpDesc* op_desc) override {
+    op_desc->SetType("matmul");
+    op_desc->SetInput("X", {x_});
+    op_desc->SetInput("Y", {y_});
+    op_desc->SetOutput("Out", {out_});
+    op_desc->SetAttr("transpose_X", x_transpose_);
+    op_desc->SetAttr("transpose_Y", y_transpose_);
+    op_desc->SetAttr("alpha", alpha_);
+  }
+  // Registers both inputs filled with ones.
+  // NOTE(review): all-ones inputs cannot reveal transposition or indexing
+  // mistakes in the kernel; index-dependent values would make a stronger
+  // test, but may need the tolerances below to be revisited.
+  void PrepareData() override {
+    std::vector<float> x_data(x_dims_.production(), 1.f);
+    std::vector<float> y_data(y_dims_.production(), 1.f);
+    SetCommonTensor(x_, x_dims_, x_data.data());
+    SetCommonTensor(y_, y_dims_, y_data.data());
+  }
+};
+// Sweeps plain (no transpose) [m, k] x [k, n] products over several sizes and
+// two alpha values, checking each against the baseline with tolerance 5e-4.
+void test_matmul2x2_no_transform(Place place) {
+  const bool x_transform = false;
+  const bool y_transform = false;
+  for (int m : {1, 2, 4, 8}) {
+    for (int k : {1, 3, 5}) {
+      for (int n : {1, 2, 4, 6}) {
+        for (float alpha : {1., 2.}) {
+          LOG(INFO) << "m: " << m << " k: " << k << " n: " << n
+                    << " alpha: " << alpha;
+          DDim x_shape({m, k});
+          DDim y_shape({k, n});
+          std::unique_ptr<arena::TestCase> test_case(
+              new MatMulComputeTester(place,
+                                      "def",
+                                      x_transform,
+                                      y_transform,
+                                      alpha,
+                                      x_shape,
+                                      y_shape));
+          arena::Arena runner(std::move(test_case), place, 5e-4);
+          runner.TestPrecision();
+        }
+      }
+    }
+  }
+}
+// 2-D x 2-D with only X transposed; case i pairs x_dims[i], y_dims[i] and
+// alphas[i]. Tolerance 2e-5.
+void test_matmul2x2_x_transpose(Place place) {
+  std::vector<DDim> x_dims({DDim({3, 4}), DDim({2, 5})});
+  std::vector<DDim> y_dims({DDim({3, 2}), DDim({2, 1})});
+  std::vector<float> alphas({1.f, 2.f});
+  // size_t index: avoids comparing a signed int with vector::size().
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+        place, "def", true, false, alphas[i], x_dims[i], y_dims[i]));
+    arena::Arena arena(std::move(tester), place, 2e-5);
+    arena.TestPrecision();
+  }
+}
+// 2-D x 2-D with only Y transposed; case i pairs x_dims[i], y_dims[i] and
+// alphas[i]. Tolerance 2e-5.
+void test_matmul2x2_y_transpose(Place place) {
+  std::vector<DDim> x_dims({DDim({5, 2}), DDim({2, 5})});
+  std::vector<DDim> y_dims({DDim({3, 2}), DDim({1, 5})});
+  std::vector<float> alphas({1.f, 2.f});
+  // size_t index: avoids comparing a signed int with vector::size().
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+        place, "def", false, true, alphas[i], x_dims[i], y_dims[i]));
+    arena::Arena arena(std::move(tester), place, 2e-5);
+    arena.TestPrecision();
+  }
+}
+// 2-D x 2-D with both X and Y transposed; case i pairs x_dims[i], y_dims[i]
+// and alphas[i]. Tolerance 5e-5.
+void test_matmul2x2_transpose(Place place) {
+  std::vector<DDim> x_dims({DDim({6, 2}), DDim({5, 3})});
+  std::vector<DDim> y_dims({DDim({3, 6}), DDim({1, 5})});
+  std::vector<float> alphas({1.f, 2.f});
+  // size_t index: avoids comparing a signed int with vector::size().
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+        place, "def", true, true, alphas[i], x_dims[i], y_dims[i]));
+    arena::Arena arena(std::move(tester), place, 5e-5);
+    arena.TestPrecision();
+  }
+}
+// 1-D x 1-D without transposition: two length-3 vectors, alpha 1.5,
+// tolerance 2e-5.
+void test_matmul1x1_no_transpose(Place place) {
+  const float scale = 1.5f;
+  DDim lhs_shape({3});
+  DDim rhs_shape({3});
+  std::unique_ptr<arena::TestCase> test_case(new MatMulComputeTester(
+      place, "def", false, false, scale, lhs_shape, rhs_shape));
+  arena::Arena runner(std::move(test_case), place, 2e-5);
+  runner.TestPrecision();
+}
+// 1-D x 1-D with both operands transposed: a length-3 and a length-5 vector,
+// alpha 1.5, tolerance 2e-5.
+void test_matmul1x1_transpose(Place place) {
+  const float scale = 1.5f;
+  DDim lhs_shape({3});
+  DDim rhs_shape({5});
+  std::unique_ptr<arena::TestCase> test_case(new MatMulComputeTester(
+      place, "def", true, true, scale, lhs_shape, rhs_shape));
+  arena::Arena runner(std::move(test_case), place, 2e-5);
+  runner.TestPrecision();
+}
+// 4-D X of shape [3, 4, 2, 5] times 1-D Y of length 5, no transposition,
+// alpha 1.5, tolerance 2e-5.
+void test_matmul_nx1(Place place) {
+  const float scale = 1.5f;
+  DDim lhs_shape({3, 4, 2, 5});
+  DDim rhs_shape({5});
+  std::unique_ptr<arena::TestCase> test_case(new MatMulComputeTester(
+      place, "def", false, false, scale, lhs_shape, rhs_shape));
+  arena::Arena runner(std::move(test_case), place, 2e-5);
+  runner.TestPrecision();
+}
+// 4-D X of shape [1, 2, 2, 3] times 2-D Y of shape [3, 1], no transposition,
+// alpha 1, tolerance 2e-5.
+void test_matmul_nx2_1(Place place) {
+  const float scale = 1.f;
+  DDim lhs_shape({1, 2, 2, 3});
+  DDim rhs_shape({3, 1});
+  std::unique_ptr<arena::TestCase> test_case(new MatMulComputeTester(
+      place, "def", false, false, scale, lhs_shape, rhs_shape));
+  arena::Arena runner(std::move(test_case), place, 2e-5);
+  runner.TestPrecision();
+}
+// 4-D X of shape [1, 2, 2, 3] times 2-D Y of shape [3, 3], no transposition,
+// alpha 1.5, tolerance 2e-5.
+void test_matmul_nx2_2(Place place) {
+  const float scale = 1.5f;
+  DDim lhs_shape({1, 2, 2, 3});
+  DDim rhs_shape({3, 3});
+  std::unique_ptr<arena::TestCase> test_case(new MatMulComputeTester(
+      place, "def", false, false, scale, lhs_shape, rhs_shape));
+  arena::Arena runner(std::move(test_case), place, 2e-5);
+  runner.TestPrecision();
+}
+// 4-D X times 2-D Y with only X transposed; case i pairs x_dims[i],
+// y_dims[i] and alphas[i]. Tolerance 2e-4.
+void test_matmulnx2_x_transpose(Place place) {
+  std::vector<DDim> x_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 2})});
+  std::vector<DDim> y_dims({DDim({6, 2}), DDim({5, 1})});
+  std::vector<float> alphas({1.f, 2.f});
+  // size_t index: avoids comparing a signed int with vector::size().
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+        place, "def", true, false, alphas[i], x_dims[i], y_dims[i]));
+    arena::Arena arena(std::move(tester), place, 2e-4);
+    arena.TestPrecision();
+  }
+}
+// 4-D X times 2-D Y with only Y transposed; case i pairs x_dims[i],
+// y_dims[i] and alphas[i]. Tolerance 5e-5.
+void test_matmulnx2_y_transpose(Place place) {
+  std::vector<DDim> x_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 2})});
+  std::vector<DDim> y_dims({DDim({6, 2}), DDim({1, 2})});
+  std::vector<float> alphas({1.f, 2.f});
+  // size_t index: avoids comparing a signed int with vector::size().
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+        place, "def", false, true, alphas[i], x_dims[i], y_dims[i]));
+    arena::Arena arena(std::move(tester), place, 5e-5);
+    arena.TestPrecision();
+  }
+}
+// 4-D X times 2-D Y with both operands transposed; case i pairs x_dims[i],
+// y_dims[i] and alphas[i]. Tolerance 5e-5.
+void test_matmulnx2_transpose(Place place) {
+  std::vector<DDim> x_dims({DDim({3, 4, 4, 3}), DDim({5, 3, 3, 2})});
+  std::vector<DDim> y_dims({DDim({2, 4}), DDim({1, 3})});
+  std::vector<float> alphas({1.f, 2.f});
+  // size_t index: avoids comparing a signed int with vector::size().
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+        place, "def", true, true, alphas[i], x_dims[i], y_dims[i]));
+    arena::Arena arena(std::move(tester), place, 5e-5);
+    arena.TestPrecision();
+  }
+}
+// Batched 4-D x 4-D product: X [3, 4, 2, 5] times Y [3, 4, 5, 2], no
+// transposition, alpha 1.5, tolerance 1e-3.
+void test_matmul_nxn(Place place) {
+  const float scale = 1.5f;
+  DDim lhs_shape({3, 4, 2, 5});
+  DDim rhs_shape({3, 4, 5, 2});
+  std::unique_ptr<arena::TestCase> test_case(new MatMulComputeTester(
+      place, "def", false, false, scale, lhs_shape, rhs_shape));
+  arena::Arena runner(std::move(test_case), place, 1e-3);
+  runner.TestPrecision();
+}
+// Batched 4-D x 4-D with only X transposed; case i pairs x_dims[i],
+// y_dims[i] and alphas[i]. Tolerance 1e-3.
+void test_matmulnxn_x_transpose(Place place) {
+  std::vector<DDim> x_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 2})});
+  std::vector<DDim> y_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 1})});
+  std::vector<float> alphas({1.f, 2.f});
+  // size_t index: avoids comparing a signed int with vector::size().
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+        place, "def", true, false, alphas[i], x_dims[i], y_dims[i]));
+    arena::Arena arena(std::move(tester), place, 1e-3);
+    arena.TestPrecision();
+  }
+}
+// Batched 4-D x 4-D with only Y transposed; case i pairs x_dims[i],
+// y_dims[i] and alphas[i]. Tolerance 1e-3.
+void test_matmulnxn_y_transpose(Place place) {
+  std::vector<DDim> x_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 2})});
+  std::vector<DDim> y_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 1, 2})});
+  std::vector<float> alphas({1.f, 2.f});
+  // size_t index: avoids comparing a signed int with vector::size().
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+        place, "def", false, true, alphas[i], x_dims[i], y_dims[i]));
+    arena::Arena arena(std::move(tester), place, 1e-3);
+    arena.TestPrecision();
+  }
+}
+// Batched 4-D x 4-D with both operands transposed; case i pairs x_dims[i],
+// y_dims[i] and alphas[i]. Tolerance 1e-3.
+void test_matmulnxn_transpose(Place place) {
+  std::vector<DDim> x_dims({DDim({3, 4, 4, 3}), DDim({5, 3, 3, 2})});
+  std::vector<DDim> y_dims({DDim({3, 4, 2, 4}), DDim({5, 3, 1, 3})});
+  std::vector<float> alphas({1.f, 2.f});
+  // size_t index: avoids comparing a signed int with vector::size().
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+        place, "def", true, true, alphas[i], x_dims[i], y_dims[i]));
+    arena::Arena arena(std::move(tester), place, 1e-3);
+    arena.TestPrecision();
+  }
+}
+// 2-D x 2-D without transposition. Only ARM builds run the test; an X86
+// build merely constructs a Place and executes nothing.
+TEST(Matmul2x2, precision) {
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+#endif
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+  test_matmul2x2_no_transform(place);
+#endif
+}
+// 2-D x 2-D with X transposed. Only ARM builds run the test; an X86 build
+// merely constructs a Place and executes nothing.
+TEST(Matmul2x2_x_transpose, precision) {
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+#endif
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+  test_matmul2x2_x_transpose(place);
+#endif
+}
+// 2-D x 2-D with Y transposed. Only ARM builds run the test; an X86 build
+// merely constructs a Place and executes nothing.
+TEST(Matmul2x2_y_transpose, precision) {
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+#endif
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+  test_matmul2x2_y_transpose(place);
+#endif
+}
+// 2-D x 2-D with both operands transposed. Only ARM builds run the test; an
+// X86 build merely constructs a Place and executes nothing.
+TEST(Matmul2x2_transpose, precision) {
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+#endif
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+  test_matmul2x2_transpose(place);
+#endif
+}
+// 1-D x 1-D cases (both-transposed, then no-transpose). Only ARM builds run
+// them; an X86 build merely constructs a Place and executes nothing.
+TEST(Matmul1x1, precision) {
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+#endif
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+  test_matmul1x1_transpose(place);
+  test_matmul1x1_no_transpose(place);
+#endif
+}
+// N-D x 1-D case. Only ARM builds run the test; an X86 build merely
+// constructs a Place and executes nothing.
+TEST(Matmulnx1, precision) {
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+#endif
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+  test_matmul_nx1(place);
+#endif
+}
+// N-D x 2-D cases: two plain products followed by the X-, Y- and
+// both-transposed variants. Only ARM builds run them; an X86 build merely
+// constructs a Place and executes nothing.
+TEST(Matmulnx2, precision) {
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+#endif
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+  test_matmul_nx2_1(place);
+  test_matmul_nx2_2(place);
+  test_matmulnx2_x_transpose(place);
+  test_matmulnx2_y_transpose(place);
+  test_matmulnx2_transpose(place);
+#endif
+}
+// Batched N-D x N-D cases: plain product followed by the X-, Y- and
+// both-transposed variants. Only ARM builds run them; an X86 build merely
+// constructs a Place and executes nothing.
+TEST(Matmulnxn, precision) {
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+#endif
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+  test_matmul_nxn(place);
+  test_matmulnxn_x_transpose(place);
+  test_matmulnxn_y_transpose(place);
+  test_matmulnxn_transpose(place);
+#endif
+}
+}  // namespace lite
+}  // namespace paddle