diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt
index 63bdb4f57bb5d196daedc6d4c57737fdcb2ad3a1..1dd5ff60ccddaf1f2f35ae59d84f432a564c9443 100644
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
@@ -20,6 +20,7 @@ set(npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_np
 
 lite_cc_library(subgraph_bridge_fc_op_npu SRCS fc_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_conv_op_npu SRCS conv_op.cc DEPS ${npu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_matmul_op_npu SRCS matmul_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_mul_op_npu SRCS mul_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_act_op_npu SRCS act_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_scale_op_npu SRCS scale_op.cc DEPS ${npu_subgraph_bridge_deps})
@@ -50,6 +51,7 @@ set(npu_subgraph_bridges
         subgraph_bridge_graph_npu
         subgraph_bridge_fc_op_npu
         subgraph_bridge_conv_op_npu
+        subgraph_bridge_matmul_op_npu
         subgraph_bridge_mul_op_npu
         subgraph_bridge_act_op_npu
         subgraph_bridge_scale_op_npu
diff --git a/lite/kernels/npu/bridges/matmul_op.cc b/lite/kernels/npu/bridges/matmul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e3d5ab2d7f2ada896896abcb2505f14c1d4dc28
--- /dev/null
+++ b/lite/kernels/npu/bridges/matmul_op.cc
@@ -0,0 +1,164 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "lite/kernels/npu/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace npu {
+
+// Converts a Paddle matmul op into NPU (HiAI) IR nodes and adds them to the
+// subgraph passed in through `ctx` (a Graph*).
+//
+// Inputs of rank 2 are mapped to ge::op::MatMul, inputs of higher rank to
+// ge::op::BatchMatMul. When alpha differs from 1, a ge::op::Scale node is
+// added on top of the matmul node under the same output name.
+//
+// Returns FAILED (after logging a warning) for the unsupported cases: 1-D X,
+// X and Y of different rank, different batch sizes, transposed X of rank > 2,
+// and a scaled output of rank > 4. Returns REBUILD_WHEN_SHAPE_CHANGED on
+// success.
+int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  // kernel is dereferenced below to query the declared input/output types.
+  CHECK(kernel != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";
+
+  // Get input and output vars and op attributes
+  auto x_name = op_info->Input("X").front();
+  auto x_type = kernel->GetInputDeclType("X");
+  CHECK(x_type->precision() == PRECISION(kFloat));
+  CHECK(x_type->layout() == DATALAYOUT(kNCHW));
+  auto x = scope->FindMutableTensor(x_name);
+  auto x_dims = x->dims();
+
+  auto y_name = op_info->Input("Y").front();
+  auto y_type = kernel->GetInputDeclType("Y");
+  CHECK(y_type->precision() == PRECISION(kFloat));
+  CHECK(y_type->layout() == DATALAYOUT(kNCHW));
+  auto y = scope->FindMutableTensor(y_name);
+  auto y_dims = y->dims();
+
+  if (x_dims.size() == 1 || x_dims.size() != y_dims.size()) {
+    LOG(WARNING)
+        << "[NPU] dims size of x and y must be same and greater than 1.";
+    return FAILED;
+  }
+  // Batched case: compare the leading (batch) part of both shapes.
+  if (x_dims.size() > 2 &&
+      x_dims.count(0, x_dims.size() - 2) !=
+          y_dims.count(0, y_dims.size() - 2)) {
+    LOG(WARNING) << "[NPU] batched matmul only support the same batch size";
+    return FAILED;
+  }
+
+  auto out_name = op_info->Output("Out").front();
+  auto out_type = kernel->GetOutputDeclType("Out");
+  CHECK(out_type->precision() == PRECISION(kFloat));
+  CHECK(out_type->layout() == DATALAYOUT(kNCHW));
+  auto out = scope->FindMutableTensor(out_name);
+  auto out_dims = out->dims();
+
+  bool transpose_x = op_info->GetAttr<bool>("transpose_X");
+  if (x_dims.size() > 2 && transpose_x) {
+    LOG(WARNING) << "[NPU] not support transpose_x == true if x_dims size "
+                    "greater than 2.";
+    return FAILED;
+  }
+  bool transpose_y = op_info->GetAttr<bool>("transpose_Y");
+  float alpha = op_info->GetAttr<float>("alpha");
+
+  // A Scale node is only needed when alpha is not (numerically) 1. Reject
+  // the unsupported output rank here, before any node is added, so that a
+  // failed conversion does not leave half-built nodes in the graph.
+  const bool need_scale = std::fabs(alpha - 1.f) > 1e-6f;
+  if (need_scale && out_dims.size() > 4) {
+    LOG(WARNING) << "[NPU] not support out dims size greater than 4.";
+    return FAILED;
+  }
+
+  // X node: reuse the node already in the graph, otherwise create it from
+  // the tensor.
+  std::shared_ptr<Node> x_node = nullptr;
+  if (graph->Has(x_name)) {
+    x_node = graph->Get(x_name);
+  } else {
+    x_node = graph->Add(x_name, *x);
+  }
+
+  // Y node, handled like X.
+  // NOTE(review): Y is not checked to be 2-D or persistable here; confirm
+  // whether the NPU MatMul/BatchMatMul ops require that.
+  std::shared_ptr<Node> y_node = nullptr;
+  if (graph->Has(y_name)) {
+    y_node = graph->Get(y_name);
+  } else {
+    y_node = graph->Add(y_name, *y);
+  }
+
+  // Matmul node: MatMul for 2-D inputs, BatchMatMul otherwise
+  std::shared_ptr<Node> matmul_node = nullptr;
+  if (x_dims.size() == 2) {
+    matmul_node = graph->Add<ge::op::MatMul>(out_name);
+    auto matmul_op = matmul_node->data<ge::op::MatMul>();
+    matmul_op->set_input_x1(*x_node->data());
+    matmul_op->set_input_x2(*y_node->data());
+    matmul_op->set_attr_transpose_x1(transpose_x);
+    matmul_op->set_attr_transpose_x2(transpose_y);
+  } else {
+    matmul_node = graph->Add<ge::op::BatchMatMul>(out_name);
+    auto matmul_op = matmul_node->data<ge::op::BatchMatMul>();
+    matmul_op->set_input_x(*x_node->data());
+    matmul_op->set_input_y(*y_node->data());
+    matmul_op->set_attr_adj_x(transpose_x);
+    matmul_op->set_attr_adj_y(transpose_y);
+  }
+
+  if (need_scale) {
+    auto scaled_out_node = graph->Add<ge::op::Scale>(out_name);
+    auto scaled_out_op = scaled_out_node->data<ge::op::Scale>();
+    scaled_out_op->set_input_x(*matmul_node->data());
+    scaled_out_op->set_attr_axis(1);
+    // The filter node is built from alpha with a 4-D shape whose second dim
+    // is out_dims[1] for a 4-D output and out_dims[0] for lower ranks.
+    std::vector<int64_t> scale_bias_shape(4, 1);
+    scale_bias_shape[1] = out_dims.size() == 4 ? out_dims[1] : out_dims[0];
+    auto filter_node =
+        graph->Add(out_name + "/filter", alpha, scale_bias_shape);
+    scaled_out_op->set_input_filter(*filter_node->data());
+  }
+
+  return REBUILD_WHEN_SHAPE_CHANGED;
+}
+
+}  // namespace npu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(matmul,
+                         kNPU,
+                         paddle::lite::subgraph::npu::MatMulConverter);
diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h
index 3f8effa61928e522e80f7d74b38a8a672235a1f0..30d7b79c7e03dfb8176c3bdd098f35eef56a9afd 100644
--- a/lite/kernels/npu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/npu/bridges/paddle_use_bridges.h
@@ -41,6 +41,7 @@ USE_SUBGRAPH_BRIDGE(fusion_elementwise_div_activation, kNPU);
 USE_SUBGRAPH_BRIDGE(fc, kNPU);
 USE_SUBGRAPH_BRIDGE(bilinear_interp, kNPU);
 USE_SUBGRAPH_BRIDGE(nearest_interp, kNPU);
+USE_SUBGRAPH_BRIDGE(matmul, kNPU);
 USE_SUBGRAPH_BRIDGE(mul, kNPU);
 USE_SUBGRAPH_BRIDGE(pad2d, kNPU);
 USE_SUBGRAPH_BRIDGE(pool2d, kNPU);
diff --git a/lite/kernels/npu/bridges/utility.h b/lite/kernels/npu/bridges/utility.h
index 556b5b9fee969d89436620fd8499659d7481f132..6d7dc5891fa6821f926b232633dc40f26efb7a2e 100644
--- a/lite/kernels/npu/bridges/utility.h
+++ b/lite/kernels/npu/bridges/utility.h
@@ -94,6 +94,30 @@ REG_OP(Pad)
     .ATTR(epsilon, AttrValue::FLOAT{1e-7f})
     .OP_END()
 
+    /*
+     * Multiplies slices of two tensors in batches.
+     * <Input>
+     *      x : The input tensor
+     *      y : The input tensor
+     * <Output>
+     *      z : The output tensor
+     * <Attr>
+     *      adj_x : If adj_x is true, the input tensor x is transposed,
+     * otherwise it will not be transposed. Default is false (the current
+     * version only supports false).
+     *      adj_y : If adj_y is true, the input tensor y is transposed,
+     * otherwise it will not be transposed. Default is false.
+     * <Added in HiAI version>
+     *      100.320.010.010
+     */
+    REG_OP(BatchMatMul)
+    .INPUT(x, TensorType({DT_FLOAT}))
+    .INPUT(y, TensorType({DT_FLOAT}))
+    .OUTPUT(z, TensorType({DT_FLOAT}))
+    .ATTR(adj_x, AttrValue::BOOL{false})
+    .ATTR(adj_y, AttrValue::BOOL{false})
+    .OP_END()
+
 }  // namespace ge
 
 namespace paddle {
diff --git a/lite/tests/kernels/matmul_compute_test.cc b/lite/tests/kernels/matmul_compute_test.cc
index 5d19e7fe3c023cdeea9f395a84f3ed53454c8c28..59b0fde8fd18b8a2170b6fdbd42444f09843f077 100644
--- a/lite/tests/kernels/matmul_compute_test.cc
+++ b/lite/tests/kernels/matmul_compute_test.cc
@@ -16,6 +16,7 @@
 #include "lite/api/paddle_use_kernels.h"
 #include "lite/api/paddle_use_ops.h"
 #include "lite/core/arena/framework.h"
+#include "lite/tests/utils/fill_data.h"
 
 namespace paddle {
 namespace lite {
@@ -120,27 +121,27 @@ class MatMulComputeTester : public arena::TestCase {
   // common attributes for this op.
   std::string x_ = "X";
   std::string y_ = "Y";
-  bool x_transpose_;
-  bool y_transpose_;
-  float alpha_;
   std::string out_ = "Out";
   DDim x_dims_;
   DDim y_dims_;
+  bool x_transpose_;
+  bool y_transpose_;
+  float alpha_;
 
  public:
   MatMulComputeTester(const Place& place,
                       const std::string& alias,
-                      bool x_transpose,
-                      bool y_transpose,
-                      float alpha,
                       const DDim& x_dims,
-                      const DDim& y_dims)
+                      const DDim& y_dims,
+                      bool x_transpose = false,
+                      bool y_transpose = false,
+                      float alpha = 1.f)
       : TestCase(place, alias),
+        x_dims_(x_dims),
+        y_dims_(y_dims),
         x_transpose_(x_transpose),
         y_transpose_(y_transpose),
-        alpha_(alpha),
-        x_dims_(x_dims),
-        y_dims_(y_dims) {}
+        alpha_(alpha) {}
 
   void RunBaseline(Scope* scope) override {
     auto* x = scope->FindTensor(x_);
@@ -295,215 +296,166 @@ class MatMulComputeTester : public arena::TestCase {
   }
 
   void PrepareData() override {
-    std::vector<float> x_data(x_dims_.production());
-    std::vector<float> y_data(y_dims_.production());
+    std::vector<float> x(x_dims_.production());
+    fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
+    SetCommonTensor(x_, x_dims_, x.data());
 
-    for (int i = 0; i < x_dims_.production(); ++i) {
-      x_data[i] = 1;  // i * 1.1;
-    }
-    for (int i = 0; i < y_dims_.production(); ++i) {
-      y_data[i] = 1;  // i * 0.9;
-    }
-
-    SetCommonTensor(x_, x_dims_, x_data.data());
-    SetCommonTensor(y_, y_dims_, y_data.data());
+    std::vector<float> y(y_dims_.production());
+    fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
+    SetCommonTensor(y_, y_dims_, y.data(), {}, true);
   }
 };
 
-void test_matmul2x2_no_transform(Place place) {
-  for (int m : {1, 2, 4, 8}) {
-    for (int k : {1, 3, 5}) {
-      for (int n : {1, 2, 4, 6}) {
+// Runs one matmul precision case: builds a MatMulComputeTester from the given
+// shapes and attributes and checks it on `place` with tolerance `abs_error`.
+void test_matmul_helper(Place place,
+                        float abs_error,
+                        std::vector<int64_t> x_dims,
+                        std::vector<int64_t> y_dims,
+                        bool x_transpose,
+                        bool y_transpose,
+                        float alpha) {
+  // The tester takes its shapes as DDim, so convert the plain vectors first.
+  const DDim x_ddim(x_dims);
+  const DDim y_ddim(y_dims);
+  std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
+      place, "def", x_ddim, y_ddim, x_transpose, y_transpose, alpha));
+  arena::Arena arena(std::move(tester), place, abs_error);
+  arena.TestPrecision();
+}
+
+void test_matmul2x2(Place place, float abs_error) {
+  for (int64_t m : {1, 2, 8}) {
+    for (int64_t k : {1, 3, 5}) {
+      for (int64_t n : {1, 4, 6}) {
         for (float alpha : {1., 2.}) {
-          bool x_transform = false;
-          bool y_transform = false;
-          std::unique_ptr<arena::TestCase> tester(
-              new MatMulComputeTester(place,
-                                      "def",
-                                      x_transform,
-                                      y_transform,
-                                      alpha,
-                                      DDim({m, k}),
-                                      DDim({k, n})));
-          arena::Arena arena(std::move(tester), place, 5e-4);
-          arena.TestPrecision();
+          test_matmul_helper(
+              place, abs_error, {m, k}, {k, n}, false, false, alpha);
         }
       }
     }
   }
 }
 
-void test_matmul2x2_x_transpose(Place place) {
-  std::vector<DDim> x_dims({DDim({3, 4}), DDim({2, 5})});
-  std::vector<DDim> y_dims({DDim({3, 2}), DDim({2, 1})});
-  std::vector<float> alphas({1.f, 2.f});
-  for (int i = 0; i < x_dims.size(); ++i) {
-    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
-        place, "def", true, false, alphas[i], x_dims[i], y_dims[i]));
-    arena::Arena arena(std::move(tester), place, 2e-5);
-    arena.TestPrecision();
+void test_matmul2x2_xtranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(place, abs_error, {3, 4}, {3, 2}, true, false, alpha);
+    test_matmul_helper(place, abs_error, {2, 5}, {2, 1}, true, false, alpha);
   }
 }
 
-void test_matmul2x2_y_transpose(Place place) {
-  std::vector<DDim> x_dims({DDim({5, 2}), DDim({2, 5})});
-  std::vector<DDim> y_dims({DDim({3, 2}), DDim({1, 5})});
-  std::vector<float> alphas({1.f, 2.f});
-  for (int i = 0; i < x_dims.size(); ++i) {
-    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
-        place, "def", false, true, alphas[i], x_dims[i], y_dims[i]));
-    arena::Arena arena(std::move(tester), place, 2e-5);
-    arena.TestPrecision();
+void test_matmul2x2_ytranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(place, abs_error, {5, 2}, {3, 2}, false, true, alpha);
+    test_matmul_helper(place, abs_error, {2, 5}, {1, 5}, false, true, alpha);
   }
 }
 
-void test_matmul2x2_transpose(Place place) {
-  std::vector<DDim> x_dims({DDim({6, 2}), DDim({5, 3})});
-  std::vector<DDim> y_dims({DDim({3, 6}), DDim({1, 5})});
-  std::vector<float> alphas({1.f, 2.f});
-  for (int i = 0; i < x_dims.size(); ++i) {
-    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
-        place, "def", true, true, alphas[i], x_dims[i], y_dims[i]));
-    arena::Arena arena(std::move(tester), place, 5e-5);
-    arena.TestPrecision();
+void test_matmul2x2_xytranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(place, abs_error, {6, 2}, {3, 6}, true, true, alpha);
+    test_matmul_helper(place, abs_error, {5, 3}, {1, 5}, true, true, alpha);
   }
 }
 
-void test_matmul1x1_no_transpose(Place place) {
-  DDim x_dim({3});
-  DDim y_dim({3});
-  float alpha = 1.5f;
-  std::unique_ptr<arena::TestCase> tester(
-      new MatMulComputeTester(place, "def", false, false, alpha, x_dim, y_dim));
-  arena::Arena arena(std::move(tester), place, 2e-5);
-  arena.TestPrecision();
-}
-
-void test_matmul1x1_transpose(Place place) {
-  DDim x_dim({3});
-  DDim y_dim({5});
-  float alpha = 1.5f;
-  std::unique_ptr<arena::TestCase> tester(
-      new MatMulComputeTester(place, "def", true, true, alpha, x_dim, y_dim));
-  arena::Arena arena(std::move(tester), place, 2e-5);
-  arena.TestPrecision();
+void test_matmul1x1(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(place, abs_error, {3}, {3}, false, false, alpha);
+  }
 }
 
-void test_matmul_nx1(Place place) {
-  DDim x_dim({3, 4, 2, 5});
-  DDim y_dim({5});
-  float alpha = 1.5f;
-  std::unique_ptr<arena::TestCase> tester(
-      new MatMulComputeTester(place, "def", false, false, alpha, x_dim, y_dim));
-  arena::Arena arena(std::move(tester), place, 2e-5);
-  arena.TestPrecision();
+void test_matmul1x1_xytranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(place, abs_error, {3}, {5}, true, true, alpha);
+  }
 }
 
-void test_matmul_nx2_1(Place place) {
-  DDim x_dim({1, 2, 2, 3});
-  DDim y_dim({3, 1});
-  float alpha = 1.f;
-  std::unique_ptr<arena::TestCase> tester(
-      new MatMulComputeTester(place, "def", false, false, alpha, x_dim, y_dim));
-  arena::Arena arena(std::move(tester), place, 2e-5);
-  arena.TestPrecision();
+void test_matmulnx1(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(
+        place, abs_error, {3, 4, 2, 5}, {5}, false, false, alpha);
+  }
 }
 
-void test_matmul_nx2_2(Place place) {
-  DDim x_dim({1, 2, 2, 3});
-  DDim y_dim({3, 3});
-  float alpha = 1.5f;
-  std::unique_ptr<arena::TestCase> tester(
-      new MatMulComputeTester(place, "def", false, false, alpha, x_dim, y_dim));
-  arena::Arena arena(std::move(tester), place, 2e-5);
-  arena.TestPrecision();
+void test_matmulnx2(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(
+        place, abs_error, {1, 2, 2, 3}, {3, 1}, false, false, alpha);
+    test_matmul_helper(
+        place, abs_error, {1, 2, 2, 3}, {3, 4}, false, false, alpha);
+  }
 }
 
-void test_matmulnx2_x_transpose(Place place) {
-  std::vector<DDim> x_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 2})});
-  std::vector<DDim> y_dims({DDim({6, 2}), DDim({5, 1})});
-  std::vector<float> alphas({1.f, 2.f});
-  for (int i = 0; i < x_dims.size(); ++i) {
-    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
-        place, "def", true, false, alphas[i], x_dims[i], y_dims[i]));
-    arena::Arena arena(std::move(tester), place, 2e-4);
-    arena.TestPrecision();
+void test_matmulnx2_xtranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(
+        place, abs_error, {3, 4, 6, 2}, {6, 2}, true, false, alpha);
+    test_matmul_helper(
+        place, abs_error, {5, 3, 5, 2}, {5, 1}, true, false, alpha);
   }
 }
 
-void test_matmulnx2_y_transpose(Place place) {
-  std::vector<DDim> x_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 2})});
-  std::vector<DDim> y_dims({DDim({6, 2}), DDim({1, 2})});
-  std::vector<float> alphas({1.f, 2.f});
-  for (int i = 0; i < x_dims.size(); ++i) {
-    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
-        place, "def", false, true, alphas[i], x_dims[i], y_dims[i]));
-    arena::Arena arena(std::move(tester), place, 5e-5);
-    arena.TestPrecision();
+void test_matmulnx2_ytranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(
+        place, abs_error, {3, 4, 6, 2}, {5, 2}, false, true, alpha);
+    test_matmul_helper(
+        place, abs_error, {5, 3, 5, 2}, {1, 2}, false, true, alpha);
   }
 }
 
-void test_matmulnx2_transpose(Place place) {
-  std::vector<DDim> x_dims({DDim({3, 4, 4, 3}), DDim({5, 3, 3, 2})});
-  std::vector<DDim> y_dims({DDim({2, 4}), DDim({1, 3})});
-  std::vector<float> alphas({1.f, 2.f});
-  for (int i = 0; i < x_dims.size(); ++i) {
-    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
-        place, "def", true, true, alphas[i], x_dims[i], y_dims[i]));
-    arena::Arena arena(std::move(tester), place, 5e-5);
-    arena.TestPrecision();
+void test_matmulnx2_xytranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(
+        place, abs_error, {3, 4, 4, 3}, {2, 4}, true, true, alpha);
+    test_matmul_helper(
+        place, abs_error, {5, 3, 3, 2}, {1, 3}, true, true, alpha);
   }
 }
 
-void test_matmul_nxn(Place place) {
-  DDim x_dim({3, 4, 2, 5});
-  DDim y_dim({3, 4, 5, 2});
-  float alpha = 1.5f;
-  std::unique_ptr<arena::TestCase> tester(
-      new MatMulComputeTester(place, "def", false, false, alpha, x_dim, y_dim));
-  arena::Arena arena(std::move(tester), place, 1e-3);
-  arena.TestPrecision();
+void test_matmulnxn(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(
+        place, abs_error, {3, 4, 6, 2}, {3, 4, 2, 5}, false, false, alpha);
+    test_matmul_helper(
+        place, abs_error, {5, 3, 4}, {5, 4, 6}, false, false, alpha);
+  }
 }
 
-void test_matmulnxn_x_transpose(Place place) {
-  std::vector<DDim> x_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 2})});
-  std::vector<DDim> y_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 1})});
-  std::vector<float> alphas({1.f, 2.f});
-  for (int i = 0; i < x_dims.size(); ++i) {
-    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
-        place, "def", true, false, alphas[i], x_dims[i], y_dims[i]));
-    arena::Arena arena(std::move(tester), place, 1e-3);
-    arena.TestPrecision();
+void test_matmulnxn_xtranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(
+        place, abs_error, {3, 4, 2, 6}, {3, 4, 2, 5}, true, false, alpha);
+    test_matmul_helper(
+        place, abs_error, {5, 4, 2}, {5, 4, 6}, true, false, alpha);
   }
 }
 
-void test_matmulnxn_y_transpose(Place place) {
-  std::vector<DDim> x_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 5, 2})});
-  std::vector<DDim> y_dims({DDim({3, 4, 6, 2}), DDim({5, 3, 1, 2})});
-  std::vector<float> alphas({1.f, 2.f});
-  for (int i = 0; i < x_dims.size(); ++i) {
-    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
-        place, "def", false, true, alphas[i], x_dims[i], y_dims[i]));
-    arena::Arena arena(std::move(tester), place, 1e-3);
-    arena.TestPrecision();
+void test_matmulnxn_ytranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(
+        place, abs_error, {3, 4, 6, 2}, {3, 4, 5, 2}, false, true, alpha);
+    test_matmul_helper(
+        place, abs_error, {5, 3, 4}, {5, 6, 4}, false, true, alpha);
   }
 }
 
-void test_matmulnxn_transpose(Place place) {
-  std::vector<DDim> x_dims({DDim({3, 4, 4, 3}), DDim({5, 3, 3, 2})});
-  std::vector<DDim> y_dims({DDim({3, 4, 2, 4}), DDim({5, 3, 1, 3})});
-  std::vector<float> alphas({1.f, 2.f});
-  for (int i = 0; i < x_dims.size(); ++i) {
-    std::unique_ptr<arena::TestCase> tester(new MatMulComputeTester(
-        place, "def", true, true, alphas[i], x_dims[i], y_dims[i]));
-    arena::Arena arena(std::move(tester), place, 1e-3);
-    arena.TestPrecision();
+void test_matmulnxn_xytranspose(Place place, float abs_error) {
+  for (float alpha : {1.f, 2.f}) {
+    test_matmul_helper(
+        place, abs_error, {3, 4, 2, 6}, {3, 4, 5, 2}, true, true, alpha);
+    test_matmul_helper(
+        place, abs_error, {5, 4, 3}, {5, 6, 4}, true, true, alpha);
   }
 }
 
 TEST(Matmul2x2, precision) {
   Place place;
-#if defined(LITE_WITH_ARM)
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // use fp16 in npu
+#elif defined(LITE_WITH_ARM)
   place = TARGET(kARM);
 #elif defined(LITE_WITH_XPU)
   place = TARGET(kXPU);
@@ -511,22 +463,31 @@ TEST(Matmul2x2, precision) {
   return;
 #endif
 
-  test_matmul2x2_no_transform(place);
+  test_matmul2x2(place, abs_error);
 }
 
 TEST(Matmul2x2_x_transpose, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_matmul2x2_x_transpose(place);
+  Place place;
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // use fp16 in npu
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
 #endif
+
+  test_matmul2x2_xtranspose(place, abs_error);
 }
 
 TEST(Matmul2x2_y_transpose, precision) {
   Place place;
-#if defined(LITE_WITH_ARM)
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // use fp16 in npu
+#elif defined(LITE_WITH_ARM)
   place = TARGET(kARM);
 #elif defined(LITE_WITH_XPU)
   place = TARGET(kXPU);
@@ -534,65 +495,80 @@ TEST(Matmul2x2_y_transpose, precision) {
   return;
 #endif
 
-  test_matmul2x2_y_transpose(place);
+  test_matmul2x2_ytranspose(place, abs_error);
 }
 
 TEST(Matmul2x2_transpose, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_matmul2x2_transpose(place);
+  Place place;
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // use fp16 in npu
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
 #endif
+
+  test_matmul2x2_xytranspose(place, abs_error);
 }
 
 TEST(Matmul1x1, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_matmul1x1_transpose(place);
-  test_matmul1x1_no_transpose(place);
+  Place place;
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
 #endif
+
+  test_matmul1x1(place, abs_error);
+  test_matmul1x1_xytranspose(place, abs_error);
 }
 
 TEST(Matmulnx1, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_matmul_nx1(place);
+  Place place;
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
 #endif
+
+  test_matmulnx1(place, abs_error);
 }
 
 TEST(Matmulnx2, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_matmul_nx2_1(place);
-  test_matmul_nx2_2(place);
-  test_matmulnx2_x_transpose(place);
-  test_matmulnx2_y_transpose(place);
-  test_matmulnx2_transpose(place);
+  Place place;
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
 #endif
+
+  test_matmulnx2(place, abs_error);
+  test_matmulnx2_xtranspose(place, abs_error);
+  test_matmulnx2_ytranspose(place, abs_error);
+  test_matmulnx2_xytranspose(place, abs_error);
 }
 
 TEST(Matmulnxn, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_matmul_nxn(place);
-  test_matmulnxn_x_transpose(place);
-  test_matmulnxn_y_transpose(place);
-  test_matmulnxn_transpose(place);
+  Place place;
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // use fp16 in npu
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
 #endif
+
+  test_matmulnxn(place, abs_error);
+  test_matmulnxn_xtranspose(place, abs_error);
+  test_matmulnxn_ytranspose(place, abs_error);
+  test_matmulnxn_xytranspose(place, abs_error);
 }
 
 }  // namespace lite