diff --git a/lite/kernels/npu/bridges/matmul_op.cc b/lite/kernels/npu/bridges/matmul_op.cc
index 0e3d5ab2d7f2ada896896abcb2505f14c1d4dc28..4621f5955a841a0ba1b63381cb956242ce69639a 100644
--- a/lite/kernels/npu/bridges/matmul_op.cc
+++ b/lite/kernels/npu/bridges/matmul_op.cc
@@ -35,14 +35,14 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto x_type = kernel->GetInputDeclType("X");
   CHECK(x_type->precision() == PRECISION(kFloat));
   CHECK(x_type->layout() == DATALAYOUT(kNCHW));
-  auto x = scope->FindMutableTensor(x_name);
+  auto x = scope->FindTensor(x_name);
   auto x_dims = x->dims();
   auto y_name = op_info->Input("Y").front();
   auto y_type = kernel->GetInputDeclType("Y");
   CHECK(y_type->precision() == PRECISION(kFloat));
   CHECK(y_type->layout() == DATALAYOUT(kNCHW));
-  auto y = scope->FindMutableTensor(y_name);
+  auto y = scope->FindTensor(y_name);
   auto y_dims = y->dims();
 
   if (x_dims.size() == 1 || x_dims.size() != y_dims.size()) {
@@ -50,6 +50,10 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
         << "[NPU] dims size of x and y must be same and greater than 1.";
     return FAILED;
   }
+  if (y_dims.size() == 2 && !y->persistable()) {
+    LOG(WARNING) << "[NPU] y must be const if y is 2-D";
+    return FAILED;
+  }
   if (x_dims.size() > 2 &&
       x_dims.count(0, x_dims.size() - 2) !=
           y_dims.count(0, y_dims.size() - 2)) {
@@ -61,7 +65,7 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto out_type = kernel->GetOutputDeclType("Out");
   CHECK(out_type->precision() == PRECISION(kFloat));
   CHECK(out_type->layout() == DATALAYOUT(kNCHW));
-  auto out = scope->FindMutableTensor(out_name);
+  auto out = scope->FindTensor(out_name);
   auto out_dims = out->dims();
 
   bool transpose_x = op_info->GetAttr<bool>("transpose_X");
@@ -80,7 +84,6 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     x_node = graph->Add(x_name, *x);
   }
 
-  // Y node which only supports 2-D persistable tensor
   std::shared_ptr<Node> y_node = nullptr;
   if (graph->Has(y_name)) {
     y_node = graph->Get(y_name);
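[Reviewer note, not part of the patch] The new early-out encodes a HiAI DDK restriction: a 2-D y is lowered as constant MatMul weights, so it must be a persistable tensor; a non-persistable (runtime) 2-D y makes the converter return FAILED and the op falls back to the CPU kernel. Below is a minimal standalone sketch of that guard; TensorStub and YIsConvertible are hypothetical names invented for illustration, not PaddleLite APIs.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Stand-in for lite::Tensor, reduced to the two fields the guard reads.
    struct TensorStub {
      std::vector<int64_t> dims;
      bool persistable = false;
    };

    // Mirrors the check added to MatMulConverter above.
    bool YIsConvertible(const TensorStub& y) {
      if (y.dims.size() == 2 && !y.persistable) {
        std::cerr << "[NPU] y must be const if y is 2-D\n";
        return false;  // converter returns FAILED -> CPU fallback
      }
      return true;
    }

    int main() {
      TensorStub weight{{16, 32}, true};      // const 2-D weight: convertible
      TensorStub runtime_y{{16, 32}, false};  // runtime 2-D input: rejected
      std::cout << YIsConvertible(weight) << YIsConvertible(runtime_y) << "\n";
      return 0;
    }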
",n:" << n << ",k:" << k; VLOG(3) << "x_name:" << x_name << ", is data: " << graph->Has(x_name); VLOG(3) << "y_name:" << y_name << ", is data: " << graph->Has(y_name); - CHECK(graph->Has(x_name)) - << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet."; // X node which supports persistable and non-persistable tensor, and // reshape to (m, k) std::shared_ptr x_node = nullptr; if (graph->Has(x_name)) { x_node = graph->Get(x_name); - auto reshaped_x_node = graph->Add(x_name + "/reshape"); - auto reshaped_x_op = reshaped_x_node->data(); - reshaped_x_op->set_input_tensor(*x_node->data()); - reshaped_x_op->set_attr_shape({m, k}); - reshaped_x_op->set_attr_axis(0); - x_node = reshaped_x_node; + if (x_dims.size() != 2) { + auto reshaped_x_node = graph->Add(x_name + "/reshape"); + auto reshaped_x_op = reshaped_x_node->data(); + reshaped_x_op->set_input_tensor(*x_node->data()); + reshaped_x_op->set_attr_shape({m, k}); + reshaped_x_op->set_attr_axis(0); + x_node = reshaped_x_node; + } } else { x_node = graph->Add(x_name, *x, {m, k}); } @@ -81,12 +90,14 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::shared_ptr y_node = nullptr; if (graph->Has(y_name)) { y_node = graph->Get(y_name); - auto reshaped_y_node = graph->Add(y_name + "/reshape"); - auto reshaped_y_op = reshaped_y_node->data(); - reshaped_y_op->set_input_tensor(*y_node->data()); - reshaped_y_op->set_attr_shape({k, n}); - reshaped_y_op->set_attr_axis(0); - y_node = reshaped_y_node; + if (y_dims.size() != 2) { + auto reshaped_y_node = graph->Add(y_name + "/reshape"); + auto reshaped_y_op = reshaped_y_node->data(); + reshaped_y_op->set_input_tensor(*y_node->data()); + reshaped_y_op->set_attr_shape({k, n}); + reshaped_y_op->set_attr_axis(0); + y_node = reshaped_y_node; + } } else { y_node = graph->Add(y_name, *y, {k, n}); } @@ -96,6 +107,17 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto mul_op = mul_node->data(); mul_op->set_input_x1(*x_node->data()); mul_op->set_input_x2(*y_node->data()); + + if (out_dims.size() != 2) { + auto reshaped_out_node = graph->Add(out_name); + auto reshaped_out_op = reshaped_out_node->data(); + reshaped_out_op->set_input_tensor(*mul_node->data()); + auto out_shape = out_dims.Vectorize(); + reshaped_out_op->set_attr_shape( + ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); + reshaped_out_op->set_attr_axis(0); + } + return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/npu/bridges/mul_op_test.cc b/lite/kernels/npu/bridges/mul_op_test.cc deleted file mode 100644 index 9bcd72bb35b7bf5de19d880f4ad535fec8e480fa..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/bridges/mul_op_test.cc +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/mul_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -void mul_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - int32_t x_num_col_dims = op_info->GetAttr("x_num_col_dims"); - int32_t y_num_col_dims = op_info->GetAttr("y_num_col_dims"); - auto x_data = x->mutable_data(); - auto y_data = y->mutable_data(); - auto out_data = out->mutable_data(); - auto x_mat_dims = x->dims().Flatten2D(x_num_col_dims); - auto y_mat_dims = y->dims().Flatten2D(y_num_col_dims); - CHECK_EQ(x_mat_dims[1], y_mat_dims[0]); - const int M = x_mat_dims[0]; - const int K = x_mat_dims[1]; - const int N = y_mat_dims[1]; - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - out_data[m * N + n] = 0; - for (int k = 0; k < K; ++k) { - out_data[m * N + n] += x_data[m * K + k] * y_data[k * N + n]; - } - } - } -} - -void test_mul(const std::vector& x_shape, - const std::vector& y_shape, - int x_num_col_dims, - int y_num_col_dims) { - const auto& bridges = lite::kernels::npu::bridges::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - CHECK(bridges.HasType("mul")); - - Scope scope; - std::string x_var_name("X"); - std::string y_var_name("Y"); - std::string out_var_name("Out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* y = scope.Var(y_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize(x_shape); - y->Resize(y_shape); - - FillTensor(x); - FillTensor(y); - - // create mul op - cpp::OpDesc mul_op_desc; - mul_op_desc.SetType("mul"); - mul_op_desc.SetInput("X", {x_var_name}); - mul_op_desc.SetInput("Y", {y_var_name}); - mul_op_desc.SetOutput("Out", {out_var_name}); - mul_op_desc.SetAttr("x_num_col_dims", static_cast(x_num_col_dims)); - mul_op_desc.SetAttr("y_num_col_dims", static_cast(y_num_col_dims)); - - auto mul_op = CreateOp(mul_op_desc, &scope); - LauchOp(mul_op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - mul_ref(mul_op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, mul) { - test_mul({1, 8, 8, 1}, {1, 8, 2, 2}, 2, 2); - test_mul({1, 5, 5, 1}, {1, 5, 7, 7}, 2, 2); - test_mul({1, 4, 1, 1}, {4, 8}, 1, 1); -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(mul); -USE_NPU_BRIDGE(mul); diff --git a/lite/operators/mul_op.cc b/lite/operators/mul_op.cc index 6067be5315220ec8b2f75265982e55f874e4b23a..c870abdc8989b48d8aa2f14f989ad475c027995e 100644 --- a/lite/operators/mul_op.cc +++ b/lite/operators/mul_op.cc @@ -32,21 +32,6 @@ bool MulOpLite::CheckShape() const { CHECK_GT_OR_FALSE(x_dims.size(), static_cast(param_.x_num_col_dims)); CHECK_GT_OR_FALSE(y_dims.size(), static_cast(param_.y_num_col_dims)); - // #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // auto 
diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc
index e70f443985536cb6493558cc6e9aee4584d969f5..d9bbfaa8d049cf2bbcdea9b9c5e58d201e156a67 100644
--- a/lite/tests/kernels/mul_compute_test.cc
+++ b/lite/tests/kernels/mul_compute_test.cc
@@ -99,7 +99,7 @@ class MulComputeTester : public arena::TestCase {
     std::vector<float> y(y_dims_.production());
     fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
 
-    SetCommonTensor(y_, y_dims_, y.data());
+    SetCommonTensor(y_, y_dims_, y.data(), {}, true);
   }
 };
 
@@ -123,7 +123,10 @@ TEST(Mul, precision) {
   LOG(INFO) << "test mul op";
   float abs_error = 2e-5;
   Place place;
-#if defined(LITE_WITH_XPU)
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // use fp16 in npu
+#elif defined(LITE_WITH_XPU)
   place = TARGET(kXPU);
 #else
   return;
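[Reviewer note, not part of the patch] Two things worth calling out in the test change: judging from the call site, the extra SetCommonTensor arguments pass an empty LoD and mark Y persistable, which is what lets the NPU bridge treat it as const weights; and abs_error is relaxed to 1e-2 because HiAI executes MatMul in fp16. A back-of-the-envelope sketch of why the looser bound is plausible (the numbers are an estimate, not a statement about HiAI internals):

    #include <cmath>
    #include <cstdio>

    int main() {
      // fp16 has a 10-bit mantissa, so the spacing (ulp) between
      // representable values near 1.0 is 2^-10 ~= 9.77e-4 -- already well
      // above the fp32 tolerance of 2e-5. A length-k dot product can
      // accumulate on the order of k ulps of rounding error, which is why
      // an absolute bound around 1e-2 is used for the shapes tested here.
      const double fp16_ulp_near_1 = std::pow(2.0, -10);
      const int k = 64;  // illustrative reduction length
      std::printf("fp16 ulp near 1.0: %.3e, ~k ulps: %.3e\n",
                  fp16_ulp_near_1, k * fp16_ulp_near_1);
      return 0;
    }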