[NPU] fix matmul_v2 and utils.run_check, test=develop (#36164)

* [NPU] fix matmul_v2 and utils.run_check, test=develop
* remove debug files, test=develop
* fix install_check, test=develop
* fix doc, test=develop
* fix review comments, test=develop

[NPU] fix matmul_v2 and utils.run_check, test=develop (#36164)
* [NPU] fix matmul_v2 and utils.run_check, test=develop
* remove debug files, test=develop
* fix install_check, test=develop
* fix doc, test=develop
* fix review comments, test=develop
7850f7ce · Qi Li · GitHub · 83541fd4 · 7850f7ce · 7850f7ce
6 changed files
--- a/paddle/fluid/operators/matmul_v2_op_npu.cc
+++ b/paddle/fluid/operators/matmul_v2_op_npu.cc
@@ -21,166 +21,387 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
+using Tensor = framework::Tensor;
+using NPUDeviceContext = platform::NPUDeviceContext;
+
+template <typename T>
+static void MatMul2D(const framework::ExecutionContext& ctx,
+                     const aclrtStream& stream, const Tensor& X,
+                     const Tensor& Y, Tensor* Out, const bool trans_x,
+                     const bool trans_y) {
+  Out->mutable_data<T>(ctx.GetPlace());
+  const auto& runner =
+      NpuOpRunner("MatMul", {X, Y}, {*Out},
+                  {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}});
+  runner.Run(stream);
+}
+
+template <typename T>
+static void MatMulND(const framework::ExecutionContext& ctx,
+                     const aclrtStream& stream, const Tensor& X,
+                     const Tensor& Y, Tensor* Out, const bool trans_x,
+                     const bool trans_y) {
+  Out->mutable_data<T>(ctx.GetPlace());
+  const auto& runner = NpuOpRunner("BatchMatMul", {X, Y}, {*Out},
+                                   {{"adj_x1", trans_x}, {"adj_x2", trans_y}});
+  runner.Run(stream);
+}
+
+template <typename T>
+static void ReduceDims(const framework::ExecutionContext& ctx,
+                       const aclrtStream& stream,
+                       const std::vector<int64_t>& dims,
+                       const std::vector<int64_t>& brd_dims, const Tensor& in,
+                       Tensor* out) {
+  std::vector<int64_t> axes;
+  int64_t size = brd_dims.size();
+  int64_t diff = brd_dims.size() - dims.size();
+  for (int64_t i = 0; i < size; ++i) {
+    if (i < diff) {
+      axes.push_back(i);
+      continue;
+    }
+    if (brd_dims[i] > dims[i - diff]) {
+      axes.push_back(i);
+    }
+  }
+  out->mutable_data<T>(ctx.GetPlace());
+  const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out},
+                                   {{"axes", axes}, {"keep_dims", false}});
+  runner.Run(stream);
+}
+
+template <typename T>
 class MatMulV2NPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* y = ctx.Input<framework::Tensor>("Y");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    bool transpose_x = ctx.Attr<bool>("trans_x");
-    bool transpose_y = ctx.Attr<bool>("trans_y");
-
-    if (x->dims().size() == 2) {
-      out->mutable_data<T>(ctx.GetPlace());
-
-      const auto& runner = NpuOpRunner(
-          "MatMul", {*x, *y}, {*out},
-          {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}});
-
-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      runner.Run(stream);
+    auto* X = ctx.Input<Tensor>("X");
+    auto* Y = ctx.Input<Tensor>("Y");
+    auto* Out = ctx.Output<Tensor>("Out");
+    const bool trans_x = ctx.Attr<bool>("trans_x");
+    const bool trans_y = ctx.Attr<bool>("trans_y");
+
+    std::vector<int64_t> x_dims = framework::vectorize(X->dims());
+    std::vector<int64_t> y_dims = framework::vectorize(Y->dims());
+    std::vector<int64_t> out_dims = framework::vectorize(Out->dims());
+    int x_ndim = x_dims.size();
+    int y_ndim = y_dims.size();
+    int out_ndim = out_dims.size();

-    } else if (x->dims().size() > 2) {
-      out->mutable_data<T>(ctx.GetPlace());
+    auto stream = ctx.template device_context<NPUDeviceContext>().stream();

-      const auto& runner =
-          NpuOpRunner("BatchMatMul", {*x, *y}, {*out},
-                      {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}});
+    // Case 1: [K] x [K] = [1]
+    if (x_ndim == 1 && y_ndim == 1) {
+      PADDLE_ENFORCE_EQ(
+          X->numel(), Y->numel(),
+          platform::errors::InvalidArgument(
+              "X's numbers must be equal to Y's numbers,"
+              "when X/Y's dims =1. But received X has [%d] elements,"
+              "received Y has [%d] elements",
+              X->numel(), Y->numel()));
+      Out->Resize({1});
+      Out->mutable_data<T>(ctx.GetPlace());

-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
+      const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out});
      runner.Run(stream);
+      return;
+    }
+
+    // Resize dim 1 to 2
+    Tensor x_temp, y_temp;
+    x_temp.ShareDataWith(*X);
+    y_temp.ShareDataWith(*Y);
+    if (x_ndim == 1) {
+      x_dims.insert(x_dims.begin(), 1);
+      out_dims.insert(out_dims.end() - 1, 1);
+      x_temp.Resize(framework::make_ddim(x_dims));
+      x_ndim = 2;
+      out_ndim += 1;
+    }
+    if (y_ndim == 1) {
+      y_dims.push_back(1);
+      out_dims.push_back(1);
+      y_temp.Resize(framework::make_ddim(y_dims));
+      y_ndim = 2;
+      out_ndim += 1;
+    }
+
+    const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
+    if (trans_y) {
+      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K,
+                        platform::errors::InvalidArgument(
+                            "Input(Y) has error dim."
+                            "Y'dims[%d] must be equal to %d"
+                            "But received Y'dims[%d] is %d",
+                            y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1]));
+    } else {
+      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K,
+                        platform::errors::InvalidArgument(
+                            "Input(Y) has error dim."
+                            "Y'dims[%d] must be equal to %d"
+                            "But received Y'dims[%d] is %d",
+                            y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2]));
    }
+
+    // Case 2: [M, K] x [K, N] = [M, N]
+    if (x_ndim == 2 && y_ndim == 2) {
+      MatMul2D<T>(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y);
+      return;
+    }
+
+    // Case 3: [B, M, K] x [K, N] =  [B, M, N], when trans_x = false
+    // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N]
+    if (trans_x == false && y_ndim == 2) {
+      std::vector<int64_t> vec_dim = {x_temp.numel() / K, K};
+      x_temp.Resize(framework::make_ddim(vec_dim));
+      MatMul2D<T>(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y);
+      return;
+    }
+
+    // Case 4: [B, M, K] x  [B, K, N] = [B, M, N]
+    std::vector<int64_t> x_broadcast_dims(out_ndim, 1);
+    std::vector<int64_t> y_broadcast_dims(out_ndim, 1);
+    std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin());
+    std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin());
+    std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2);
+    std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2);
+
+    Tensor x_temp_brd(X->type());
+    if (x_dims == x_broadcast_dims) {
+      x_temp_brd.ShareDataWith(*X);
+      x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims));
+    } else {
+      x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims));
+      x_temp_brd.mutable_data<T>(ctx.GetPlace());
+      NpuOpRunner runner_brd;
+      runner_brd.SetType("BroadcastTo")
+          .AddInput(x_temp)
+          .AddInput(std::move(x_broadcast_dims))
+          .AddOutput(x_temp_brd)
+          .Run(stream);
+    }
+
+    Tensor y_temp_brd(Y->type());
+    if (y_dims == y_broadcast_dims) {
+      y_temp_brd.ShareDataWith(*Y);
+      y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims));
+    } else {
+      y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims));
+      y_temp_brd.mutable_data<T>(ctx.GetPlace());
+      NpuOpRunner runner_brd;
+      runner_brd.SetType("BroadcastTo")
+          .AddInput(y_temp)
+          .AddInput(std::move(y_broadcast_dims))
+          .AddOutput(y_temp_brd)
+          .Run(stream);
+    }
+    MatMulND<T>(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y);
  }
 };

-template <typename DeviceContext, typename T>
+template <typename T>
 class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* y = ctx.Input<framework::Tensor>("Y");
-    auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    bool transpose_y = ctx.Attr<bool>("trans_y");
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    if (x->dims().size() == 2) {
-      if (transpose_y) {
-        if (dx) {
-          dx->mutable_data<T>(ctx.GetPlace());
-          const auto& runner_dx =
-              NpuOpRunner("MatMul", {*dout, *y}, {*dx},
-                          {{"transpose_x1", false}, {"transpose_x2", false}});
-
-          runner_dx.Run(stream);
-        }
-        if (dy) {
-          dy->mutable_data<T>(ctx.GetPlace());
-          const auto& runner_dy =
-              NpuOpRunner("MatMul", {*dout, *x}, {*dy},
-                          {{"transpose_x1", true}, {"transpose_x2", false}});
+    auto* X = ctx.Input<Tensor>("X");
+    auto* Y = ctx.Input<Tensor>("Y");
+    auto* dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dY = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    const bool trans_x = ctx.Attr<bool>("trans_x");
+    const bool trans_y = ctx.Attr<bool>("trans_y");

-          runner_dy.Run(stream);
-        }
+    std::vector<int64_t> x_dims = framework::vectorize(X->dims());
+    std::vector<int64_t> y_dims = framework::vectorize(Y->dims());
+    std::vector<int64_t> out_dims = framework::vectorize(dOut->dims());
+    int x_ndim = x_dims.size();
+    int y_ndim = y_dims.size();
+    int out_ndim = out_dims.size();

-      } else {
-        if (dx) {
-          dx->mutable_data<T>(ctx.GetPlace());
-          const auto& runner_dx =
-              NpuOpRunner("MatMul", {*dout, *y}, {*dx},
-                          {{"transpose_x1", false}, {"transpose_x2", true}});
+    auto stream = ctx.template device_context<NPUDeviceContext>().stream();

-          runner_dx.Run(stream);
-        }
-        if (dy) {
-          dy->mutable_data<T>(ctx.GetPlace());
-          const auto& runner_dy =
-              NpuOpRunner("MatMul", {*x, *dout}, {*dy},
-                          {{"transpose_x1", true}, {"transpose_x2", false}});
+    // Case 1: [K] x [K] = [1]
+    if (x_ndim == 1 && y_ndim == 1) {
+      Tensor dout_temp(dOut->type());
+      dout_temp.Resize(X->dims());
+      dout_temp.mutable_data<T>(ctx.GetPlace());
+      NpuOpRunner runner;
+      runner.SetType("BroadcastTo")
+          .AddInput(*dOut)
+          .AddInput(std::move(x_dims))
+          .AddOutput(dout_temp)
+          .Run(stream);

-          runner_dy.Run(stream);
+      if (dX) {
+        dX->mutable_data<T>(ctx.GetPlace());
+        const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {});
+        runner_dx.Run(stream);
+      }
+      if (dY) {
+        dY->mutable_data<T>(ctx.GetPlace());
+        const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {});
+        runner_dy.Run(stream);
+      }
+      return;
+    }
+
+    // Resize dim 1 to 2
+    Tensor x_temp, y_temp, dout_temp;
+    x_temp.ShareDataWith(*X);
+    y_temp.ShareDataWith(*Y);
+    dout_temp.ShareDataWith(*dOut);
+    if (x_ndim == 1) {
+      x_dims.insert(x_dims.begin(), 1);
+      out_dims.insert(out_dims.end() - 1, 1);
+      x_temp.Resize(framework::make_ddim(x_dims));
+      dout_temp.Resize(framework::make_ddim(out_dims));
+      x_ndim = 2;
+      out_ndim += 1;
+    }
+    if (y_ndim == 1) {
+      y_dims.push_back(1);
+      out_dims.push_back(1);
+      y_temp.Resize(framework::make_ddim(y_dims));
+      dout_temp.Resize(framework::make_ddim(out_dims));
+      y_ndim = 2;
+      out_ndim += 1;
+    }
+
+    // Case 2: [M, K] x [K, N] = [M, N]
+    if (out_ndim == 2) {
+      if (dX) {
+        dX->Resize(framework::make_ddim(x_dims));
+        if (trans_x) {
+          MatMul2D<T>(ctx, stream, y_temp, dout_temp, dX, trans_y, true);
+        } else {
+          MatMul2D<T>(ctx, stream, dout_temp, y_temp, dX, false, !trans_y);
        }
+        dX->Resize(X->dims());
      }
-    } else if (x->dims().size() > 2) {
-      if (transpose_y) {
-        if (dx) {
-          dx->mutable_data<T>(ctx.GetPlace());
-          const auto& runner_dx =
-              NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
-                          {{"adj_x1", false}, {"adj_x2", false}});
-
-          runner_dx.Run(stream);
+      if (dY) {
+        dY->Resize(framework::make_ddim(y_dims));
+        if (trans_y) {
+          MatMul2D<T>(ctx, stream, dout_temp, x_temp, dY, true, trans_x);
+        } else {
+          MatMul2D<T>(ctx, stream, x_temp, dout_temp, dY, !trans_x, false);
        }
-        if (dy) {
-          dy->mutable_data<T>(ctx.GetPlace());
-          const auto& runner_dy =
-              NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy},
-                          {{"adj_x1", true}, {"adj_x2", false}});
+        dY->Resize(Y->dims());
+      }
+      return;
+    }
+
+    const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
+    const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1];

-          runner_dy.Run(stream);
+    // Case 3: [B, M, K] x [K, N] =  [B, M, N], when trans_x = false
+    // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N]
+    if (trans_x == false && y_ndim == 2) {
+      std::vector<int64_t> x_vec_dim = {x_temp.numel() / K, K};
+      dout_temp.Resize(
+          framework::make_ddim(std::vector<int64_t>{dout_temp.numel() / N, N}));
+      if (dX) {
+        dX->Resize(framework::make_ddim(x_vec_dim));
+        MatMul2D<T>(ctx, stream, dout_temp, y_temp, dX, false, !trans_y);
+        dX->Resize(X->dims());
+      }
+      if (dY) {
+        x_temp.Resize(framework::make_ddim(x_vec_dim));
+        if (trans_y) {
+          MatMul2D<T>(ctx, stream, dout_temp, x_temp, dY, true, false);
+        } else {
+          MatMul2D<T>(ctx, stream, x_temp, dout_temp, dY, true, false);
        }
-      } else {
-        if (dx) {
-          dx->mutable_data<T>(ctx.GetPlace());
-          const auto& runner_dx =
-              NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
-                          {{"adj_x1", false}, {"adj_x2", true}});
+      }
+      return;
+    }
+
+    // Case 4: [B, M, K] x  [B, K, N] = [B, M, N]
+    std::vector<int64_t> x_broadcast_dims(out_ndim, 1);
+    std::vector<int64_t> y_broadcast_dims(out_ndim, 1);
+    std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin());
+    std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin());
+    std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2);
+    std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2);
+
+    Tensor x_temp_brd(X->type());
+    if (x_dims == x_broadcast_dims) {
+      x_temp_brd.ShareDataWith(*X);
+      x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims));
+    } else {
+      x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims));
+      x_temp_brd.mutable_data<T>(ctx.GetPlace());
+      NpuOpRunner runner_brd;
+      runner_brd.SetType("BroadcastTo")
+          .AddInput(x_temp)
+          .AddInput(std::move(x_broadcast_dims))
+          .AddOutput(x_temp_brd)
+          .Run(stream);
+    }

-          runner_dx.Run(stream);
+    Tensor y_temp_brd(Y->type());
+    if (y_dims == y_broadcast_dims) {
+      y_temp_brd.ShareDataWith(*Y);
+      y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims));
+    } else {
+      y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims));
+      y_temp_brd.mutable_data<T>(ctx.GetPlace());
+      NpuOpRunner runner_brd;
+      runner_brd.SetType("BroadcastTo")
+          .AddInput(y_temp)
+          .AddInput(std::move(y_broadcast_dims))
+          .AddOutput(y_temp_brd)
+          .Run(stream);
+    }
+
+    if (dX) {
+      if (x_dims == x_broadcast_dims) {
+        if (trans_x) {
+          MatMulND<T>(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true);
+        } else {
+          MatMulND<T>(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y);
        }
-        if (dy) {
-          dy->mutable_data<T>(ctx.GetPlace());
-          if ((x->dims().size() == 3) && (dout->dims().size() == 3) &&
-              (dy->dims().size() == 2)) {
-            framework::Tensor dout_tmp;
-            dout_tmp.ShareDataWith(*dout);
-            std::vector<int> vec_dim =
-                framework::vectorize<int>(dout_tmp.dims());
-            std::vector<int> vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]};
-            dout_tmp.Resize(framework::make_ddim(vec_dim_v));
-
-            framework::Tensor x_tmp;
-            x_tmp.ShareDataWith(*x);
-            std::vector<int> vec_dim_x =
-                framework::vectorize<int>(x_tmp.dims());
-            std::vector<int> vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1],
-                                         vec_dim_x[2]};
-            x_tmp.Resize(framework::make_ddim(vec_dim_x_v));
-            const auto& runner_dy =
-                NpuOpRunner("MatMul", {x_tmp, dout_tmp}, {*dy},
-                            {{"transpose_x1", true}, {"transpose_x2", false}});
-            runner_dy.Run(stream);
-          } else {
-            const auto& runner_dy =
-                NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy},
-                            {{"adj_x1", true}, {"adj_x2", false}});
-            runner_dy.Run(stream);
-          }
+      } else {
+        Tensor dx_temp(X->type());
+        dx_temp.Resize(framework::make_ddim(x_broadcast_dims));
+        if (trans_x) {
+          MatMulND<T>(ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y,
+                      true);
+        } else {
+          MatMulND<T>(ctx, stream, dout_temp, y_temp_brd, &dx_temp, false,
+                      !trans_y);
        }
+        ReduceDims<T>(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX);
+      }
+    }
+    if (dY) {
+      if (y_dims == y_broadcast_dims) {
+        if (trans_y) {
+          MatMulND<T>(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x);
+        } else {
+          MatMulND<T>(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false);
+        }
+      } else {
+        Tensor dy_temp(Y->type());
+        dy_temp.Resize(framework::make_ddim(y_broadcast_dims));
+        if (trans_y) {
+          MatMulND<T>(ctx, stream, dout_temp, x_temp_brd, &dy_temp, true,
+                      trans_x);
+        } else {
+          MatMulND<T>(ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x,
+                      false);
+        }
+        ReduceDims<T>(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY);
      }
    }
  }
 };
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;

-REGISTER_OP_NPU_KERNEL(
-    matmul_v2,
-    ops::MatMulV2NPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::MatMulV2NPUKernel<paddle::platform::NPUDeviceContext,
-                           paddle::platform::float16>);
-REGISTER_OP_NPU_KERNEL(
-    matmul_v2_grad,
-    ops::MatMulV2GradNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::MatMulV2GradNPUKernel<paddle::platform::NPUDeviceContext,
-                               paddle::platform::float16>);
+REGISTER_OP_NPU_KERNEL(matmul_v2, ops::MatMulV2NPUKernel<float>,
+                       ops::MatMulV2NPUKernel<paddle::platform::float16>);
+REGISTER_OP_NPU_KERNEL(matmul_v2_grad, ops::MatMulV2GradNPUKernel<float>,
+                       ops::MatMulV2GradNPUKernel<paddle::platform::float16>);
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -55,6 +55,7 @@ __all__ = [
    'is_compiled_with_cuda',
    'is_compiled_with_rocm',
    'is_compiled_with_xpu',
+    'is_compiled_with_npu',
    'Variable',
    'require_version',
    'device_guard',
@@ -380,6 +381,15 @@ def _xpu_ids():
    return device_ids


+def _npu_ids():
+    npus_env = os.getenv("FLAGS_selected_npus")
+    if npus_env:
+        device_ids = [int(s) for s in npus_env.split(",")]
+    else:
+        device_ids = six.moves.range(core.get_npu_device_count())
+    return device_ids
+
+
 def is_compiled_with_xpu():
    """
    Whether this whl package can be used to run the model on XPU.
@@ -395,6 +405,21 @@ def is_compiled_with_xpu():
    return core.is_compiled_with_xpu()


+def is_compiled_with_npu():
+    """
+    Whether this whl package can be used to run the model on NPU.
+
+    Returns (bool): support npu or not.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            support_npu = fluid.is_compiled_with_npu()
+    """
+    return core.is_compiled_with_npu()
+
+
 def disable_signal_handler():
    """
    Reset signal handler registered by Paddle.
@@ -538,6 +563,47 @@ def xpu_places(device_ids=None):
    return [core.XPUPlace(dev_id) for dev_id in device_ids]


+def npu_places(device_ids=None):
+    """
+    **Note**:
+        For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device.
+    
+    This function creates a list of :code:`paddle.NPUPlace` objects.
+    If :code:`device_ids` is None, environment variable of
+    :code:`FLAGS_selected_npus` would be checked first. For example, if
+    :code:`FLAGS_selected_npus=0,1,2`, the returned list would
+    be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)].
+    If :code:`FLAGS_selected_npus` is not set, all visible
+    npu places would be returned.
+    If :code:`device_ids` is not None, it should be the device
+    ids of NPUs. For example, if :code:`device_ids=[0,1,2]`,
+    the returned list would be 
+    [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)].
+    
+    Parameters:
+        device_ids (list or tuple of int, optional): list of NPU device ids.
+    Returns:
+        list of paddle.NPUPlace: Created NPU place list.
+    Examples:
+        .. code-block:: python
+
+            # required: npu
+
+            import paddle
+            import paddle.static as static
+            
+            paddle.enable_static()
+            npu_places = static.npu_places()
+    """
+    assert core.is_compiled_with_npu(), \
+        "Not compiled with NPU"
+    if device_ids is None:
+        device_ids = _npu_ids()
+    elif not isinstance(device_ids, (list, tuple)):
+        device_ids = [device_ids]
+    return [core.NPUPlace(dev_id) for dev_id in device_ids]
+
+
 def cpu_places(device_count=None):
    """
    This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
@@ -1927,6 +1993,10 @@ class Variable(object):
            p = core.Place()
            p.set_place(t._place())
            place = core.XPUPlace(p.xpu_device_id())
+        elif p.is_npu_place():
+            p = core.Place()
+            p.set_place(t._place())
+            place = core.NPUPlace(p.npu_device_id())
        else:
            p = core.Place()
            p.set_place(t._place())

--- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
@@ -20,4 +20,5 @@ if (WITH_ASCEND_CL)
    set_tests_properties(test_stack_op_npu PROPERTIES TIMEOUT 300)
    set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200)
    set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300)
+    set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300)
 endif()
--- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py
@@ -21,56 +21,35 @@ sys.path.append("..")
 from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
+from test_matmul_v2_op import reference_matmul

 paddle.enable_static()
 SEED = 2021


-def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
-    """Reference forward implementation using np.matmul."""
-    # np.matmul does not support the transpose flags, so we manually
-    # transpose X and Y appropriately.
-    if transpose_X:
-        if X.ndim == 1:
-            X = X.reshape((X.size))
-        elif X.ndim == 2:
-            X = X.T
-        else:
-            dim = [i for i in range(len(X.shape))]
-            dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
-            X = np.transpose(X, tuple(dim))
-    if transpose_Y:
-        if Y.ndim == 1:
-            Y = Y.reshape((Y.size))
-        else:
-            dim = [i for i in range(len(Y.shape))]
-            dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
-            Y = np.transpose(Y, tuple(dim))
-
-    Out = np.matmul(X, Y)
-    if not Out.shape:
-        # We do not support 0-dimensional Tensors (scalars). So where
-        # np.matmul outputs a scalar, we must convert to a Tensor of
-        # shape (1) instead.
-        # Everywhere else, we are compatible with np.matmul.
-        Out = np.array([Out], dtype="float64")
-    return Out
-
-
-class TestMatMul(OpTest):
+class TestMatMulV2Op(OpTest):
+    """
+    case 1
+    """
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+
    def config(self):
-        self.x_shape = (100, 24)
-        self.y_shape = (24, 100)
+        self.x_shape = (100, )
+        self.y_shape = (100, )
        self.trans_x = False
        self.trans_y = False

+    def init_kernel_type(self):
+        self.dtype = "float32"
+
    def setUp(self):
        self.set_npu()
-        self.op_type = "matmul_v2"
-        self.place = paddle.NPUPlace(0)
-        self.init_dtype()
+        self.init_kernel_type()
        self.config()
-        np.random.seed(SEED)
+        self.op_type = "matmul_v2"
        x = np.random.random(self.x_shape).astype(self.dtype)
        y = np.random.random(self.y_shape).astype(self.dtype)
        # -0.1 ~ 0.1
@@ -85,201 +64,314 @@ class TestMatMul(OpTest):
        self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y}
        self.outputs = {'Out': result}

-    def set_npu(self):
-        self.__class__.use_npu = True
-        self.__class__.no_need_check_grad = True
-
-    def init_dtype(self):
-        self.dtype = np.float32
-
    def test_check_output(self):
-        self.check_output_with_place(self.place, atol=1e-5)
+        self.check_output_with_place(self.place, atol=1e-7)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')


-    # TODO(ascendrc): Add grad test
-    # def test_check_grad(self):
-    #     if self.dtype == np.float16:
-    #         return
-    #     self.check_grad(['X'], 'Out')
-    #
-class TestMatMul2(TestMatMul):
+class TestMatMuklOp2(TestMatMulV2Op):
    """
    case 2
    """

    def config(self):
-        self.x_shape = (32, 24)
-        self.y_shape = (32, 24)
+        self.x_shape = (100, )
+        self.y_shape = (1, 3, 2, 100)
        self.trans_x = False
        self.trans_y = True


-class TestMatMul3(TestMatMul):
+class TestMatMuklOp3(TestMatMulV2Op):
    """
    case 3
    """

-    def init_dtype(self):
-        self.dtype = np.float16
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False


-class TestMatMul4(TestMatMul):
+class TestMatMuklOp4(TestMatMulV2Op):
    """
-    case 4 dim=3
+    case 4
    """

    def config(self):
-        self.x_shape = (2, 3, 4)
-        self.y_shape = (2, 4, 3)
+        self.x_shape = (100, )
+        self.y_shape = (1, 2, 100, 2)
        self.trans_x = False
        self.trans_y = False


-class TestMatMulNet(unittest.TestCase):
-    def _test(self, run_npu=True):
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
-        np.random.seed(SEED)
-
-        a_np = np.random.random(size=(2, 3)).astype('float32')
-        b_np = np.random.random(size=(2, 3)).astype('float32')
-        c_np = np.random.random(size=(3, 2)).astype('float32')
-        d_np = np.random.random(size=(3, 2)).astype('float32')
-        label_np = np.random.randint(2, size=(2, 1)).astype('int64')
-
-        with paddle.static.program_guard(main_prog, startup_prog):
-            a = paddle.static.data(name="a", shape=[2, 3], dtype='float32')
-            b = paddle.static.data(name="b", shape=[2, 3], dtype='float32')
-            c = paddle.static.data(name="c", shape=[3, 2], dtype='float32')
-            d = paddle.static.data(name="d", shape=[3, 2], dtype='float32')
-            label = paddle.static.data(
-                name="label", shape=[2, 1], dtype='int64')
-
-            sum_1 = paddle.add(a, b)
-            sum_2 = paddle.add(c, d)
-            result = paddle.matmul(sum_1, sum_2)
-
-            fc_1 = fluid.layers.fc(input=result, size=8)
-            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
-
-            cost = fluid.layers.cross_entropy(input=prediction, label=label)
-            loss = fluid.layers.reduce_mean(cost)
-            sgd = fluid.optimizer.SGD(learning_rate=0.01)
-            sgd.minimize(loss)
-
-        if run_npu:
-            place = paddle.NPUPlace(0)
-        else:
-            place = paddle.CPUPlace()
-        exe = paddle.static.Executor(place)
-        exe.run(startup_prog)
-
-        print("Start run on {}".format(place))
-        for epoch in range(100):
-
-            pred_res, loss_res = exe.run(main_prog,
-                                         feed={
-                                             "a": a_np,
-                                             "b": b_np,
-                                             "c": c_np,
-                                             "d": d_np,
-                                             "label": label_np
-                                         },
-                                         fetch_list=[prediction, loss])
-            if epoch % 10 == 0:
-                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
-                    epoch, pred_res[0], loss_res))
-
-        return pred_res, loss_res
-
-    def test_npu(self):
-        cpu_pred, cpu_loss = self._test(False)
-        npu_pred, npu_loss = self._test(True)
-
-        self.assertTrue(np.allclose(npu_pred, cpu_pred))
-        self.assertTrue(np.allclose(npu_loss, cpu_loss))
-
-
-# The precision is aligned in NPU and GPU separately, which is only used for the usage method.
-
-
-class TestMatMulNet3_2(unittest.TestCase):
-    def _test(self, run_npu=True):
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
-        np.random.seed(SEED)
-        self._dtype = "float32"
-
-        a_np = np.random.random(size=(2, 1, 3)).astype(self._dtype)
-        b_np = np.random.random(size=(2, 1, 3)).astype(self._dtype)
-        c_np = np.random.random(size=(3, 2)).astype(self._dtype)
-        d_np = np.random.random(size=(3, 2)).astype(self._dtype)
-        label_np = np.random.randint(2, size=(2, 1)).astype('int64')
-
-        with paddle.static.program_guard(main_prog, startup_prog):
-            a = paddle.static.data(name="a", shape=[2, 1, 3], dtype=self._dtype)
-            b = paddle.static.data(name="b", shape=[2, 1, 3], dtype=self._dtype)
-            c = paddle.static.data(name="c", shape=[3, 2], dtype=self._dtype)
-            d = paddle.static.data(name="d", shape=[3, 2], dtype=self._dtype)
-            label = paddle.static.data(
-                name="label", shape=[2, 1], dtype='int64')
-
-            sum_1 = paddle.add(a, b)
-            sum_2 = paddle.add(c, d)
-            sum_1 = paddle.cast(sum_1, 'float16')
-            sum_2 = paddle.cast(sum_2, 'float16')
-            if not run_npu:
-                sum_1 = paddle.cast(sum_1, 'float32')
-                sum_2 = paddle.cast(sum_2, 'float32')
-
-            result = paddle.matmul(sum_1, sum_2)
-            if run_npu:
-                result = paddle.cast(result, 'float32')
-
-            result = paddle.reshape(result, shape=[2, 2])
-            fc_1 = fluid.layers.fc(input=result, size=8)
-            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
-
-            cost = fluid.layers.cross_entropy(input=prediction, label=label)
-            loss = fluid.layers.reduce_mean(cost)
-            sgd = fluid.optimizer.SGD(learning_rate=0.01)
-            sgd.minimize(loss)
-
-        if run_npu:
+class TestMatMuklOp5(TestMatMulV2Op):
+    """
+    case 5
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 100, 1)
+        self.y_shape = (100, )
+        self.trans_x = True
+        self.trans_y = False
+
+
+class TestMatMuklOp6(TestMatMulV2Op):
+    """
+    case 6
+    """
+
+    def config(self):
+        self.x_shape = (1, 2, 102, 1)
+        self.y_shape = (102, )
+        self.trans_x = True
+        self.trans_y = False
+
+
+class TestMatMuklOp7(TestMatMulV2Op):
+    """
+    case 7
+    """
+
+    def config(self):
+        self.x_shape = (1, 2, 1, 100)
+        self.y_shape = (100, )
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMuklOp8(TestMatMulV2Op):
+    """
+    case 8
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 2, 100)
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMuklOp9(TestMatMulV2Op):
+    """
+    case 9
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 1, 100)
+        self.y_shape = (2, 1, 2, 100)
+        self.trans_x = False
+        self.trans_y = True
+
+
+class TestMatMuklOp10(TestMatMulV2Op):
+    """
+    case 10
+    """
+
+    def config(self):
+        self.x_shape = (1, 1, 25, 4)
+        self.y_shape = (1, 2, 4, 25)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMuklOp11(TestMatMulV2Op):
+    """
+    case 11
+    """
+
+    def config(self):
+        self.x_shape = (2, 1, 2, 100)
+        self.y_shape = (1, 1, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMuklOp12(TestMatMulV2Op):
+    """
+    case 12
+    """
+
+    def config(self):
+        self.x_shape = (2, 1, 4, 25)
+        self.y_shape = (1, 1, 4, 25)
+        self.trans_x = True
+        self.trans_y = False
+
+
+class TestMatMuklOp13(TestMatMulV2Op):
+    """
+    case 13
+    """
+
+    def config(self):
+        self.x_shape = (2, 2, 10, 10)
+        self.y_shape = (2, 2, 10, 10)
+        self.trans_x = True
+        self.trans_y = False
+
+
+class TestMatMuklOp14(TestMatMulV2Op):
+    """
+    case 14_1
+    """
+
+    def config(self):
+        self.x_shape = (3, 1, 6, 6)
+        self.y_shape = (1, 2, 6, 9)
+        self.trans_x = True
+        self.trans_y = False
+
+
+class TestMatMuklOp15(TestMatMulV2Op):
+    """
+    case 14_2
+    """
+
+    def config(self):
+        self.x_shape = (3, 1, 6, 6)
+        self.y_shape = (1, 2, 6, 9)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMuklOp16(TestMatMulV2Op):
+    """
+    case 16 : 1-D X against 5-D Y, to check the gradient for special case
+    """
+
+    def config(self):
+        self.x_shape = (100, )
+        self.y_shape = (1, 2, 2, 100, 2)
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMuklOp17(TestMatMulV2Op):
+    """
+    case 17 : 3-D X against 1-D Y, to check the gradient for special case
+    """
+
+    def config(self):
+        self.x_shape = (2, 1, 100)
+        self.y_shape = (100, )
+        self.trans_x = False
+        self.trans_y = False
+
+
+class TestMatMuklOpBroadcast1(TestMatMulV2Op):
+    """
+    case 14_3
+    """
+
+    def config(self):
+        self.x_shape = (3, 1, 10, 10)
+        self.y_shape = (1, 2, 10, 10)
+        self.trans_x = True
+        self.trans_y = True
+
+
+class TestMatMuklOpBroadcast2(TestMatMulV2Op):
+    """
+    case 14_4
+    """
+
+    def config(self):
+        self.x_shape = (3, 1, 10, 10)
+        self.y_shape = (1, 2, 10, 10)
+        self.trans_x = False
+        self.trans_y = True
+
+
+#--------------------test matmul fp16--------------------
+
+
+def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5):
+    class TestMatMulOpFp16Case(parent):
+        def init_kernel_type(self):
+            self.dtype = np.float16
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place, atol=atol)
+
+        def test_check_grad(self):
+            self.check_grad_with_place(
+                self.place, ['X', 'Y'],
+                'Out',
+                max_relative_error=max_relative_error)
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
+    TestMatMulOpFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestMatMulOpFp16Case
+
+
+create_test_fp16_class(TestMatMulV2Op)
+create_test_fp16_class(TestMatMuklOp2)
+create_test_fp16_class(TestMatMuklOp3)
+create_test_fp16_class(TestMatMuklOp4)
+create_test_fp16_class(TestMatMuklOp5)
+create_test_fp16_class(TestMatMuklOp6)
+create_test_fp16_class(TestMatMuklOp7)
+create_test_fp16_class(TestMatMuklOp8)
+create_test_fp16_class(TestMatMuklOp9)
+create_test_fp16_class(TestMatMuklOp10)
+create_test_fp16_class(TestMatMuklOp11)
+create_test_fp16_class(TestMatMuklOp12)
+create_test_fp16_class(TestMatMuklOp13)
+create_test_fp16_class(TestMatMuklOp14)
+create_test_fp16_class(TestMatMuklOp15)
+create_test_fp16_class(TestMatMuklOp16)
+create_test_fp16_class(TestMatMuklOp17)
+
+
+class TestMatMulV2API(unittest.TestCase):
+    def setUp(self):
+        self.places = [paddle.CPUPlace()]
+        if paddle.is_compiled_with_npu():
+            self.places.append(paddle.NPUPlace(0))
+
+    def check_static_result(self, place):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            input_x = fluid.data(name="input_x", shape=[4, 3], dtype="float32")
+            input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32")
+
+            result = paddle.matmul(input_x, input_y)
+
+            x_np = np.random.random([4, 3]).astype("float32")
+            y_np = np.random.random([3, 4]).astype("float32")
+
+            exe = fluid.Executor(place)
+            fetches = exe.run(fluid.default_main_program(),
+                              feed={"input_x": x_np,
+                                    "input_y": y_np},
+                              fetch_list=[result])
+
+    def test_static(self):
+        for place in self.places:
+            self.check_static_result(place=place)
+
+    def test_dygraph(self):
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                input_x = np.random.random([4, 3]).astype("float32")
+                input_y = np.random.random([3, 4]).astype("float32")
+                x = paddle.to_tensor(input_x)
+                y = paddle.to_tensor(input_y)
+                result = paddle.matmul(x, y)
+
+    def test_dygraph_fp16(self):
+        if paddle.is_compiled_with_npu():
            place = paddle.NPUPlace(0)
-        else:
-            place = paddle.CPUPlace()
-        exe = paddle.static.Executor(place)
-        exe.run(startup_prog)
-
-        print("Start run on {}".format(place))
-        for epoch in range(100):
-
-            pred_res, loss_res = exe.run(main_prog,
-                                         feed={
-                                             "a": a_np,
-                                             "b": b_np,
-                                             "c": c_np,
-                                             "d": d_np,
-                                             "label": label_np
-                                         },
-                                         fetch_list=[prediction, loss])
-            if epoch % 10 == 0:
-                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
-                    epoch, pred_res[0], loss_res))
-
-        return pred_res, loss_res
-
-    def test_npu(self):
-        cpu_pred, cpu_loss = self._test(False)
-        npu_pred, npu_loss = self._test(True)
-
-        self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-4))
-        self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-4))
+            with fluid.dygraph.guard(place):
+                input_x = np.random.random([4, 3]).astype("float16")
+                input_y = np.random.random([3, 4]).astype("float16")
+                x = paddle.to_tensor(input_x)
+                y = paddle.to_tensor(input_y)
+                result = paddle.matmul(x, y)


 if __name__ == '__main__':

--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -43,6 +43,7 @@ from ..fluid.framework import program_guard  # noqa: F401
 from ..fluid.framework import cpu_places  # noqa: F401
 from ..fluid.framework import cuda_places  # noqa: F401
 from ..fluid.framework import xpu_places  # noqa: F401
+from ..fluid.framework import npu_places  # noqa: F401
 from ..fluid.framework import Variable  # noqa: F401
 from ..fluid.layers.control_flow import Print  # noqa: F401
 from ..fluid.layers.nn import py_func  # noqa: F401
@@ -99,6 +100,7 @@ __all__ = [     #noqa
           'cpu_places',
           'cuda_places',
           'xpu_places',
+           'npu_places',
           'Variable',
           'create_global_var',
           'accuracy',

--- a/python/paddle/utils/install_check.py
+++ b/python/paddle/utils/install_check.py
@@ -74,7 +74,22 @@ def _is_cuda_available():
        return False


-def _run_dygraph_single(use_cuda):
+def _is_npu_available():
+    """
+    Check whether NPU is available.
+    """
+    try:
+        assert len(paddle.static.npu_places()) > 0
+        return True
+    except Exception as e:
+        logging.warning(
+            "You are using NPU version PaddlePaddle, but there is no NPU "
+            "detected on your machine. Maybe NPU devices is not set properly."
+            "\n Original Error is {}".format(e))
+        return False
+
+
+def _run_dygraph_single(use_cuda, use_npu):
    """
    Testing the simple network in dygraph mode using one CPU/GPU.

@@ -84,6 +99,8 @@ def _run_dygraph_single(use_cuda):
    paddle.disable_static()
    if use_cuda:
        paddle.set_device('gpu')
+    elif use_npu:
+        paddle.set_device('npu')
    else:
        paddle.set_device('cpu')
    weight_attr = paddle.ParamAttr(
@@ -102,7 +119,7 @@ def _run_dygraph_single(use_cuda):
    opt.step()


-def _run_static_single(use_cuda):
+def _run_static_single(use_cuda, use_npu):
    """
    Testing the simple network with executor running directly, using one CPU/GPU.

@@ -119,8 +136,14 @@ def _run_static_single(use_cuda):
            param_grads = paddle.static.append_backward(
                out, parameter_list=[weight.name])[0]

-        exe = paddle.static.Executor(
-            paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace())
+        if use_cuda:
+            place = paddle.CUDAPlace(0)
+        elif use_npu:
+            place = paddle.NPUPlace(0)
+        else:
+            place = paddle.CPUPlace()
+
+        exe = paddle.static.Executor(place)
        exe.run(startup_prog)
        exe.run(train_prog,
                feed={input.name: _prepare_data(1)},
@@ -128,7 +151,7 @@ def _run_static_single(use_cuda):
    paddle.disable_static()


-def _run_static_parallel(use_cuda, device_list):
+def _run_static_parallel(use_cuda, use_npu, device_list):
    """
    Testing the simple network in data parallel mode, using multiple CPU/GPU.

@@ -150,8 +173,15 @@ def _run_static_parallel(use_cuda, device_list):
            train_prog).with_data_parallel(
                loss_name=loss.name, places=device_list)

-        exe = paddle.static.Executor(
-            paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace())
+        if use_cuda:
+            place = paddle.CUDAPlace(0)
+        elif use_npu:
+            place = paddle.NPUPlace(0)
+            compiled_prog = train_prog
+        else:
+            place = paddle.CPUPlace()
+
+        exe = paddle.static.Executor(place)
        exe.run(startup_prog)
        exe.run(compiled_prog,
                feed={input.name: _prepare_data(len(device_list))},
@@ -182,23 +212,31 @@ def run_check():

    if paddle.is_compiled_with_cuda():
        use_cuda = _is_cuda_available()
+        use_npu = False
+    elif paddle.is_compiled_with_npu():
+        use_npu = _is_npu_available()
+        use_cuda = False
    else:
+        use_npu = False
        use_cuda = False

    if use_cuda:
        device_str = "GPU"
        device_list = paddle.static.cuda_places()
+    elif use_npu:
+        device_str = "NPU"
+        device_list = paddle.static.npu_places()
    else:
        device_str = "CPU"
        device_list = paddle.static.cpu_places(device_count=2)
    device_count = len(device_list)

-    _run_static_single(use_cuda)
-    _run_dygraph_single(use_cuda)
+    _run_static_single(use_cuda, use_npu)
+    _run_dygraph_single(use_cuda, use_npu)
    print("PaddlePaddle works well on 1 {}.".format(device_str))

    try:
-        _run_static_parallel(use_cuda, device_list)
+        _run_static_parallel(use_cuda, use_npu, device_list)
        print("PaddlePaddle works well on {} {}s.".format(device_count,
                                                          device_str))
        print(