From db97773b6eb77a1a9b660dfe93171be3e1caf699 Mon Sep 17 00:00:00 2001
From: ronnywang <ronny1996@163.com>
Date: Tue, 20 Sep 2022 11:41:31 +0800
Subject: [PATCH] [NPU] fix run_program_op, test=develop (#46122)

* [NPU] fix run_program_op, test=develop

* [NPU] fix matmul_v2 in cann502, test=develop
---
 paddle/fluid/framework/parallel_executor.cc | 22 ++++++++-
 paddle/fluid/operators/matmul_v2_op_npu.cc  | 52 +++++++++++++++++++++
 2 files changed, 73 insertions(+), 1 deletion(-)
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index cfb92bb178..3b4fef8bcb 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -557,6 +557,20 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
       PADDLE_THROW(platform::errors::PermissionDenied(
           "Paddle can't use IPU device since it's not compiled with IPU,"
           "Please recompile or reinstall Paddle with IPU support."));
+#endif
+    } else if (platform::is_npu_place(place)) {
+#if defined(PADDLE_WITH_ASCEND_CL)
+      if (IsFastEagerDeletionModeEnabled()) {
+        gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
+      } else {
+        gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
+      }
+      VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
+#else
+      PADDLE_THROW(platform::errors::PermissionDenied(
+          "Paddle can't use NPU device since it's not compiled with "
+          "NPU,"
+          "Please recompile or reinstall Paddle with NPU support."));
 #endif
     } else if (platform::is_custom_place(place)) {
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
@@ -1344,8 +1358,14 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo(
     device_name = "CPU";
   } else if (member_->use_device_ == p::kCUDA) {
     device_name = "CUDA";
-  } else {
+  } else if (member_->use_device_ == p::kNPU) {
+    device_name = "NPU";
+  } else if (member_->use_device_ == p::kXPU) {
     device_name = "XPU";
+  } else {
+    PADDLE_THROW(
+        platform::errors::Unavailable("Only CPU/CUDA/NPU/XPU is supportted. "
+                                      "please use CPU/CUDA/NPU/XPU backend."));
   }
 
   VLOG(1) << string::Sprintf(
diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc
index fd16f55765..291894bc30 100644
--- a/paddle/fluid/operators/matmul_v2_op_npu.cc
+++ b/paddle/fluid/operators/matmul_v2_op_npu.cc
@@ -57,6 +57,58 @@ static void MatMulND(const framework::ExecutionContext& ctx,
   runner.Run(stream);
 }
 
+#if (CANN_VERSION_CODE < 504000)
+// float16 specialization of MatMulND; the enclosing #if limits it to builds
+// with CANN_VERSION_CODE < 504000. X and Y are cast to float32 scratch
+// tensors, multiplied by "BatchMatMul", and the product is cast back to Out.
+//   ctx      supplies the place used for Out and the scratch tensors.
+//   stream   stream on which each NPU op below is run.
+//   X, Y     operands; Out gets the result (mutable_data<float16> called here).
+//   trans_x, trans_y   forwarded as the "adj_x1" / "adj_x2" attributes.
+template <>
+void MatMulND<phi::dtype::float16>(
+    const framework::ExecutionContext& ctx, const aclrtStream& stream,
+    const Tensor& X, const Tensor& Y, Tensor* Out,
+    const bool trans_x, const bool trans_y) {
+  const auto& place = ctx.GetPlace();
+  Out->mutable_data<phi::dtype::float16>(place);
+
+  // float32 scratch tensors with the dims of X, Y and Out respectively.
+  Tensor lhs_f32;
+  lhs_f32.Resize(X.dims());
+  lhs_f32.mutable_data<float>(place);
+  Tensor rhs_f32;
+  rhs_f32.Resize(Y.dims());
+  rhs_f32.mutable_data<float>(place);
+  Tensor prod_f32;
+  prod_f32.Resize(Out->dims());
+  prod_f32.mutable_data<float>(place);
+
+  // Runs a "Cast" op from `src` into `dst`; "dst_type" is taken from the
+  // dtype `dst` already carries.
+  const auto cast_into = [&stream](const Tensor& src, const Tensor& dst) {
+    const int dst_type = static_cast<int>(
+        ConvertToNpuDtype(framework::TransToProtoVarType(dst.type())));
+    const auto& cast_runner =
+        NpuOpRunner("Cast", {src}, {dst}, {{"dst_type", dst_type}});
+    cast_runner.Run(stream);
+  };
+
+  cast_into(X, lhs_f32);
+  cast_into(Y, rhs_f32);
+
+  // Multiply in float32, then cast the product back into Out's dtype.
+  const auto& matmul_runner =
+      NpuOpRunner("BatchMatMul",
+                  {lhs_f32, rhs_f32},
+                  {prod_f32},
+                  {{"adj_x1", trans_x}, {"adj_x2", trans_y}});
+  matmul_runner.Run(stream);
+
+  cast_into(prod_f32, *Out);
+}
+#endif
+
 template <typename T>
 static void ReduceDims(const framework::ExecutionContext& ctx,
                        const aclrtStream& stream,
-- 
GitLab