From db97773b6eb77a1a9b660dfe93171be3e1caf699 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Tue, 20 Sep 2022 11:41:31 +0800 Subject: [PATCH] [NPU] fix run_program_op, test=develop (#46122) * [NPU] fix run_program_op, test=develop * [NPU] fix matmul_v2 in cann502, test=develop --- paddle/fluid/framework/parallel_executor.cc | 22 ++++++++- paddle/fluid/operators/matmul_v2_op_npu.cc | 52 +++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index cfb92bb178..3b4fef8bcb 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -557,6 +557,20 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use IPU device since it's not compiled with IPU," "Please recompile or reinstall Paddle with IPU support.")); +#endif + } else if (platform::is_npu_place(place)) { +#if defined(PADDLE_WITH_ASCEND_CL) + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size)); + } else { + gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size)); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use NPU device since it's not compiled with " + "NPU," + "Please recompile or reinstall Paddle with NPU support.")); #endif } else if (platform::is_custom_place(place)) { #if defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -1344,8 +1358,14 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( device_name = "CPU"; } else if (member_->use_device_ == p::kCUDA) { device_name = "CUDA"; - } else { + } else if (member_->use_device_ == p::kNPU) { + device_name = "NPU"; + } else if (member_->use_device_ == p::kXPU) { device_name = "XPU"; + } else { + PADDLE_THROW( + platform::errors::Unavailable("Only CPU/CUDA/NPU/XPU is supportted. " + "please use CPU/CUDA/NPU/XPU backend.")); } VLOG(1) << string::Sprintf( diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index fd16f55765..291894bc30 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -57,6 +57,58 @@ static void MatMulND(const framework::ExecutionContext& ctx, runner.Run(stream); } +#if (CANN_VERSION_CODE < 504000) +template <> +void MatMulND(const framework::ExecutionContext& ctx, + const aclrtStream& stream, + const Tensor& X, + const Tensor& Y, + Tensor* Out, + const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + Tensor x_fp32, y_fp32, out_fp32; + x_fp32.Resize(X.dims()); + y_fp32.Resize(Y.dims()); + out_fp32.Resize(Out->dims()); + x_fp32.mutable_data(ctx.GetPlace()); + y_fp32.mutable_data(ctx.GetPlace()); + out_fp32.mutable_data(ctx.GetPlace()); + + const auto& cast_x = + NpuOpRunner("Cast", + {X}, + {x_fp32}, + {{"dst_type", + static_cast(ConvertToNpuDtype( + framework::TransToProtoVarType(x_fp32.type())))}}); + cast_x.Run(stream); + const auto& cast_y = + NpuOpRunner("Cast", + {Y}, + {y_fp32}, + {{"dst_type", + static_cast(ConvertToNpuDtype( + framework::TransToProtoVarType(y_fp32.type())))}}); + cast_y.Run(stream); + + const auto& runner = NpuOpRunner("BatchMatMul", + {x_fp32, y_fp32}, + {out_fp32}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + runner.Run(stream); + + const auto& cast_out = NpuOpRunner( + "Cast", + {out_fp32}, + {*Out}, + {{"dst_type", + static_cast( + ConvertToNpuDtype(framework::TransToProtoVarType(Out->type())))}}); + cast_out.Run(stream); +} +#endif + template static void ReduceDims(const framework::ExecutionContext& ctx, const aclrtStream& stream, -- GitLab