未验证 提交 db97773b 编写于 作者: R ronnywang 提交者: GitHub

[NPU] fix run_program_op, test=develop (#46122)

* [NPU] fix run_program_op, test=develop

* [NPU] fix matmul_v2 in cann502, test=develop
上级 192e7ccf
......@@ -557,6 +557,20 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use IPU device since it's not compiled with IPU,"
"Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use NPU device since it's not compiled with "
"NPU,"
"Please recompile or reinstall Paddle with NPU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
......@@ -1344,8 +1358,14 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo(
device_name = "CPU";
} else if (member_->use_device_ == p::kCUDA) {
device_name = "CUDA";
} else {
} else if (member_->use_device_ == p::kNPU) {
device_name = "NPU";
} else if (member_->use_device_ == p::kXPU) {
device_name = "XPU";
} else {
PADDLE_THROW(
platform::errors::Unavailable("Only CPU/CUDA/NPU/XPU is supportted. "
"please use CPU/CUDA/NPU/XPU backend."));
}
VLOG(1) << string::Sprintf(
......
......@@ -57,6 +57,58 @@ static void MatMulND(const framework::ExecutionContext& ctx,
runner.Run(stream);
}
#if (CANN_VERSION_CODE < 504000)
// Explicit float16 specialisation of MatMulND (only compiled under the
// enclosing CANN_VERSION_CODE < 504000 guard).
//
// Instead of feeding the half-precision tensors to BatchMatMul directly, this
// version up-casts X and Y into float scratch tensors, multiplies those, and
// then casts the float product back into Out.
// NOTE(review): presumably a workaround for fp16 BatchMatMul behaviour on
// older CANN releases -- confirm against the CANN release notes.
//
//   ctx      execution context; only its place is used, for allocations.
//   stream   stream on which every NPU op below is launched.
//   X, Y     left / right operands of the product.
//   Out      destination tensor; its dims are read (not set) here and its
//            buffer is allocated as float16.
//   trans_x  forwarded to BatchMatMul as attribute "adj_x1".
//   trans_y  forwarded to BatchMatMul as attribute "adj_x2".
template <>
void MatMulND<phi::dtype::float16>(const framework::ExecutionContext& ctx,
                                   const aclrtStream& stream,
                                   const Tensor& X,
                                   const Tensor& Y,
                                   Tensor* Out,
                                   const bool trans_x,
                                   const bool trans_y) {
  const auto& place = ctx.GetPlace();

  // Value for the Cast op's "dst_type" attribute, derived from the tensor the
  // cast writes into.
  const auto npu_dtype_of = [](const Tensor& t) {
    return static_cast<int>(
        ConvertToNpuDtype(framework::TransToProtoVarType(t.type())));
  };

  // The real (float16) output buffer is allocated before any scratch space.
  Out->mutable_data<phi::dtype::float16>(place);

  // Float scratch tensors mirroring the shapes of X, Y and Out.
  Tensor lhs_f32, rhs_f32, prod_f32;
  lhs_f32.Resize(X.dims());
  rhs_f32.Resize(Y.dims());
  prod_f32.Resize(Out->dims());
  lhs_f32.mutable_data<float>(place);
  rhs_f32.mutable_data<float>(place);
  prod_f32.mutable_data<float>(place);

  // X -> float
  const NpuOpRunner widen_lhs(
      "Cast", {X}, {lhs_f32}, {{"dst_type", npu_dtype_of(lhs_f32)}});
  widen_lhs.Run(stream);

  // Y -> float
  const NpuOpRunner widen_rhs(
      "Cast", {Y}, {rhs_f32}, {{"dst_type", npu_dtype_of(rhs_f32)}});
  widen_rhs.Run(stream);

  // Batched product on the float copies.
  const NpuOpRunner batch_matmul("BatchMatMul",
                                 {lhs_f32, rhs_f32},
                                 {prod_f32},
                                 {{"adj_x1", trans_x}, {"adj_x2", trans_y}});
  batch_matmul.Run(stream);

  // Float product -> Out (in Out's own dtype).
  const NpuOpRunner narrow_out(
      "Cast", {prod_f32}, {*Out}, {{"dst_type", npu_dtype_of(*Out)}});
  narrow_out.Run(stream);
}
#endif
template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册