未验证 提交 8259d9bf 编写于 作者: L Leo Chen 提交者: GitHub

[NPU] refine NpuOpRunner (#32869)

* refine ~npuOpRunner

* implement destructor and forbid copy

* use reference to avoid copy

* use const reference

* relax adam precision

* fix top_k
上级 78ecb668
...@@ -35,10 +35,10 @@ class PowNPUKernel : public framework::OpKernel<T> { ...@@ -35,10 +35,10 @@ class PowNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Power", {*x}, {*out}, const auto& runner = NpuOpRunner("Power", {*x}, {*out},
{{"power", factor}, {{"power", factor},
{"scale", static_cast<float>(1.0)}, {"scale", static_cast<float>(1.0)},
{"shift", static_cast<float>(0.0)}}); {"shift", static_cast<float>(0.0)}});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -68,8 +68,8 @@ class PowGradNPUKernel : public framework::OpKernel<T> { ...@@ -68,8 +68,8 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
// Step1: Compute x_pow = x.pow(factor-1) // Step1: Compute x_pow = x.pow(factor-1)
Tensor x_pow(x->type()); Tensor x_pow(x->type());
x_pow.mutable_data<T>(x->dims(), place); x_pow.mutable_data<T>(x->dims(), place);
auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, const auto& runner_pow = NpuOpRunner(
{{"power", factor - static_cast<float>(1)}}); "Power", {*x}, {x_pow}, {{"power", factor - static_cast<float>(1)}});
runner_pow.Run(stream); runner_pow.Run(stream);
// Step 2: Construct a broadcast factor, which has the same shape with x. // Step 2: Construct a broadcast factor, which has the same shape with x.
...@@ -83,20 +83,21 @@ class PowGradNPUKernel : public framework::OpKernel<T> { ...@@ -83,20 +83,21 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
// factor. // factor.
Tensor factor_bc_tensor(framework::proto::VarType::FP32); Tensor factor_bc_tensor(framework::proto::VarType::FP32);
factor_bc_tensor.mutable_data<float>(x_dims, place); factor_bc_tensor.mutable_data<float>(x_dims, place);
auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, const auto& runner_bc =
{{"dims", framework::vectorize(x_dims)}}); NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor},
{{"dims", framework::vectorize(x_dims)}});
runner_bc.Run(stream); runner_bc.Run(stream);
// Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1)
Tensor x_power_mul_factor(x->type()); Tensor x_power_mul_factor(x->type());
x_power_mul_factor.mutable_data<T>(x->dims(), place); x_power_mul_factor.mutable_data<T>(x->dims(), place);
auto runner_mul_1 = const auto& runner_mul_1 =
NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {});
runner_mul_1.Run(stream); runner_mul_1.Run(stream);
// Step 4: Compute dx = dout * factor * x.pow(factor-1) // Step 4: Compute dx = dout * factor * x.pow(factor-1)
dx->mutable_data<T>(place); dx->mutable_data<T>(place);
auto runner_mul_2 = const auto& runner_mul_2 =
NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {});
runner_mul_2.Run(stream); runner_mul_2.Run(stream);
} }
...@@ -111,11 +112,11 @@ class ReluNPUKernel : public framework::OpKernel<T> { ...@@ -111,11 +112,11 @@ class ReluNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Relu", const auto& runner = NpuOpRunner("Relu",
{ {
*x, *x,
}, },
{*out}, {}); {*out}, {});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -137,7 +138,7 @@ class ReluGradNPUKernel : public framework::OpKernel<T> { ...@@ -137,7 +138,7 @@ class ReluGradNPUKernel : public framework::OpKernel<T> {
.stream(); .stream();
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {});
runner.Run(stream); runner.Run(stream);
} }
...@@ -159,7 +160,7 @@ class SqrtNPUKernel : public framework::OpKernel<T> { ...@@ -159,7 +160,7 @@ class SqrtNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -181,8 +182,8 @@ class SqrtGradNPUKernel : public framework::OpKernel<T> { ...@@ -181,8 +182,8 @@ class SqrtGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {});
dx_runner.Run(stream); runner_dx.Run(stream);
} }
}; };
...@@ -204,16 +205,16 @@ class LogNPUKernel : public framework::OpKernel<T> { ...@@ -204,16 +205,16 @@ class LogNPUKernel : public framework::OpKernel<T> {
Tensor one(x->type()); Tensor one(x->type());
one.mutable_data<T>(x->dims(), place); one.mutable_data<T>(x->dims(), place);
auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {});
one_runner.Run(stream); runner_one.Run(stream);
Tensor sub(x->type()); Tensor sub(x->type());
sub.mutable_data<T>(x->dims(), place); sub.mutable_data<T>(x->dims(), place);
auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {});
sub_runner.Run(stream); runner_sub.Run(stream);
auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {});
out_runner.Run(stream); runner_out.Run(stream);
} }
}; };
...@@ -233,7 +234,7 @@ class LogGradNPUKernel : public framework::OpKernel<T> { ...@@ -233,7 +234,7 @@ class LogGradNPUKernel : public framework::OpKernel<T> {
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -254,7 +255,7 @@ class TanhNPUKernel : public framework::OpKernel<T> { ...@@ -254,7 +255,7 @@ class TanhNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -276,8 +277,8 @@ class TanhGradNPUKernel : public framework::OpKernel<T> { ...@@ -276,8 +277,8 @@ class TanhGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {});
dx_runner.Run(stream); runner_dx.Run(stream);
} }
}; };
...@@ -297,7 +298,7 @@ class SquareNPUKernel : public framework::OpKernel<T> { ...@@ -297,7 +298,7 @@ class SquareNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -29,7 +29,8 @@ class AllocFloatStatusKernel : public framework::OpKernel<T> { ...@@ -29,7 +29,8 @@ class AllocFloatStatusKernel : public framework::OpKernel<T> {
auto* float_status = ctx.Output<framework::Tensor>("FloatStatus"); auto* float_status = ctx.Output<framework::Tensor>("FloatStatus");
float_status->mutable_data<T>(ctx.GetPlace()); float_status->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); const auto& runner =
NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
......
...@@ -58,7 +58,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -58,7 +58,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
Tensor inverse_out(scale->type()); Tensor inverse_out(scale->type());
inverse_out.Resize(scale->dims()); inverse_out.Resize(scale->dims());
inverse_out.mutable_data<T>(ctx.GetPlace()); inverse_out.mutable_data<T>(ctx.GetPlace());
auto runner_inverse = const auto& runner_inverse =
NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {});
runner_inverse.Run(stream); runner_inverse.Run(stream);
tmp_inverse_out = &inverse_out; tmp_inverse_out = &inverse_out;
...@@ -69,14 +69,14 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -69,14 +69,14 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
// NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place.
// tmp is only placeholder. // tmp is only placeholder.
auto runner_float_status = const auto& runner_float_status =
NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp},
{{"message", std::string("check_nan_and_inf")}}); {{"message", std::string("check_nan_and_inf")}});
runner_float_status.Run(stream); runner_float_status.Run(stream);
Tensor sum; Tensor sum;
sum.mutable_data<float>({1}, ctx.GetPlace()); sum.mutable_data<float>({1}, ctx.GetPlace());
auto runner_reduce_sum = const auto& runner_reduce_sum =
NpuOpRunner("ReduceSumD", {*float_status}, {sum}, NpuOpRunner("ReduceSumD", {*float_status}, {sum},
{{"axes", std::vector<int>{0}}, {"keep_dims", true}}); {{"axes", std::vector<int>{0}}, {"keep_dims", true}});
runner_reduce_sum.Run(stream); runner_reduce_sum.Run(stream);
...@@ -95,7 +95,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -95,7 +95,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
if (!found_inf_data) { if (!found_inf_data) {
// MatMul // MatMul
auto runner_matmul = const auto& runner_matmul =
NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
runner_matmul.Run(stream); runner_matmul.Run(stream);
} }
...@@ -114,7 +114,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -114,7 +114,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<platform::DeviceContext>(), found_inf); ctx.template device_context<platform::DeviceContext>(), found_inf);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait(); ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
auto runner_clear_status = const auto& runner_clear_status =
NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
runner_clear_status.Run(stream); runner_clear_status.Run(stream);
} }
......
...@@ -43,18 +43,18 @@ void Update(const platform::NPUDeviceContext& ctx, ...@@ -43,18 +43,18 @@ void Update(const platform::NPUDeviceContext& ctx,
Tensor factor_tensor(bad_out_tensor->type()); Tensor factor_tensor(bad_out_tensor->type());
factor_tensor.mutable_data<int>({1}, place); factor_tensor.mutable_data<int>({1}, place);
FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1)); FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, const auto& runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor},
{*bad_out_tensor}, {}); {*bad_out_tensor}, {});
runner_p2.Run(stream); runner_p2.Run(stream);
std::vector<int> bad_out_data; std::vector<int> bad_out_data;
TensorToVector(*bad_out_tensor, ctx, &bad_out_data); TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
if (bad_out_data[0] == decr_every_n_nan_or_inf) { if (bad_out_data[0] == decr_every_n_nan_or_inf) {
auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)}, {{"power", static_cast<float>(1)},
{"scale", decr_ratio}, {"scale", decr_ratio},
{"shift", static_cast<float>(0)}}); {"shift", static_cast<float>(0)}});
runner_p3.Run(stream); runner_p3.Run(stream);
...@@ -62,11 +62,11 @@ void Update(const platform::NPUDeviceContext& ctx, ...@@ -62,11 +62,11 @@ void Update(const platform::NPUDeviceContext& ctx,
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
if (new_loss_scaling[0] < static_cast<T>(1)) { if (new_loss_scaling[0] < static_cast<T>(1)) {
// updated_loss_scaling_data = 1 // updated_loss_scaling_data = 1
auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)}, {{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)}, {"scale", static_cast<float>(0)},
{"shift", static_cast<float>(1)}}); {"shift", static_cast<float>(1)}});
runner_p4.Run(stream); runner_p4.Run(stream);
} }
...@@ -86,30 +86,30 @@ void Update(const platform::NPUDeviceContext& ctx, ...@@ -86,30 +86,30 @@ void Update(const platform::NPUDeviceContext& ctx,
Tensor factor_tensor(good_out_tensor->type()); Tensor factor_tensor(good_out_tensor->type());
factor_tensor.mutable_data<int>({1}, place); factor_tensor.mutable_data<int>({1}, place);
FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1)); FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, const auto& runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
{*good_out_tensor}, {}); {*good_out_tensor}, {});
runner_p2.Run(stream); runner_p2.Run(stream);
std::vector<int> good_out_data; std::vector<int> good_out_data;
TensorToVector(*good_out_tensor, ctx, &good_out_data); TensorToVector(*good_out_tensor, ctx, &good_out_data);
if (good_out_data[0] == incr_every_n_steps) { if (good_out_data[0] == incr_every_n_steps) {
auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)}, {{"power", static_cast<float>(1)},
{"scale", incr_ratio}, {"scale", incr_ratio},
{"shift", static_cast<float>(0)}}); {"shift", static_cast<float>(0)}});
runner_p3.Run(stream); runner_p3.Run(stream);
std::vector<T> new_loss_scaling; std::vector<T> new_loss_scaling;
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
if (!std::isfinite(new_loss_scaling[0])) { if (!std::isfinite(new_loss_scaling[0])) {
// updated_loss_scaling_data = pre_loss_scaling_data // updated_loss_scaling_data = pre_loss_scaling_data
auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)}, {{"power", static_cast<float>(1)},
{"scale", static_cast<float>(1)}, {"scale", static_cast<float>(1)},
{"shift", static_cast<float>(0)}}); {"shift", static_cast<float>(0)}});
runner_p4.Run(stream); runner_p4.Run(stream);
} }
...@@ -165,7 +165,7 @@ class LazyZerosNPU { ...@@ -165,7 +165,7 @@ class LazyZerosNPU {
} }
zero_tensor->mutable_data<T>(place); zero_tensor->mutable_data<T>(place);
auto runner_zeros = const auto& runner_zeros =
NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor});
runner_zeros.Run(stream); runner_zeros.Run(stream);
zero_tensor->check_memory_size(); zero_tensor->check_memory_size();
......
...@@ -43,7 +43,7 @@ class AssignNPUKernel : public framework::OpKernel<T> { ...@@ -43,7 +43,7 @@ class AssignNPUKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<framework::LoDTensor>("Out"); auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
......
...@@ -78,8 +78,8 @@ class CastNPUKernel : public framework::OpKernel<T> { ...@@ -78,8 +78,8 @@ class CastNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Cast", {*x}, {*out}, const auto& runner = NpuOpRunner(
{{"dst_type", static_cast<int32_t>(aclDtype)}}); "Cast", {*x}, {*out}, {{"dst_type", static_cast<int32_t>(aclDtype)}});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -52,9 +52,11 @@ class ConcatNPUKernel : public framework::OpKernel<T> { ...@@ -52,9 +52,11 @@ class ConcatNPUKernel : public framework::OpKernel<T> {
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner( NpuOpRunner runner{
"ConcatD", {inputs}, {*out}, "ConcatD",
{{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}); {inputs},
{*out},
{{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}};
runner.AddInputNames(names); runner.AddInputNames(names);
runner.Run(stream); runner.Run(stream);
} }
...@@ -101,8 +103,9 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> { ...@@ -101,8 +103,9 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
sizes.push_back(ins[j]->dims()[dim]); sizes.push_back(ins[j]->dims()[dim]);
} }
} }
auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, const auto& runner =
{{"offsets", offsets}, {"size", sizes}}); NpuOpRunner("SliceD", {*out_grad}, {*outs[j]},
{{"offsets", offsets}, {"size", sizes}});
runner.Run(stream); runner.Run(stream);
} }
if (ins[j]->numel() != 0UL) { if (ins[j]->numel() != 0UL) {
......
...@@ -34,7 +34,7 @@ class EqualNPUKernel : public framework::OpKernel<T> { ...@@ -34,7 +34,7 @@ class EqualNPUKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<framework::LoDTensor>("Out"); auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace()); out->mutable_data<bool>(ctx.GetPlace());
auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); const auto& runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
...@@ -51,7 +51,7 @@ class LessThanNPUKernel : public framework::OpKernel<T> { ...@@ -51,7 +51,7 @@ class LessThanNPUKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<framework::LoDTensor>("Out"); auto* z = ctx.Output<framework::LoDTensor>("Out");
// int axis = context.Attr<int>("axis"); // int axis = context.Attr<int>("axis");
z->mutable_data<bool>(ctx.GetPlace()); // allocate z->mutable_data<bool>(ctx.GetPlace()); // allocate
auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); const auto& runner = NpuOpRunner("Less", {*x, *y}, {*z});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
......
...@@ -40,7 +40,7 @@ class LogicalNotNPUKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,7 @@ class LogicalNotNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -32,7 +32,7 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> { ...@@ -32,7 +32,7 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<framework::LoDTensor>("Out"); auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
...@@ -82,8 +82,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> { ...@@ -82,8 +82,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
} }
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace()); reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, const auto& runner =
{{"axes", axes}, {"keep_dims", false}}); NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream); runner.Run(stream);
tmp_dout = &reduced_dout; tmp_dout = &reduced_dout;
} }
...@@ -96,8 +97,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> { ...@@ -96,8 +97,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
} }
} }
if (axes.size() != 0) { if (axes.size() != 0) {
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
{{"axes", axes}, {"keep_dims", true}}); {{"axes", axes}, {"keep_dims", true}});
runner.Run(stream); runner.Run(stream);
} else { } else {
framework::TensorCopy( framework::TensorCopy(
...@@ -123,8 +124,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> { ...@@ -123,8 +124,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
} }
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace()); reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, const auto& runner =
{{"axes", axes}, {"keep_dims", false}}); NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream); runner.Run(stream);
tmp_dout = &reduced_dout; tmp_dout = &reduced_dout;
} }
...@@ -138,8 +140,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> { ...@@ -138,8 +140,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
} }
if (axes.size() != 0) { if (axes.size() != 0) {
dy->mutable_data<T>(ctx.GetPlace()); dy->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy}, const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy},
{{"axes", axes}, {"keep_dims", true}}); {{"axes", axes}, {"keep_dims", true}});
runner.Run(stream); runner.Run(stream);
} else { } else {
framework::TensorCopy( framework::TensorCopy(
......
...@@ -40,7 +40,7 @@ class ElementwiseDivNPUKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,7 @@ class ElementwiseDivNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Div", {*x, *y}, {*out}, {}); const auto& runner = NpuOpRunner("Div", {*x, *y}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -65,46 +65,47 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> { ...@@ -65,46 +65,47 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
Tensor y_power(y->type()); Tensor y_power(y->type());
y_power.mutable_data<T>(y->dims(), place); y_power.mutable_data<T>(y->dims(), place);
auto y_power_runner = NpuOpRunner("Power", {*y}, {y_power}, const auto& runner_y_power = NpuOpRunner(
{{"power", static_cast<float>(-1)}}); "Power", {*y}, {y_power}, {{"power", static_cast<float>(-1)}});
y_power_runner.Run(stream); runner_y_power.Run(stream);
if (dx) { if (dx) {
dx->mutable_data<T>(place); dx->mutable_data<T>(place);
Tensor tensor_zeros(x->type()); Tensor tensor_zeros(x->type());
tensor_zeros.mutable_data<T>(x->dims(), place); tensor_zeros.mutable_data<T>(x->dims(), place);
auto tensor_zeros_runner = const auto& runner_tensor_zeros =
NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {}); NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {});
tensor_zeros_runner.Run(stream); runner_tensor_zeros.Run(stream);
Tensor x_zero(paddle::framework::proto::VarType::BOOL); Tensor x_zero(paddle::framework::proto::VarType::BOOL);
x_zero.mutable_data<bool>(x->dims(), place); x_zero.mutable_data<bool>(x->dims(), place);
auto x_zero_runner = const auto& runner_x_zero =
NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {}); NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {});
x_zero_runner.Run(stream); runner_x_zero.Run(stream);
Tensor x_nozero(paddle::framework::proto::VarType::BOOL); Tensor x_nozero(paddle::framework::proto::VarType::BOOL);
x_nozero.mutable_data<bool>(x->dims(), place); x_nozero.mutable_data<bool>(x->dims(), place);
auto x_nozero_runner = const auto& runner_x_nonzero =
NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {}); NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {});
x_nozero_runner.Run(stream); runner_x_nonzero.Run(stream);
Tensor x_nozero_f(x->type()); Tensor x_nozero_f(x->type());
x_nozero_f.mutable_data<T>(x->dims(), place); x_nozero_f.mutable_data<T>(x->dims(), place);
auto x_nozero_f_runner = const auto& runner_x_nonzero_f =
NpuOpRunner("Cast", {x_nozero}, {x_nozero_f}, NpuOpRunner("Cast", {x_nozero}, {x_nozero_f},
{{"dst_type", static_cast<int32_t>(0)}}); {{"dst_type", static_cast<int32_t>(0)}});
x_nozero_f_runner.Run(stream); runner_x_nonzero_f.Run(stream);
Tensor x_grad_w(x->type()); Tensor x_grad_w(x->type());
x_grad_w.mutable_data<T>(x->dims(), place); x_grad_w.mutable_data<T>(x->dims(), place);
auto x_grad_w_runner = const auto& runner_x_grad_w =
NpuOpRunner("Mul", {x_nozero_f, y_power}, {x_grad_w}, {}); NpuOpRunner("Mul", {x_nozero_f, y_power}, {x_grad_w}, {});
x_grad_w_runner.Run(stream); runner_x_grad_w.Run(stream);
auto x_grad_runner = NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {}); const auto& runner_x_grad =
x_grad_runner.Run(stream); NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {});
runner_x_grad.Run(stream);
} }
if (dy) { if (dy) {
...@@ -112,16 +113,18 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> { ...@@ -112,16 +113,18 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
Tensor neg_out(y->type()); Tensor neg_out(y->type());
neg_out.mutable_data<T>(y->dims(), place); neg_out.mutable_data<T>(y->dims(), place);
auto neg_out_runner = NpuOpRunner("Neg", {*out}, {neg_out}, {}); const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {});
neg_out_runner.Run(stream); runner_neg_out.Run(stream);
Tensor y_grad_w(y->type()); Tensor y_grad_w(y->type());
y_grad_w.mutable_data<T>(y->dims(), place); y_grad_w.mutable_data<T>(y->dims(), place);
auto y_grad_w_runner = NpuOpRunner("Div", {neg_out, *y}, {y_grad_w}, {}); const auto& runner_y_grad_w =
y_grad_w_runner.Run(stream); NpuOpRunner("Div", {neg_out, *y}, {y_grad_w}, {});
runner_y_grad_w.Run(stream);
auto y_grad_runner = NpuOpRunner("Mul", {y_grad_w, *dout}, {*dy}, {}); const auto& runner_y_grad =
y_grad_runner.Run(stream); NpuOpRunner("Mul", {y_grad_w, *dout}, {*dy}, {});
runner_y_grad.Run(stream);
} }
} }
}; };
......
...@@ -37,7 +37,7 @@ class ElementwiseFloorDivNPUKernel : public framework::OpKernel<T> { ...@@ -37,7 +37,7 @@ class ElementwiseFloorDivNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {}); const auto& runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -40,7 +40,7 @@ class ElementwiseMaxNPUKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,7 @@ class ElementwiseMaxNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -40,7 +40,7 @@ class ElementwiseMinNPUKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,7 @@ class ElementwiseMinNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {}); const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -41,7 +41,7 @@ class ElementwiseMulNPUKernel : public framework::OpKernel<T> { ...@@ -41,7 +41,7 @@ class ElementwiseMulNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -65,14 +65,14 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel<T> { ...@@ -65,14 +65,14 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel<T> {
if (dx) { if (dx) {
dx->mutable_data<T>(place); dx->mutable_data<T>(place);
auto dx_runner = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); const auto& runner_dx = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {});
dx_runner.Run(stream); runner_dx.Run(stream);
} }
if (dy) { if (dy) {
dy->mutable_data<T>(place); dy->mutable_data<T>(place);
auto dy_runner = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {}); const auto& runner_dy = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {});
dy_runner.Run(stream); runner_dy.Run(stream);
} }
} }
}; };
......
...@@ -40,7 +40,7 @@ class ElementwisePowNPUKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,7 @@ class ElementwisePowNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -33,7 +33,7 @@ class ElementwiseSubNPUKernel : public framework::OpKernel<T> { ...@@ -33,7 +33,7 @@ class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {}); const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -84,8 +84,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> { ...@@ -84,8 +84,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
} }
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace()); reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, const auto& runner =
{{"axes", axes}, {"keep_dims", false}}); NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream); runner.Run(stream);
tmp_dout = &reduced_dout; tmp_dout = &reduced_dout;
} }
...@@ -98,8 +99,8 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> { ...@@ -98,8 +99,8 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
} }
} }
if (axes.size() != 0) { if (axes.size() != 0) {
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
{{"axes", axes}, {"keep_dims", true}}); {{"axes", axes}, {"keep_dims", true}});
runner.Run(stream); runner.Run(stream);
} else { } else {
framework::TensorCopy( framework::TensorCopy(
...@@ -127,8 +128,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> { ...@@ -127,8 +128,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
} }
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace()); reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, const auto& runner =
{{"axes", axes}, {"keep_dims", false}}); NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream); runner.Run(stream);
tmp_dout = &reduced_dout; tmp_dout = &reduced_dout;
} }
...@@ -144,14 +146,15 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> { ...@@ -144,14 +146,15 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
if (axes.size() != 0) { if (axes.size() != 0) {
reduced_dy.Resize(dy->dims()); reduced_dy.Resize(dy->dims());
reduced_dy.mutable_data<T>(ctx.GetPlace()); reduced_dy.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy}, const auto& runner =
{{"axes", axes}, {"keep_dims", true}}); NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream); runner.Run(stream);
tmp_dy = &reduced_dy; tmp_dy = &reduced_dy;
} }
// stage 3, negative // stage 3, negative
auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); const auto& runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
runner.Run(stream); runner.Run(stream);
} }
} }
......
...@@ -11,7 +11,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -11,7 +11,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -65,7 +64,7 @@ class ExpandNPUKernel : public framework::OpKernel<T> { ...@@ -65,7 +64,7 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
out0->Resize(out_dims); out0->Resize(out_dims);
out0->mutable_data<T>(context.device_context().GetPlace()); out0->mutable_data<T>(context.device_context().GetPlace());
auto runner = const auto& runner =
NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
auto stream = auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>() context.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -82,5 +81,3 @@ REGISTER_OP_NPU_KERNEL( ...@@ -82,5 +81,3 @@ REGISTER_OP_NPU_KERNEL(
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, int>, ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>); paddle::platform::float16>);
#endif
...@@ -68,8 +68,8 @@ class FillConstantNPUKernel : public framework::OpKernel<T> { ...@@ -68,8 +68,8 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
FillNpuTensorWithConstant<T>(&tensor_tmp, value); FillNpuTensorWithConstant<T>(&tensor_tmp, value);
out_var->mutable_data<T>(shape, place); out_var->mutable_data<T>(shape, place);
auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
{{"dims", framework::vectorize(shape)}}); {{"dims", framework::vectorize(shape)}});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -33,8 +33,8 @@ class GatherOpNPUKernel : public framework::OpKernel<T> { ...@@ -33,8 +33,8 @@ class GatherOpNPUKernel : public framework::OpKernel<T> {
auto *out = ctx.Output<Tensor>("Out"); auto *out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Gather", {*x, *index}, {*out}, const auto &runner = NpuOpRunner("Gather", {*x, *index}, {*out},
{{"validate_indices", true}}); {{"validate_indices", true}});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
...@@ -75,7 +75,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> { ...@@ -75,7 +75,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
zeroslike_xout.numel() * sizeof(T), stream); zeroslike_xout.numel() * sizeof(T), stream);
// step3: scatter(x_grad) // step3: scatter(x_grad)
auto runner_scatter = NpuOpRunner( const auto &runner_scatter = NpuOpRunner(
"TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {}); "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {});
runner_scatter.Run(stream); runner_scatter.Run(stream);
} }
......
...@@ -39,7 +39,7 @@ class GeluNPUKernel : public framework::OpKernel<T> { ...@@ -39,7 +39,7 @@ class GeluNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Gelu", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Gelu", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -63,11 +63,12 @@ class GeluGradNPUKernel : public framework::OpKernel<T> { ...@@ -63,11 +63,12 @@ class GeluGradNPUKernel : public framework::OpKernel<T> {
Tensor out(x->type()); Tensor out(x->type());
out.mutable_data<T>(x->dims(), place); out.mutable_data<T>(x->dims(), place);
auto out_runner = NpuOpRunner("Gelu", {*x}, {out}, {}); const auto& runner_out = NpuOpRunner("Gelu", {*x}, {out}, {});
out_runner.Run(stream); runner_out.Run(stream);
auto dx_runner = NpuOpRunner("GeluGrad", {*dout, *x, out}, {*dx}, {}); const auto& runner_dx =
dx_runner.Run(stream); NpuOpRunner("GeluGrad", {*dout, *x, out}, {*dx}, {});
runner_dx.Run(stream);
} }
}; };
......
...@@ -43,7 +43,7 @@ class IncrementalNPUKernel : public framework::OpKernel<T> { ...@@ -43,7 +43,7 @@ class IncrementalNPUKernel : public framework::OpKernel<T> {
step_tensor.mutable_data<T>({1}, context.GetPlace()); step_tensor.mutable_data<T>({1}, context.GetPlace());
FillNpuTensorWithConstant<T>(&step_tensor, static_cast<T>(step)); FillNpuTensorWithConstant<T>(&step_tensor, static_cast<T>(step));
auto runner = const auto& runner =
NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {}); NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {});
auto stream = auto stream =
......
...@@ -81,7 +81,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> { ...@@ -81,7 +81,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
Tensor value(x->type()); Tensor value(x->type());
value.mutable_data<T>({1}, place); value.mutable_data<T>({1}, place);
FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0)); FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
auto runner = const auto& runner =
NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
runner.Run(stream); runner.Run(stream);
scale = &default_scale; scale = &default_scale;
...@@ -95,7 +95,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> { ...@@ -95,7 +95,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
Tensor value(x->type()); Tensor value(x->type());
value.mutable_data<T>({1}, place); value.mutable_data<T>({1}, place);
FillNpuTensorWithConstant<T>(&value, static_cast<T>(0)); FillNpuTensorWithConstant<T>(&value, static_cast<T>(0));
auto runner = const auto& runner =
NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}});
runner.Run(stream); runner.Run(stream);
bias = &default_bias; bias = &default_bias;
...@@ -110,7 +110,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> { ...@@ -110,7 +110,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
cast_scale.Resize(scale->dims()); cast_scale.Resize(scale->dims());
cast_scale.mutable_data<T>(ctx.GetPlace()); cast_scale.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type()); auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_scale = const auto& runner_cast_scale =
NpuOpRunner("Cast", {*scale}, {cast_scale}, NpuOpRunner("Cast", {*scale}, {cast_scale},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_scale.Run(stream); runner_cast_scale.Run(stream);
...@@ -125,7 +125,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> { ...@@ -125,7 +125,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
cast_bias.Resize(bias->dims()); cast_bias.Resize(bias->dims());
cast_bias.mutable_data<T>(ctx.GetPlace()); cast_bias.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type()); auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_bias = const auto& runner_cast_bias =
NpuOpRunner("Cast", {*bias}, {cast_bias}, NpuOpRunner("Cast", {*bias}, {cast_bias},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_bias.Run(stream); runner_cast_bias.Run(stream);
...@@ -163,18 +163,18 @@ class LayerNormNPUKernel : public framework::OpKernel<T> { ...@@ -163,18 +163,18 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
variance->mutable_data<T>(ctx.GetPlace()); variance->mutable_data<T>(ctx.GetPlace());
} }
auto runner = NpuOpRunner("LayerNorm", {*x, cast_scale, cast_bias}, const auto& runner = NpuOpRunner("LayerNorm", {*x, cast_scale, cast_bias},
{*y, *tmp_mean, *tmp_variance}, {*y, *tmp_mean, *tmp_variance},
{{"begin_norm_axis", begin_norm_axis}, {{"begin_norm_axis", begin_norm_axis},
{"begin_params_axis", begin_norm_axis}, {"begin_params_axis", begin_norm_axis},
{"epsilon", epsilon}}); {"epsilon", epsilon}});
runner.Run(stream); runner.Run(stream);
// cast back from FP16 to FP32 // cast back from FP16 to FP32
if (x->type() == framework::proto::VarType::FP16 && if (x->type() == framework::proto::VarType::FP16 &&
mean->type() == framework::proto::VarType::FP32) { mean->type() == framework::proto::VarType::FP32) {
auto dst_dtype = ConvertToNpuDtype(mean->type()); auto dst_dtype = ConvertToNpuDtype(mean->type());
auto runner_cast_mean = const auto& runner_cast_mean =
NpuOpRunner("Cast", {*tmp_mean}, {*mean}, NpuOpRunner("Cast", {*tmp_mean}, {*mean},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_mean.Run(stream); runner_cast_mean.Run(stream);
...@@ -183,7 +183,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> { ...@@ -183,7 +183,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
if (x->type() == framework::proto::VarType::FP16 && if (x->type() == framework::proto::VarType::FP16 &&
variance->type() == framework::proto::VarType::FP32) { variance->type() == framework::proto::VarType::FP32) {
auto dst_dtype = ConvertToNpuDtype(variance->type()); auto dst_dtype = ConvertToNpuDtype(variance->type());
auto runner_cast_variance = const auto& runner_cast_variance =
NpuOpRunner("Cast", {*tmp_variance}, {*variance}, NpuOpRunner("Cast", {*tmp_variance}, {*variance},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_variance.Run(stream); runner_cast_variance.Run(stream);
...@@ -250,7 +250,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> { ...@@ -250,7 +250,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
Tensor value(x->type()); Tensor value(x->type());
value.mutable_data<T>({1}, place); value.mutable_data<T>({1}, place);
FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0)); FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
auto runner = const auto& runner =
NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
runner.Run(stream); runner.Run(stream);
scale = &default_scale; scale = &default_scale;
...@@ -265,7 +265,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> { ...@@ -265,7 +265,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
cast_scale.Resize(scale->dims()); cast_scale.Resize(scale->dims());
cast_scale.mutable_data<T>(ctx.GetPlace()); cast_scale.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type()); auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_scale = const auto& runner_cast_scale =
NpuOpRunner("Cast", {*scale}, {cast_scale}, NpuOpRunner("Cast", {*scale}, {cast_scale},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_scale.Run(stream); runner_cast_scale.Run(stream);
...@@ -280,7 +280,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> { ...@@ -280,7 +280,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
cast_mean.Resize(mean->dims()); cast_mean.Resize(mean->dims());
cast_mean.mutable_data<T>(ctx.GetPlace()); cast_mean.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type()); auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_mean = const auto& runner_cast_mean =
NpuOpRunner("Cast", {*mean}, {cast_mean}, NpuOpRunner("Cast", {*mean}, {cast_mean},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_mean.Run(stream); runner_cast_mean.Run(stream);
...@@ -295,7 +295,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> { ...@@ -295,7 +295,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
cast_variance.Resize(variance->dims()); cast_variance.Resize(variance->dims());
cast_variance.mutable_data<T>(ctx.GetPlace()); cast_variance.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type()); auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_variance = const auto& runner_cast_variance =
NpuOpRunner("Cast", {*variance}, {cast_variance}, NpuOpRunner("Cast", {*variance}, {cast_variance},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_variance.Run(stream); runner_cast_variance.Run(stream);
...@@ -343,16 +343,16 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> { ...@@ -343,16 +343,16 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
dbias->mutable_data<T>(ctx.GetPlace()); dbias->mutable_data<T>(ctx.GetPlace());
} }
auto runner = NpuOpRunner("LayerNormGrad", const auto& runner = NpuOpRunner(
{*dy, *x, cast_variance, cast_mean, cast_scale}, "LayerNormGrad", {*dy, *x, cast_variance, cast_mean, cast_scale},
{*dx, *tmp_dscale, *tmp_dbias}, {}); {*dx, *tmp_dscale, *tmp_dbias}, {});
runner.Run(stream); runner.Run(stream);
// cast back from FP16 to FP32 // cast back from FP16 to FP32
if (x->type() == framework::proto::VarType::FP16 && if (x->type() == framework::proto::VarType::FP16 &&
dscale->type() == framework::proto::VarType::FP32) { dscale->type() == framework::proto::VarType::FP32) {
auto dst_dtype = ConvertToNpuDtype(dscale->type()); auto dst_dtype = ConvertToNpuDtype(dscale->type());
auto runner_cast_dscale = const auto& runner_cast_dscale =
NpuOpRunner("Cast", {*tmp_dscale}, {*dscale}, NpuOpRunner("Cast", {*tmp_dscale}, {*dscale},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_dscale.Run(stream); runner_cast_dscale.Run(stream);
...@@ -361,7 +361,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> { ...@@ -361,7 +361,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
if (x->type() == framework::proto::VarType::FP16 && if (x->type() == framework::proto::VarType::FP16 &&
dbias->type() == framework::proto::VarType::FP32) { dbias->type() == framework::proto::VarType::FP32) {
auto dst_dtype = ConvertToNpuDtype(dbias->type()); auto dst_dtype = ConvertToNpuDtype(dbias->type());
auto runner_cast_dbias = const auto& runner_cast_dbias =
NpuOpRunner("Cast", {*tmp_dbias}, {*dbias}, NpuOpRunner("Cast", {*tmp_dbias}, {*dbias},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_dbias.Run(stream); runner_cast_dbias.Run(stream);
......
...@@ -41,7 +41,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> { ...@@ -41,7 +41,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
output_t->mutable_data<T>(ctx.GetPlace()); output_t->mutable_data<T>(ctx.GetPlace());
framework::NPUAttributeMap attr_input = {{"validate_indices", false}}; framework::NPUAttributeMap attr_input = {{"validate_indices", false}};
auto runner = const auto &runner =
NpuOpRunner("Gather", {*table_t, *ids_t}, {*output_t}, attr_input); NpuOpRunner("Gather", {*table_t, *ids_t}, {*output_t}, attr_input);
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -65,14 +65,14 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> { ...@@ -65,14 +65,14 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner_zeros = const auto &runner_zeros =
NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
runner_zeros.Run(stream); runner_zeros.Run(stream);
// NOTE(zhiqiu): It seems in cann 20.1, the first input and output // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
// can be different tensor, but in cann 20.2+, it does inplace operation. // can be different tensor, but in cann 20.2+, it does inplace operation.
// Thus, the first input and output should be same tensor. // Thus, the first input and output should be same tensor.
auto runner_scatter = const auto &runner_scatter =
NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
{*table_grad_t}, {{"use_locking", true}}); {*table_grad_t}, {{"use_locking", true}});
runner_scatter.Run(stream); runner_scatter.Run(stream);
......
...@@ -34,7 +34,7 @@ class MatMulV2NPUKernel : public framework::OpKernel<T> { ...@@ -34,7 +34,7 @@ class MatMulV2NPUKernel : public framework::OpKernel<T> {
if (x->dims().size() == 2) { if (x->dims().size() == 2) {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner( const auto& runner = NpuOpRunner(
"MatMul", {*x, *y}, {*out}, "MatMul", {*x, *y}, {*out},
{{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}});
...@@ -46,7 +46,7 @@ class MatMulV2NPUKernel : public framework::OpKernel<T> { ...@@ -46,7 +46,7 @@ class MatMulV2NPUKernel : public framework::OpKernel<T> {
} else if (x->dims().size() > 2) { } else if (x->dims().size() > 2) {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = const auto& runner =
NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, NpuOpRunner("BatchMatMul", {*x, *y}, {*out},
{{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}});
...@@ -76,7 +76,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> { ...@@ -76,7 +76,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
if (transpose_y) { if (transpose_y) {
if (dx) { if (dx) {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx = const auto& runner_dx =
NpuOpRunner("MatMul", {*dout, *y}, {*dx}, NpuOpRunner("MatMul", {*dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", false}}); {{"transpose_x1", false}, {"transpose_x2", false}});
...@@ -84,7 +84,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> { ...@@ -84,7 +84,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
} }
if (dy) { if (dy) {
dy->mutable_data<T>(ctx.GetPlace()); dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy = const auto& runner_dy =
NpuOpRunner("MatMul", {*dout, *x}, {*dy}, NpuOpRunner("MatMul", {*dout, *x}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}}); {{"transpose_x1", true}, {"transpose_x2", false}});
...@@ -94,7 +94,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> { ...@@ -94,7 +94,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
} else { } else {
if (dx) { if (dx) {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx = const auto& runner_dx =
NpuOpRunner("MatMul", {*dout, *y}, {*dx}, NpuOpRunner("MatMul", {*dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", true}}); {{"transpose_x1", false}, {"transpose_x2", true}});
...@@ -102,7 +102,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> { ...@@ -102,7 +102,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
} }
if (dy) { if (dy) {
dy->mutable_data<T>(ctx.GetPlace()); dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy = const auto& runner_dy =
NpuOpRunner("MatMul", {*x, *dout}, {*dy}, NpuOpRunner("MatMul", {*x, *dout}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}}); {{"transpose_x1", true}, {"transpose_x2", false}});
...@@ -113,30 +113,34 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> { ...@@ -113,30 +113,34 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
if (transpose_y) { if (transpose_y) {
if (dx) { if (dx) {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, const auto& runner_dx =
{{"adj_x1", false}, {"adj_x2", false}}); NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
{{"adj_x1", false}, {"adj_x2", false}});
runner_dx.Run(stream); runner_dx.Run(stream);
} }
if (dy) { if (dy) {
dy->mutable_data<T>(ctx.GetPlace()); dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy = NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, const auto& runner_dy =
{{"adj_x1", true}, {"adj_x2", false}}); NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy},
{{"adj_x1", true}, {"adj_x2", false}});
runner_dy.Run(stream); runner_dy.Run(stream);
} }
} else { } else {
if (dx) { if (dx) {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, const auto& runner_dx =
{{"adj_x1", false}, {"adj_x2", true}}); NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
{{"adj_x1", false}, {"adj_x2", true}});
runner_dx.Run(stream); runner_dx.Run(stream);
} }
if (dy) { if (dy) {
dy->mutable_data<T>(ctx.GetPlace()); dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy = NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, const auto& runner_dy =
{{"adj_x1", true}, {"adj_x2", false}}); NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy},
{{"adj_x1", true}, {"adj_x2", false}});
runner_dy.Run(stream); runner_dy.Run(stream);
} }
} }
......
...@@ -30,7 +30,7 @@ class MeanNPUKernel : public framework::OpKernel<T> { ...@@ -30,7 +30,7 @@ class MeanNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input); const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input);
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -61,7 +61,7 @@ class MeanGradNPUKernel : public framework::OpKernel<T> { ...@@ -61,7 +61,7 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
// ones // ones
Tensor ones(grad->type()); Tensor ones(grad->type());
ones.mutable_data<T>(IG->dims(), context.GetPlace()); ones.mutable_data<T>(IG->dims(), context.GetPlace());
auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
runner_ones.Run(stream); runner_ones.Run(stream);
// means // means
...@@ -75,11 +75,12 @@ class MeanGradNPUKernel : public framework::OpKernel<T> { ...@@ -75,11 +75,12 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
Tensor mean_ma(grad->type()); Tensor mean_ma(grad->type());
mean_ma.Resize(IG->dims()); mean_ma.Resize(IG->dims());
mean_ma.mutable_data<T>(context.GetPlace()); mean_ma.mutable_data<T>(context.GetPlace());
auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {}); const auto& runner_mul_1 =
NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
runner_mul_1.Run(stream); runner_mul_1.Run(stream);
// and mul grad // and mul grad
auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {}); const auto& runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
runner_mul_2.Run(stream); runner_mul_2.Run(stream);
} }
}; };
......
...@@ -47,7 +47,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> { ...@@ -47,7 +47,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
if (indices->type() != framework::proto::VarType::INT32) { if (indices->type() != framework::proto::VarType::INT32) {
cast_indices.Resize(indices->dims()); cast_indices.Resize(indices->dims());
cast_indices.mutable_data<int>(ctx.GetPlace()); cast_indices.mutable_data<int>(ctx.GetPlace());
auto runner_cast_indices = const auto& runner_cast_indices =
NpuOpRunner("Cast", {*indices}, {cast_indices}, NpuOpRunner("Cast", {*indices}, {cast_indices},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_indices.Run(stream); runner_cast_indices.Run(stream);
...@@ -57,7 +57,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> { ...@@ -57,7 +57,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
if (label->type() != framework::proto::VarType::INT32) { if (label->type() != framework::proto::VarType::INT32) {
cast_label.Resize(label->dims()); cast_label.Resize(label->dims());
cast_label.mutable_data<int>(ctx.GetPlace()); cast_label.mutable_data<int>(ctx.GetPlace());
auto runner_cast_label = const auto& runner_cast_label =
NpuOpRunner("Cast", {*label}, {cast_label}, NpuOpRunner("Cast", {*label}, {cast_label},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_label.Run(stream); runner_cast_label.Run(stream);
...@@ -73,7 +73,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> { ...@@ -73,7 +73,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
Tensor tmp_equal(framework::proto::VarType::BOOL); Tensor tmp_equal(framework::proto::VarType::BOOL);
tmp_equal.Resize(inference->dims()); tmp_equal.Resize(inference->dims());
tmp_equal.mutable_data<bool>(ctx.GetPlace()); tmp_equal.mutable_data<bool>(ctx.GetPlace());
auto runner_equal = const auto& runner_equal =
NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {}); NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {});
runner_equal.Run(stream); runner_equal.Run(stream);
...@@ -81,7 +81,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> { ...@@ -81,7 +81,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
Tensor tmp_equal_cast(framework::proto::VarType::FP32); Tensor tmp_equal_cast(framework::proto::VarType::FP32);
tmp_equal_cast.Resize(inference->dims()); tmp_equal_cast.Resize(inference->dims());
tmp_equal_cast.mutable_data<float>(ctx.GetPlace()); tmp_equal_cast.mutable_data<float>(ctx.GetPlace());
auto runner_cast_equal = NpuOpRunner( const auto& runner_cast_equal = NpuOpRunner(
"Cast", {tmp_equal}, {tmp_equal_cast}, "Cast", {tmp_equal}, {tmp_equal_cast},
{{"dst_type", {{"dst_type",
static_cast<int>(ConvertToNpuDtype(tmp_equal_cast.type()))}}); static_cast<int>(ConvertToNpuDtype(tmp_equal_cast.type()))}});
...@@ -92,7 +92,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> { ...@@ -92,7 +92,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
Tensor tmp_correct_max(framework::proto::VarType::FP32); Tensor tmp_correct_max(framework::proto::VarType::FP32);
tmp_correct_max.Resize(framework::make_ddim({num_samples})); tmp_correct_max.Resize(framework::make_ddim({num_samples}));
tmp_correct_max.mutable_data<float>(ctx.GetPlace()); tmp_correct_max.mutable_data<float>(ctx.GetPlace());
auto runner_reduce_max = const auto& runner_reduce_max =
NpuOpRunner("ReduceMaxD", {tmp_equal_cast}, {tmp_correct_max}, NpuOpRunner("ReduceMaxD", {tmp_equal_cast}, {tmp_correct_max},
{{"axes", std::vector<int>{1}}, {"keep_dims", false}}); {{"axes", std::vector<int>{1}}, {"keep_dims", false}});
runner_reduce_max.Run(stream); runner_reduce_max.Run(stream);
...@@ -101,14 +101,14 @@ class AccuracyNPUKernel : public framework::OpKernel<T> { ...@@ -101,14 +101,14 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
Tensor tmp_correct(framework::proto::VarType::FP32); Tensor tmp_correct(framework::proto::VarType::FP32);
tmp_correct.Resize(correct->dims()); tmp_correct.Resize(correct->dims());
tmp_correct.mutable_data<float>(ctx.GetPlace()); tmp_correct.mutable_data<float>(ctx.GetPlace());
auto runner_reduce_sum = const auto& runner_reduce_sum =
NpuOpRunner("ReduceSumD", {tmp_correct_max}, {tmp_correct}, NpuOpRunner("ReduceSumD", {tmp_correct_max}, {tmp_correct},
{{"axes", std::vector<int>{0}}, {"keep_dims", false}}); {{"axes", std::vector<int>{0}}, {"keep_dims", false}});
runner_reduce_sum.Run(stream); runner_reduce_sum.Run(stream);
// cast to int // cast to int
correct->mutable_data<int>(ctx.GetPlace()); correct->mutable_data<int>(ctx.GetPlace());
auto runner_cast_correct = NpuOpRunner( const auto& runner_cast_correct = NpuOpRunner(
"Cast", {tmp_correct}, {*correct}, "Cast", {tmp_correct}, {*correct},
{{"dst_type", static_cast<int>(ConvertToNpuDtype(correct->type()))}}); {{"dst_type", static_cast<int>(ConvertToNpuDtype(correct->type()))}});
runner_cast_correct.Run(stream); runner_cast_correct.Run(stream);
...@@ -126,7 +126,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> { ...@@ -126,7 +126,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
// [accuracy] // [accuracy]
accuracy->mutable_data<float>(ctx.GetPlace()); accuracy->mutable_data<float>(ctx.GetPlace());
auto runner_accuracy = const auto& runner_accuracy =
NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {}); NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {});
runner_accuracy.Run(stream); runner_accuracy.Run(stream);
} }
......
...@@ -36,7 +36,7 @@ class MulNPUKernel : public framework::OpKernel<T> { ...@@ -36,7 +36,7 @@ class MulNPUKernel : public framework::OpKernel<T> {
if (x_num_col_dims == 1 && y_num_col_dims == 1) { if (x_num_col_dims == 1 && y_num_col_dims == 1) {
if (x->dims().size() == 2 && y->dims().size() == 2) { if (x->dims().size() == 2 && y->dims().size() == 2) {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = const auto& runner =
NpuOpRunner("MatMul", {*x, *y}, {*out}, NpuOpRunner("MatMul", {*x, *y}, {*out},
{{"transpose_x1", false}, {"transpose_x2", false}}); {{"transpose_x1", false}, {"transpose_x2", false}});
...@@ -54,7 +54,7 @@ class MulNPUKernel : public framework::OpKernel<T> { ...@@ -54,7 +54,7 @@ class MulNPUKernel : public framework::OpKernel<T> {
tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim}));
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
// matmul // matmul
auto runner = const auto& runner =
NpuOpRunner("MatMul", {tmp_x, *y}, {*out}, NpuOpRunner("MatMul", {tmp_x, *y}, {*out},
{{"transpose_x1", false}, {"transpose_x2", false}}); {{"transpose_x1", false}, {"transpose_x2", false}});
runner.Run(stream); runner.Run(stream);
...@@ -85,7 +85,7 @@ class MulNPUKernel : public framework::OpKernel<T> { ...@@ -85,7 +85,7 @@ class MulNPUKernel : public framework::OpKernel<T> {
tmp_matmul.Resize(framework::make_ddim({first_dim, y->dims()[1]})); tmp_matmul.Resize(framework::make_ddim({first_dim, y->dims()[1]}));
tmp_matmul.mutable_data<T>(ctx.GetPlace()); tmp_matmul.mutable_data<T>(ctx.GetPlace());
auto runner_matmul = const auto& runner_matmul =
NpuOpRunner("MatMul", {tmp_x, *y}, {tmp_matmul}, NpuOpRunner("MatMul", {tmp_x, *y}, {tmp_matmul},
{{"transpose_x1", false}, {"transpose_x2", false}}); {{"transpose_x1", false}, {"transpose_x2", false}});
...@@ -121,7 +121,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> { ...@@ -121,7 +121,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
if (x->dims().size() == 2 && y->dims().size() == 2) { if (x->dims().size() == 2 && y->dims().size() == 2) {
if (dx) { if (dx) {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx = const auto& runner_dx =
NpuOpRunner("MatMul", {*dout, *y}, {*dx}, NpuOpRunner("MatMul", {*dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", true}}); {{"transpose_x1", false}, {"transpose_x2", true}});
...@@ -130,7 +130,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> { ...@@ -130,7 +130,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
if (dy) { if (dy) {
dy->mutable_data<T>(ctx.GetPlace()); dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy = const auto& runner_dy =
NpuOpRunner("MatMul", {*x, *dout}, {*dy}, NpuOpRunner("MatMul", {*x, *dout}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}}); {{"transpose_x1", true}, {"transpose_x2", false}});
...@@ -144,7 +144,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> { ...@@ -144,7 +144,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto dx_dims = dx->dims(); auto dx_dims = dx->dims();
dx->Resize(framework::make_ddim({dout->dims()[0], y->dims()[0]})); dx->Resize(framework::make_ddim({dout->dims()[0], y->dims()[0]}));
auto runner_matmul = const auto& runner_matmul =
NpuOpRunner("MatMul", {*dout, *y}, {*dx}, NpuOpRunner("MatMul", {*dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", true}}); {{"transpose_x1", false}, {"transpose_x2", true}});
runner_matmul.Run(stream); runner_matmul.Run(stream);
...@@ -164,7 +164,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> { ...@@ -164,7 +164,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<platform::DeviceContext>(), &tmp_x); ctx.template device_context<platform::DeviceContext>(), &tmp_x);
tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim}));
dy->mutable_data<T>(ctx.GetPlace()); dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy = const auto& runner_dy =
NpuOpRunner("MatMul", {tmp_x, *dout}, {*dy}, NpuOpRunner("MatMul", {tmp_x, *dout}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}}); {{"transpose_x1", true}, {"transpose_x2", false}});
...@@ -193,7 +193,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> { ...@@ -193,7 +193,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto dx_dims = dx->dims(); auto dx_dims = dx->dims();
dx->Resize(framework::make_ddim({dout_first_dim, y->dims()[0]})); dx->Resize(framework::make_ddim({dout_first_dim, y->dims()[0]}));
auto runner_matmul = const auto& runner_matmul =
NpuOpRunner("MatMul", {tmp_dout, *y}, {*dx}, NpuOpRunner("MatMul", {tmp_dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", true}}); {{"transpose_x1", false}, {"transpose_x2", true}});
runner_matmul.Run(stream); runner_matmul.Run(stream);
...@@ -213,7 +213,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> { ...@@ -213,7 +213,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim}));
// mamtul [6,4] [6,5] =>[4,5] // mamtul [6,4] [6,5] =>[4,5]
dy->mutable_data<T>(ctx.GetPlace()); dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy = const auto& runner_dy =
NpuOpRunner("MatMul", {tmp_x, tmp_dout}, {*dy}, NpuOpRunner("MatMul", {tmp_x, tmp_dout}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}}); {{"transpose_x1", true}, {"transpose_x2", false}});
runner_dy.Run(stream); runner_dy.Run(stream);
......
...@@ -89,7 +89,21 @@ NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs, ...@@ -89,7 +89,21 @@ NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs,
} }
NpuOpRunner::~NpuOpRunner() { NpuOpRunner::~NpuOpRunner() {
// TODO(zhiqiu): handle free VLOG(5) << "Free NpuOpRunner(" << this << ") of " << op_type_;
// Is it safe to free the descs/buffers after run called in host ?
aclopDestroyAttr(attr_); // return void
for (auto desc : input_descs_) {
aclDestroyTensorDesc(desc);
}
for (auto desc : output_descs_) {
aclDestroyTensorDesc(desc);
}
for (auto buffer : input_buffers_) {
PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer));
}
for (auto buffer : output_buffers_) {
PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer));
}
} }
const std::string &NpuOpRunner::Type() { return op_type_; } const std::string &NpuOpRunner::Type() { return op_type_; }
...@@ -186,6 +200,8 @@ NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) { ...@@ -186,6 +200,8 @@ NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) {
} }
NpuOpRunner &NpuOpRunner::AddInputs(const std::vector<Tensor> &tensors) { NpuOpRunner &NpuOpRunner::AddInputs(const std::vector<Tensor> &tensors) {
input_descs_.reserve(tensors.size());
input_buffers_.reserve(tensors.size());
for (auto tensor : tensors) { for (auto tensor : tensors) {
// create aclTensorDesc // create aclTensorDesc
input_descs_.emplace_back(CreateTensorDesc(tensor)); input_descs_.emplace_back(CreateTensorDesc(tensor));
...@@ -211,6 +227,8 @@ NpuOpRunner &NpuOpRunner::AddInputNames(const std::vector<std::string> &names) { ...@@ -211,6 +227,8 @@ NpuOpRunner &NpuOpRunner::AddInputNames(const std::vector<std::string> &names) {
} }
NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector<Tensor> &tensors) { NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector<Tensor> &tensors) {
output_descs_.reserve(tensors.size());
output_buffers_.reserve(tensors.size());
for (auto tensor : tensors) { for (auto tensor : tensors) {
// create aclTensorDesc // create aclTensorDesc
output_descs_.emplace_back(CreateTensorDesc(tensor)); output_descs_.emplace_back(CreateTensorDesc(tensor));
...@@ -281,12 +299,12 @@ aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) { ...@@ -281,12 +299,12 @@ aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
return buffer; return buffer;
} }
void NpuOpRunner::Run(aclrtStream stream) { void NpuOpRunner::Run(aclrtStream stream) const {
if (!stream) { if (!stream) {
VLOG(4) << "Run with default current npu stream: " << stream; VLOG(4) << "Run with default current npu stream: " << stream;
stream = GetCurrentNPUStream(); stream = GetCurrentNPUStream();
} }
VLOG(5) << "NpuOpRunner(" << this << ") Run:";
VLOG(4) << "op_type: " << op_type_; VLOG(4) << "op_type: " << op_type_;
VLOG(4) << "input_desc.size: " << input_descs_.size(); VLOG(4) << "input_desc.size: " << input_descs_.size();
VLOG(4) << "output_desc.size: " << output_descs_.size(); VLOG(4) << "output_desc.size: " << output_descs_.size();
......
...@@ -41,6 +41,14 @@ class NpuOpRunner { ...@@ -41,6 +41,14 @@ class NpuOpRunner {
const std::vector<Tensor> &outputs = {}, const std::vector<Tensor> &outputs = {},
const NPUAttributeMap &attrs = {}); const NPUAttributeMap &attrs = {});
// NOTE(zhiqiu): why forbid copy and operator= ?
// Since we will free the tensor_descs and data_buffers in the ~NpuOpRunner,
// if shallow copy is performed on tensor_descs and data_buffers, it may
// result
// in use-after-free bugs.
NpuOpRunner(const NpuOpRunner &runner) = delete;
NpuOpRunner &operator=(const NpuOpRunner &runner) = delete;
~NpuOpRunner(); ~NpuOpRunner();
const std::string &Type(); const std::string &Type();
...@@ -71,7 +79,7 @@ class NpuOpRunner { ...@@ -71,7 +79,7 @@ class NpuOpRunner {
std::vector<aclDataBuffer *> &GetOutputBuffers(); std::vector<aclDataBuffer *> &GetOutputBuffers();
void Run(aclrtStream stream = nullptr); void Run(aclrtStream stream = nullptr) const;
private: private:
aclTensorDesc *CreateTensorDesc(Tensor tensor); aclTensorDesc *CreateTensorDesc(Tensor tensor);
......
...@@ -147,7 +147,7 @@ class AdamNPUKernel : public framework::OpKernel<T> { ...@@ -147,7 +147,7 @@ class AdamNPUKernel : public framework::OpKernel<T> {
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = const auto& runner =
NpuOpRunner("ApplyAdamD", NpuOpRunner("ApplyAdamD",
{ {
*param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr, *param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr,
...@@ -179,10 +179,10 @@ class AdamNPUKernel : public framework::OpKernel<T> { ...@@ -179,10 +179,10 @@ class AdamNPUKernel : public framework::OpKernel<T> {
if (!use_global_beta_pow) { if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace()); beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_out->mutable_data<T>(ctx.GetPlace()); beta2_pow_out->mutable_data<T>(ctx.GetPlace());
auto runner_m1 = const auto& runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
runner_m1.Run(stream); runner_m1.Run(stream);
auto runner_m2 = const auto& runner_m2 =
NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
runner_m2.Run(stream); runner_m2.Run(stream);
} }
......
...@@ -32,7 +32,7 @@ class SGDNPUKernel : public framework::OpKernel<T> { ...@@ -32,7 +32,7 @@ class SGDNPUKernel : public framework::OpKernel<T> {
param_out->mutable_data<T>(ctx.GetPlace()); param_out->mutable_data<T>(ctx.GetPlace());
auto runner = const auto& runner =
NpuOpRunner("ApplyGradientDescent", NpuOpRunner("ApplyGradientDescent",
{*param_var, *learning_rate, *grad_var}, {*param_out}, {}); {*param_var, *learning_rate, *grad_var}, {*param_out}, {});
......
...@@ -38,7 +38,7 @@ class ReduceAnyNPUKernel : public framework::OpKernel<T> { ...@@ -38,7 +38,7 @@ class ReduceAnyNPUKernel : public framework::OpKernel<T> {
// set attr // set attr
NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}}; NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}};
auto runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr);
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
......
...@@ -51,7 +51,7 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> { ...@@ -51,7 +51,7 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> {
cast_x.Resize(x->dims()); cast_x.Resize(x->dims());
cast_x.mutable_data<float>(ctx.GetPlace()); cast_x.mutable_data<float>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32); auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
auto runner_cast = NpuOpRunner( const auto& runner_cast = NpuOpRunner(
"Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}}); "Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream); runner_cast.Run(stream);
...@@ -68,20 +68,22 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> { ...@@ -68,20 +68,22 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> {
dim_vec.push_back(i); dim_vec.push_back(i);
} }
auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, const auto& runner =
{{"axes", dim_vec}, {"keep_dims", keep_dims}}); NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dim_vec}, {"keep_dims", keep_dims}});
runner.Run(stream); runner.Run(stream);
} else { } else {
auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, const auto& runner =
{{"axes", dims}, {"keep_dims", keep_dims}}); NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dims}, {"keep_dims", keep_dims}});
runner.Run(stream); runner.Run(stream);
} }
if (x->type() != framework::proto::VarType::FP32 && if (x->type() != framework::proto::VarType::FP32 &&
x->type() != framework::proto::VarType::FP16) { x->type() != framework::proto::VarType::FP16) {
auto dst_dtype = ConvertToNpuDtype(out->type()); auto dst_dtype = ConvertToNpuDtype(out->type());
auto runner_cast = const auto& runner_cast =
NpuOpRunner("Cast", {cast_out}, {*out}, NpuOpRunner("Cast", {cast_out}, {*out},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream); runner_cast.Run(stream);
...@@ -107,8 +109,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel<T> { ...@@ -107,8 +109,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
if (keep_dims || reduce_all) { if (keep_dims || reduce_all) {
auto runner = NpuOpRunner("BroadcastToD", {*out_grad}, {*x_grad}, const auto& runner =
{{"shape", framework::vectorize(x->dims())}}); NpuOpRunner("BroadcastToD", {*out_grad}, {*x_grad},
{{"shape", framework::vectorize(x->dims())}});
runner.Run(stream); runner.Run(stream);
} else { } else {
framework::DDim out_dims; framework::DDim out_dims;
...@@ -124,8 +127,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel<T> { ...@@ -124,8 +127,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
&out_grad_tmp); &out_grad_tmp);
out_grad_tmp.Resize(out_dims); out_grad_tmp.Resize(out_dims);
auto runner = NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad}, const auto& runner =
{{"shape", framework::vectorize(x->dims())}}); NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad},
{{"shape", framework::vectorize(x->dims())}});
runner.Run(stream); runner.Run(stream);
} }
} }
......
...@@ -38,7 +38,7 @@ class ScaleNPUKernel : public framework::OpKernel<T> { ...@@ -38,7 +38,7 @@ class ScaleNPUKernel : public framework::OpKernel<T> {
<< " ,bias_after_scale:" << bias_after_scale; << " ,bias_after_scale:" << bias_after_scale;
if (bias_after_scale) { if (bias_after_scale) {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = const auto& runner =
NpuOpRunner("Power", {*x}, {*out}, NpuOpRunner("Power", {*x}, {*out},
{{"power", _power}, {"scale", scale}, {"shift", bias}}); {{"power", _power}, {"scale", scale}, {"shift", bias}});
...@@ -47,12 +47,13 @@ class ScaleNPUKernel : public framework::OpKernel<T> { ...@@ -47,12 +47,13 @@ class ScaleNPUKernel : public framework::OpKernel<T> {
Tensor tmp_x(x->type()); Tensor tmp_x(x->type());
tmp_x.Resize(x->dims()); tmp_x.Resize(x->dims());
tmp_x.mutable_data<T>(ctx.GetPlace()); tmp_x.mutable_data<T>(ctx.GetPlace());
auto runner_tmp = NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}}); const auto& runner_tmp =
NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}});
runner_tmp.Run(stream); runner_tmp.Run(stream);
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
float _bias = 0.0; float _bias = 0.0;
auto runner = const auto& runner =
NpuOpRunner("Power", {tmp_x}, {*out}, NpuOpRunner("Power", {tmp_x}, {*out},
{{"power", _power}, {"scale", scale}, {"shift", _bias}}); {{"power", _power}, {"scale", scale}, {"shift", _bias}});
runner.Run(stream); runner.Run(stream);
......
...@@ -53,11 +53,11 @@ class ScatterNPUKernel : public framework::OpKernel<T> { ...@@ -53,11 +53,11 @@ class ScatterNPUKernel : public framework::OpKernel<T> {
.stream(); .stream();
if (overwrite) { if (overwrite) {
auto runner_update = NpuOpRunner("TensorScatterUpdate", const auto& runner_update = NpuOpRunner(
{*x, *index, *updates}, {*out}, {}); "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {});
runner_update.Run(stream); runner_update.Run(stream);
} else { } else {
auto runner_add = const auto& runner_add =
NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {});
runner_add.Run(stream); runner_add.Run(stream);
} }
......
...@@ -72,8 +72,8 @@ class SliceNPUKernel : public framework::OpKernel<T> { ...@@ -72,8 +72,8 @@ class SliceNPUKernel : public framework::OpKernel<T> {
UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); UpdateAttr(in_dims, axes, starts, ends, &offsets, &size);
auto runner = NpuOpRunner("SliceD", {*input}, {*out}, const auto& runner = NpuOpRunner("SliceD", {*input}, {*out},
{{"offsets", offsets}, {"size", size}}); {{"offsets", offsets}, {"size", size}});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -111,7 +111,7 @@ class SliceGradNPUKernel : public framework::OpKernel<T> { ...@@ -111,7 +111,7 @@ class SliceGradNPUKernel : public framework::OpKernel<T> {
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = const auto& runner =
NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}}); NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}});
runner.Run(stream); runner.Run(stream);
} }
......
...@@ -31,7 +31,7 @@ class SoftmaxNPUKernel : public framework::OpKernel<T> { ...@@ -31,7 +31,7 @@ class SoftmaxNPUKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<framework::LoDTensor>("Out"); auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); const auto& runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input);
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -71,8 +71,8 @@ class SoftmaxGradNPUKernel : public framework::OpKernel<T> { ...@@ -71,8 +71,8 @@ class SoftmaxGradNPUKernel : public framework::OpKernel<T> {
dX->mutable_data<T>(ctx.GetPlace()); dX->mutable_data<T>(ctx.GetPlace());
framework::NPUAttributeMap attr_input = {}; framework::NPUAttributeMap attr_input = {};
auto runner = NpuOpRunner(std::string("SoftmaxGrad"), {tmp_out, tmp_dOut}, const auto& runner = NpuOpRunner(std::string("SoftmaxGrad"),
{*dX}, attr_input); {tmp_out, tmp_dOut}, {*dX}, attr_input);
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
......
...@@ -47,7 +47,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> { ...@@ -47,7 +47,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
// softmax // softmax
softmax->mutable_data<T>(ctx.GetPlace()); softmax->mutable_data<T>(ctx.GetPlace());
auto runner_softmax = const auto& runner_softmax =
NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, {{"axes", axes}}); NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, {{"axes", axes}});
runner_softmax.Run(stream); runner_softmax.Run(stream);
...@@ -57,7 +57,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> { ...@@ -57,7 +57,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
tmp_labels.Resize(labels->dims()); tmp_labels.Resize(labels->dims());
tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32); tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32);
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32);
auto runner_cast_label = const auto& runner_cast_label =
NpuOpRunner("Cast", {*labels}, {tmp_labels}, NpuOpRunner("Cast", {*labels}, {tmp_labels},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_label.Run(stream); runner_cast_label.Run(stream);
...@@ -77,7 +77,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> { ...@@ -77,7 +77,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
tmp_onehot.Resize(logits->dims()); tmp_onehot.Resize(logits->dims());
tmp_onehot.mutable_data<int>(ctx.GetPlace()); tmp_onehot.mutable_data<int>(ctx.GetPlace());
auto runner_onehot = const auto& runner_onehot =
NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot}, NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot},
{{"axis", -1}, {"depth", cls_num}}); {{"axis", -1}, {"depth", cls_num}});
runner_onehot.Run(stream); runner_onehot.Run(stream);
...@@ -87,7 +87,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> { ...@@ -87,7 +87,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
cast_onehot.Resize(tmp_onehot.dims()); cast_onehot.Resize(tmp_onehot.dims());
cast_onehot.mutable_data<T>(ctx.GetPlace()); cast_onehot.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(logits->type()); auto dst_dtype = ConvertToNpuDtype(logits->type());
auto runner_cast_onehot = const auto& runner_cast_onehot =
NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot}, NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_onehot.Run(stream); runner_cast_onehot.Run(stream);
...@@ -102,8 +102,9 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> { ...@@ -102,8 +102,9 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
// SoftmaxCrossEntropyWithLogits requires loss to be of shape [batch_size] // SoftmaxCrossEntropyWithLogits requires loss to be of shape [batch_size]
auto loss_dims = loss->dims(); auto loss_dims = loss->dims();
loss->Resize({loss_dims[0]}); loss->Resize({loss_dims[0]});
auto runner_s = NpuOpRunner("SoftmaxCrossEntropyWithLogits", const auto& runner_s =
{*logits, cast_onehot}, {*loss, backprop}, {}); NpuOpRunner("SoftmaxCrossEntropyWithLogits", {*logits, cast_onehot},
{*loss, backprop}, {});
runner_s.Run(stream); runner_s.Run(stream);
loss->Resize(loss_dims); loss->Resize(loss_dims);
} }
...@@ -130,7 +131,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> { ...@@ -130,7 +131,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
tmp_labels.Resize(labels->dims()); tmp_labels.Resize(labels->dims());
tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32); tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32);
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32);
auto runner_cast_label = const auto& runner_cast_label =
NpuOpRunner("Cast", {*labels}, {tmp_labels}, NpuOpRunner("Cast", {*labels}, {tmp_labels},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_label.Run(stream); runner_cast_label.Run(stream);
...@@ -150,7 +151,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> { ...@@ -150,7 +151,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
tmp_onehot.Resize(softmax->dims()); tmp_onehot.Resize(softmax->dims());
tmp_onehot.mutable_data<int>(ctx.GetPlace()); tmp_onehot.mutable_data<int>(ctx.GetPlace());
auto runner_onehot = const auto& runner_onehot =
NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot}, NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot},
{{"axis", -1}, {"depth", cls_num}}); {{"axis", -1}, {"depth", cls_num}});
runner_onehot.Run(stream); runner_onehot.Run(stream);
...@@ -160,7 +161,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> { ...@@ -160,7 +161,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
cast_onehot.Resize(tmp_onehot.dims()); cast_onehot.Resize(tmp_onehot.dims());
cast_onehot.mutable_data<T>(ctx.GetPlace()); cast_onehot.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(softmax->type()); auto dst_dtype = ConvertToNpuDtype(softmax->type());
auto runner_cast_onehot = const auto& runner_cast_onehot =
NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot}, NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_onehot.Run(stream); runner_cast_onehot.Run(stream);
...@@ -169,13 +170,13 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> { ...@@ -169,13 +170,13 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
Tensor tmp_sub(softmax->type()); Tensor tmp_sub(softmax->type());
tmp_sub.Resize(softmax->dims()); tmp_sub.Resize(softmax->dims());
tmp_sub.mutable_data<T>(ctx.GetPlace()); tmp_sub.mutable_data<T>(ctx.GetPlace());
auto runner_sub = const auto& runner_sub =
NpuOpRunner("Sub", {*softmax, cast_onehot}, {tmp_sub}, {}); NpuOpRunner("Sub", {*softmax, cast_onehot}, {tmp_sub}, {});
runner_sub.Run(stream); runner_sub.Run(stream);
// mul // mul
logits_grad->mutable_data<T>(ctx.GetPlace()); logits_grad->mutable_data<T>(ctx.GetPlace());
auto runner_mul = const auto& runner_mul =
NpuOpRunner("Mul", {*loss_grad, tmp_sub}, {*logits_grad}, {}); NpuOpRunner("Mul", {*loss_grad, tmp_sub}, {*logits_grad}, {});
runner_mul.Run(stream); runner_mul.Run(stream);
} }
......
...@@ -69,7 +69,7 @@ class StackNPUKernel : public framework::OpKernel<T> { ...@@ -69,7 +69,7 @@ class StackNPUKernel : public framework::OpKernel<T> {
tmp_stack.Resize(framework::make_ddim(vec_dim_tmp)); tmp_stack.Resize(framework::make_ddim(vec_dim_tmp));
tmp_stack.mutable_data<T>(ctx.GetPlace()); tmp_stack.mutable_data<T>(ctx.GetPlace());
auto runner = const auto& runner =
NpuOpRunner("Pack", {x_list}, {tmp_stack}, {{"axis", 0}, {"N", N}}); NpuOpRunner("Pack", {x_list}, {tmp_stack}, {{"axis", 0}, {"N", N}});
runner.Run(stream); runner.Run(stream);
...@@ -81,12 +81,12 @@ class StackNPUKernel : public framework::OpKernel<T> { ...@@ -81,12 +81,12 @@ class StackNPUKernel : public framework::OpKernel<T> {
} }
} }
auto runner_trans_final = const auto& runner_trans_final =
NpuOpRunner("TransposeD", {tmp_stack}, {*out}, {{"perm", vec_trans}}); NpuOpRunner("TransposeD", {tmp_stack}, {*out}, {{"perm", vec_trans}});
runner_trans_final.Run(stream); runner_trans_final.Run(stream);
} else { } else {
auto runner = const auto& runner =
NpuOpRunner("Pack", {x_list}, {*out}, {{"axis", axis}, {"N", N}}); NpuOpRunner("Pack", {x_list}, {*out}, {{"axis", axis}, {"N", N}});
runner.Run(stream); runner.Run(stream);
} }
......
...@@ -43,12 +43,12 @@ class SumNPUKernel : public framework::OpKernel<T> { ...@@ -43,12 +43,12 @@ class SumNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {}); const auto& runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {});
runner.Run(stream); runner.Run(stream);
for (int i = 2; i < n; i++) { for (int i = 2; i < n; i++) {
runner = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {}); const auto& runner1 = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {});
runner.Run(stream); runner1.Run(stream);
} }
} }
}; };
......
...@@ -67,8 +67,8 @@ class TopkNPUKernel : public framework::OpKernel<T> { ...@@ -67,8 +67,8 @@ class TopkNPUKernel : public framework::OpKernel<T> {
tmp_indices.mutable_data<int>(ctx.GetPlace()); tmp_indices.mutable_data<int>(ctx.GetPlace());
// run ascend // run ascend
auto runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor}, const auto& runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor},
{*output, tmp_indices}, attr_input); {*output, tmp_indices}, attr_input);
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
...@@ -76,7 +76,7 @@ class TopkNPUKernel : public framework::OpKernel<T> { ...@@ -76,7 +76,7 @@ class TopkNPUKernel : public framework::OpKernel<T> {
// cast indices from INT32 to INT64 // cast indices from INT32 to INT64
auto dst_dtype = ConvertToNpuDtype(indices->type()); auto dst_dtype = ConvertToNpuDtype(indices->type());
auto runner_cast_indices = const auto& runner_cast_indices =
NpuOpRunner("Cast", {tmp_indices}, {*indices}, NpuOpRunner("Cast", {tmp_indices}, {*indices},
{{"dst_type", static_cast<int>(dst_dtype)}}); {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_indices.Run(stream); runner_cast_indices.Run(stream);
......
...@@ -29,7 +29,7 @@ class TransposeNPUKernel : public framework::OpKernel<T> { ...@@ -29,7 +29,7 @@ class TransposeNPUKernel : public framework::OpKernel<T> {
std::vector<int> axis = ctx.Attr<std::vector<int>>("axis"); std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
framework::NPUAttributeMap attr_input = {{"perm", axis}}; framework::NPUAttributeMap attr_input = {{"perm", axis}};
out->mutable_data<T>(ctx.device_context().GetPlace()); out->mutable_data<T>(ctx.device_context().GetPlace());
auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); const auto& runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
...@@ -52,7 +52,8 @@ class TransposeGradNPUKernel : public framework::OpKernel<T> { ...@@ -52,7 +52,8 @@ class TransposeGradNPUKernel : public framework::OpKernel<T> {
} }
x_grad->mutable_data<T>(ctx.GetPlace()); x_grad->mutable_data<T>(ctx.GetPlace());
framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}}; framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}};
auto runner = NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); const auto& runner =
NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input);
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
......
...@@ -59,7 +59,7 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel<T> { ...@@ -59,7 +59,7 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel<T> {
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner( const auto& runner = NpuOpRunner(
"ParameterizedTruncatedNormal", "ParameterizedTruncatedNormal",
{shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor}, {*out}, {shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor}, {*out},
{{"seed", seed_var}}); {{"seed", seed_var}});
......
...@@ -251,8 +251,8 @@ class TestNet(unittest.TestCase): ...@@ -251,8 +251,8 @@ class TestNet(unittest.TestCase):
cpu_pred, cpu_loss = self._test(False) cpu_pred, cpu_loss = self._test(False)
npu_pred, npu_loss = self._test(True) npu_pred, npu_loss = self._test(True)
self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-4)) self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3))
self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-4)) self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3))
@unittest.skipIf(not paddle.is_compiled_with_npu(), @unittest.skipIf(not paddle.is_compiled_with_npu(),
...@@ -335,8 +335,8 @@ class TestNetWithEpsilonTensor(unittest.TestCase): ...@@ -335,8 +335,8 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
cpu_pred, cpu_loss = self._test(False) cpu_pred, cpu_loss = self._test(False)
npu_pred, npu_loss = self._test(True) npu_pred, npu_loss = self._test(True)
self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-4)) self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3))
self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-4)) self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3))
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册