Unverified · Commit 8259d9bf authored by Leo Chen, committed by GitHub

[NPU] refine NpuOpRunner (#32869)

* refine ~NpuOpRunner

* implement destructor and forbid copy

* use reference to avoid copy

* use const reference

* relax adam precision

* fix top_k
Parent 78ecb668
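
The change repeated at almost every call site below is `auto runner = NpuOpRunner(...)` becoming `const auto& runner = NpuOpRunner(...)`. In C++, binding a temporary to a const lvalue reference extends the temporary's lifetime to that of the reference, so the runner lives until the end of the enclosing scope without a copy being made; this matters because the copy constructor is deleted in this PR. A minimal standalone sketch of the idiom (using a stand-in Runner class, not the real NpuOpRunner):

#include <iostream>

// Stand-in for NpuOpRunner: it owns resources, so copying is forbidden.
class Runner {
 public:
  Runner() { std::cout << "construct\n"; }
  ~Runner() { std::cout << "destroy\n"; }
  Runner(const Runner&) = delete;             // mirrors this PR
  Runner& operator=(const Runner&) = delete;  // mirrors this PR
  void Run() const { std::cout << "run\n"; }  // const-qualified, so callable
                                              // through a const reference
};

int main() {
  // Before C++17, `auto runner = Runner();` is copy-initialization and
  // fails to compile once the copy constructor is deleted. Binding the
  // temporary to a const reference needs no copy and extends its lifetime
  // to the end of the scope.
  const auto& runner = Runner();
  runner.Run();
  return 0;
}  // "destroy" prints here, when the reference goes out of scope
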
......@@ -35,10 +35,10 @@ class PowNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Power", {*x}, {*out},
{{"power", factor},
{"scale", static_cast<float>(1.0)},
{"shift", static_cast<float>(0.0)}});
const auto& runner = NpuOpRunner("Power", {*x}, {*out},
{{"power", factor},
{"scale", static_cast<float>(1.0)},
{"shift", static_cast<float>(0.0)}});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
......@@ -68,8 +68,8 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
// Step1: Compute x_pow = x.pow(factor-1)
Tensor x_pow(x->type());
x_pow.mutable_data<T>(x->dims(), place);
auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow},
{{"power", factor - static_cast<float>(1)}});
const auto& runner_pow = NpuOpRunner(
"Power", {*x}, {x_pow}, {{"power", factor - static_cast<float>(1)}});
runner_pow.Run(stream);
// Step 2: Construct a broadcast factor, which has the same shape as x.
......@@ -83,20 +83,21 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
// factor.
Tensor factor_bc_tensor(framework::proto::VarType::FP32);
factor_bc_tensor.mutable_data<float>(x_dims, place);
auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor},
{{"dims", framework::vectorize(x_dims)}});
const auto& runner_bc =
NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor},
{{"dims", framework::vectorize(x_dims)}});
runner_bc.Run(stream);
// Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1)
Tensor x_power_mul_factor(x->type());
x_power_mul_factor.mutable_data<T>(x->dims(), place);
auto runner_mul_1 =
const auto& runner_mul_1 =
NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {});
runner_mul_1.Run(stream);
// Step 4: Compute dx = dout * factor * x.pow(factor-1)
dx->mutable_data<T>(place);
auto runner_mul_2 =
const auto& runner_mul_2 =
NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {});
runner_mul_2.Run(stream);
}
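
For reference, Steps 1 through 4 above assemble the power rule: with out = x^factor,

  d(out)/dx = factor * x^(factor - 1),  so  dx = dout * factor * x^(factor - 1).

Step 1 computes x^(factor - 1), Steps 2 and 3 broadcast the scalar factor to the shape of x and multiply it in, and Step 4 multiplies by dout.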
......@@ -111,11 +112,11 @@ class ReluNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Relu",
{
*x,
},
{*out}, {});
const auto& runner = NpuOpRunner("Relu",
{
*x,
},
{*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
......@@ -137,7 +138,7 @@ class ReluGradNPUKernel : public framework::OpKernel<T> {
.stream();
dx->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {});
const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {});
runner.Run(stream);
}
......@@ -159,7 +160,7 @@ class SqrtNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {});
const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {});
runner.Run(stream);
}
};
......@@ -181,8 +182,8 @@ class SqrtGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {});
dx_runner.Run(stream);
const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {});
runner_dx.Run(stream);
}
};
......@@ -204,16 +205,16 @@ class LogNPUKernel : public framework::OpKernel<T> {
Tensor one(x->type());
one.mutable_data<T>(x->dims(), place);
auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {});
one_runner.Run(stream);
const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {});
runner_one.Run(stream);
Tensor sub(x->type());
sub.mutable_data<T>(x->dims(), place);
auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {});
sub_runner.Run(stream);
const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {});
runner_sub.Run(stream);
auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {});
out_runner.Run(stream);
const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {});
runner_out.Run(stream);
}
};
......@@ -233,7 +234,7 @@ class LogGradNPUKernel : public framework::OpKernel<T> {
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {});
const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {});
runner.Run(stream);
}
};
......@@ -254,7 +255,7 @@ class TanhNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {});
const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {});
runner.Run(stream);
}
};
......@@ -276,8 +277,8 @@ class TanhGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {});
dx_runner.Run(stream);
const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {});
runner_dx.Run(stream);
}
};
......@@ -297,7 +298,7 @@ class SquareNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Square", {*x}, {*out}, {});
const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {});
runner.Run(stream);
}
};
......
......@@ -29,7 +29,8 @@ class AllocFloatStatusKernel : public framework::OpKernel<T> {
auto* float_status = ctx.Output<framework::Tensor>("FloatStatus");
float_status->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status});
const auto& runner =
NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......
......@@ -58,7 +58,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
Tensor inverse_out(scale->type());
inverse_out.Resize(scale->dims());
inverse_out.mutable_data<T>(ctx.GetPlace());
auto runner_inverse =
const auto& runner_inverse =
NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {});
runner_inverse.Run(stream);
tmp_inverse_out = &inverse_out;
......@@ -69,14 +69,14 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
// NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place.
// tmp is only a placeholder.
auto runner_float_status =
const auto& runner_float_status =
NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp},
{{"message", std::string("check_nan_and_inf")}});
runner_float_status.Run(stream);
Tensor sum;
sum.mutable_data<float>({1}, ctx.GetPlace());
auto runner_reduce_sum =
const auto& runner_reduce_sum =
NpuOpRunner("ReduceSumD", {*float_status}, {sum},
{{"axes", std::vector<int>{0}}, {"keep_dims", true}});
runner_reduce_sum.Run(stream);
......@@ -95,7 +95,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace());
if (!found_inf_data) {
// MatMul
auto runner_matmul =
const auto& runner_matmul =
NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
runner_matmul.Run(stream);
}
......@@ -114,7 +114,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<platform::DeviceContext>(), found_inf);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
auto runner_clear_status =
const auto& runner_clear_status =
NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
runner_clear_status.Run(stream);
}
......
......@@ -43,18 +43,18 @@ void Update(const platform::NPUDeviceContext& ctx,
Tensor factor_tensor(bad_out_tensor->type());
factor_tensor.mutable_data<int>({1}, place);
FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor},
{*bad_out_tensor}, {});
const auto& runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor},
{*bad_out_tensor}, {});
runner_p2.Run(stream);
std::vector<int> bad_out_data;
TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
if (bad_out_data[0] == decr_every_n_nan_or_inf) {
auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", decr_ratio},
{"shift", static_cast<float>(0)}});
const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", decr_ratio},
{"shift", static_cast<float>(0)}});
runner_p3.Run(stream);
......@@ -62,11 +62,11 @@ void Update(const platform::NPUDeviceContext& ctx,
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
if (new_loss_scaling[0] < static_cast<T>(1)) {
// updated_loss_scaling_data = 1
auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)},
{"shift", static_cast<float>(1)}});
const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)},
{"shift", static_cast<float>(1)}});
runner_p4.Run(stream);
}
......@@ -86,30 +86,30 @@ void Update(const platform::NPUDeviceContext& ctx,
Tensor factor_tensor(good_out_tensor->type());
factor_tensor.mutable_data<int>({1}, place);
FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
{*good_out_tensor}, {});
const auto& runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
{*good_out_tensor}, {});
runner_p2.Run(stream);
std::vector<int> good_out_data;
TensorToVector(*good_out_tensor, ctx, &good_out_data);
if (good_out_data[0] == incr_every_n_steps) {
auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", incr_ratio},
{"shift", static_cast<float>(0)}});
const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", incr_ratio},
{"shift", static_cast<float>(0)}});
runner_p3.Run(stream);
std::vector<T> new_loss_scaling;
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
if (!std::isfinite(new_loss_scaling[0])) {
// updated_loss_scaling_data = pre_loss_scaling_data
auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(1)},
{"shift", static_cast<float>(0)}});
const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(1)},
{"shift", static_cast<float>(0)}});
runner_p4.Run(stream);
}
......@@ -165,7 +165,7 @@ class LazyZerosNPU {
}
zero_tensor->mutable_data<T>(place);
auto runner_zeros =
const auto& runner_zeros =
NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor});
runner_zeros.Run(stream);
zero_tensor->check_memory_size();
......
......@@ -43,7 +43,7 @@ class AssignNPUKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......
......@@ -78,8 +78,8 @@ class CastNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Cast", {*x}, {*out},
{{"dst_type", static_cast<int32_t>(aclDtype)}});
const auto& runner = NpuOpRunner(
"Cast", {*x}, {*out}, {{"dst_type", static_cast<int32_t>(aclDtype)}});
runner.Run(stream);
}
};
......
......@@ -52,9 +52,11 @@ class ConcatNPUKernel : public framework::OpKernel<T> {
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner(
"ConcatD", {inputs}, {*out},
{{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}});
NpuOpRunner runner{
"ConcatD",
{inputs},
{*out},
{{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}};
runner.AddInputNames(names);
runner.Run(stream);
}
......@@ -101,8 +103,9 @@ class ConcatGradNPUKernel : public framework::OpKernel<T> {
sizes.push_back(ins[j]->dims()[dim]);
}
}
auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]},
{{"offsets", offsets}, {"size", sizes}});
const auto& runner =
NpuOpRunner("SliceD", {*out_grad}, {*outs[j]},
{{"offsets", offsets}, {"size", sizes}});
runner.Run(stream);
}
if (ins[j]->numel() != 0UL) {
......
......@@ -34,7 +34,7 @@ class EqualNPUKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {});
const auto& runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......@@ -51,7 +51,7 @@ class LessThanNPUKernel : public framework::OpKernel<T> {
auto* z = ctx.Output<framework::LoDTensor>("Out");
// int axis = context.Attr<int>("axis");
z->mutable_data<bool>(ctx.GetPlace()); // allocate
auto runner = NpuOpRunner("Less", {*x, *y}, {*z});
const auto& runner = NpuOpRunner("Less", {*x, *y}, {*z});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......
......@@ -40,7 +40,7 @@ class LogicalNotNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {});
const auto& runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {});
runner.Run(stream);
}
};
......
......@@ -32,7 +32,7 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......@@ -82,8 +82,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
}
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
const auto& runner =
NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
......@@ -96,8 +97,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
}
}
if (axes.size() != 0) {
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
{{"axes", axes}, {"keep_dims", true}});
const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopy(
......@@ -123,8 +124,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
}
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
const auto& runner =
NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
......@@ -138,8 +140,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
}
if (axes.size() != 0) {
dy->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy},
{{"axes", axes}, {"keep_dims", true}});
const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopy(
......
......@@ -40,7 +40,7 @@ class ElementwiseDivNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Div", {*x, *y}, {*out}, {});
const auto& runner = NpuOpRunner("Div", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
......@@ -65,46 +65,47 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
Tensor y_power(y->type());
y_power.mutable_data<T>(y->dims(), place);
auto y_power_runner = NpuOpRunner("Power", {*y}, {y_power},
{{"power", static_cast<float>(-1)}});
y_power_runner.Run(stream);
const auto& runner_y_power = NpuOpRunner(
"Power", {*y}, {y_power}, {{"power", static_cast<float>(-1)}});
runner_y_power.Run(stream);
if (dx) {
dx->mutable_data<T>(place);
Tensor tensor_zeros(x->type());
tensor_zeros.mutable_data<T>(x->dims(), place);
auto tensor_zeros_runner =
const auto& runner_tensor_zeros =
NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {});
tensor_zeros_runner.Run(stream);
runner_tensor_zeros.Run(stream);
Tensor x_zero(paddle::framework::proto::VarType::BOOL);
x_zero.mutable_data<bool>(x->dims(), place);
auto x_zero_runner =
const auto& runner_x_zero =
NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {});
x_zero_runner.Run(stream);
runner_x_zero.Run(stream);
Tensor x_nozero(paddle::framework::proto::VarType::BOOL);
x_nozero.mutable_data<bool>(x->dims(), place);
auto x_nozero_runner =
const auto& runner_x_nonzero =
NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {});
x_nozero_runner.Run(stream);
runner_x_nonzero.Run(stream);
Tensor x_nozero_f(x->type());
x_nozero_f.mutable_data<T>(x->dims(), place);
auto x_nozero_f_runner =
const auto& runner_x_nonzero_f =
NpuOpRunner("Cast", {x_nozero}, {x_nozero_f},
{{"dst_type", static_cast<int32_t>(0)}});
x_nozero_f_runner.Run(stream);
runner_x_nonzero_f.Run(stream);
Tensor x_grad_w(x->type());
x_grad_w.mutable_data<T>(x->dims(), place);
auto x_grad_w_runner =
const auto& runner_x_grad_w =
NpuOpRunner("Mul", {x_nozero_f, y_power}, {x_grad_w}, {});
x_grad_w_runner.Run(stream);
runner_x_grad_w.Run(stream);
auto x_grad_runner = NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {});
x_grad_runner.Run(stream);
const auto& runner_x_grad =
NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {});
runner_x_grad.Run(stream);
}
if (dy) {
......@@ -112,16 +113,18 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
Tensor neg_out(y->type());
neg_out.mutable_data<T>(y->dims(), place);
auto neg_out_runner = NpuOpRunner("Neg", {*out}, {neg_out}, {});
neg_out_runner.Run(stream);
const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {});
runner_neg_out.Run(stream);
Tensor y_grad_w(y->type());
y_grad_w.mutable_data<T>(y->dims(), place);
auto y_grad_w_runner = NpuOpRunner("Div", {neg_out, *y}, {y_grad_w}, {});
y_grad_w_runner.Run(stream);
const auto& runner_y_grad_w =
NpuOpRunner("Div", {neg_out, *y}, {y_grad_w}, {});
runner_y_grad_w.Run(stream);
auto y_grad_runner = NpuOpRunner("Mul", {y_grad_w, *dout}, {*dy}, {});
y_grad_runner.Run(stream);
const auto& runner_y_grad =
NpuOpRunner("Mul", {y_grad_w, *dout}, {*dy}, {});
runner_y_grad.Run(stream);
}
}
};
......
......@@ -37,7 +37,7 @@ class ElementwiseFloorDivNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {});
const auto& runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
......
......@@ -40,7 +40,7 @@ class ElementwiseMaxNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {});
const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
......
......@@ -40,7 +40,7 @@ class ElementwiseMinNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {});
const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
......
......@@ -41,7 +41,7 @@ class ElementwiseMulNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {});
const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
......@@ -65,14 +65,14 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel<T> {
if (dx) {
dx->mutable_data<T>(place);
auto dx_runner = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {});
dx_runner.Run(stream);
const auto& runner_dx = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {});
runner_dx.Run(stream);
}
if (dy) {
dy->mutable_data<T>(place);
auto dy_runner = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {});
dy_runner.Run(stream);
const auto& runner_dy = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {});
runner_dy.Run(stream);
}
}
};
......
......@@ -40,7 +40,7 @@ class ElementwisePowNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {});
const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
......
......@@ -33,7 +33,7 @@ class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});
const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
......@@ -84,8 +84,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
}
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
const auto& runner =
NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
......@@ -98,8 +99,8 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
}
}
if (axes.size() != 0) {
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
{{"axes", axes}, {"keep_dims", true}});
const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopy(
......@@ -127,8 +128,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
}
reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
const auto& runner =
NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
......@@ -144,14 +146,15 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
if (axes.size() != 0) {
reduced_dy.Resize(dy->dims());
reduced_dy.mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
{{"axes", axes}, {"keep_dims", true}});
const auto& runner =
NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
tmp_dy = &reduced_dy;
}
// stage 3, negative
auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
const auto& runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
runner.Run(stream);
}
}
......
......@@ -11,7 +11,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <iostream>
#include <memory>
#include <string>
......@@ -65,7 +64,7 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
out0->Resize(out_dims);
out0->mutable_data<T>(context.device_context().GetPlace());
auto runner =
const auto& runner =
NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
......@@ -82,5 +81,3 @@ REGISTER_OP_NPU_KERNEL(
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
#endif
......@@ -68,8 +68,8 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
FillNpuTensorWithConstant<T>(&tensor_tmp, value);
out_var->mutable_data<T>(shape, place);
auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
{{"dims", framework::vectorize(shape)}});
const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
{{"dims", framework::vectorize(shape)}});
runner.Run(stream);
}
};
......
......@@ -33,8 +33,8 @@ class GatherOpNPUKernel : public framework::OpKernel<T> {
auto *out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Gather", {*x, *index}, {*out},
{{"validate_indices", true}});
const auto &runner = NpuOpRunner("Gather", {*x, *index}, {*out},
{{"validate_indices", true}});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......@@ -75,7 +75,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
zeroslike_xout.numel() * sizeof(T), stream);
// step3: scatter(x_grad)
auto runner_scatter = NpuOpRunner(
const auto &runner_scatter = NpuOpRunner(
"TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {});
runner_scatter.Run(stream);
}
......
......@@ -39,7 +39,7 @@ class GeluNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Gelu", {*x}, {*out}, {});
const auto& runner = NpuOpRunner("Gelu", {*x}, {*out}, {});
runner.Run(stream);
}
};
......@@ -63,11 +63,12 @@ class GeluGradNPUKernel : public framework::OpKernel<T> {
Tensor out(x->type());
out.mutable_data<T>(x->dims(), place);
auto out_runner = NpuOpRunner("Gelu", {*x}, {out}, {});
out_runner.Run(stream);
const auto& runner_out = NpuOpRunner("Gelu", {*x}, {out}, {});
runner_out.Run(stream);
auto dx_runner = NpuOpRunner("GeluGrad", {*dout, *x, out}, {*dx}, {});
dx_runner.Run(stream);
const auto& runner_dx =
NpuOpRunner("GeluGrad", {*dout, *x, out}, {*dx}, {});
runner_dx.Run(stream);
}
};
......
......@@ -43,7 +43,7 @@ class IncrementalNPUKernel : public framework::OpKernel<T> {
step_tensor.mutable_data<T>({1}, context.GetPlace());
FillNpuTensorWithConstant<T>(&step_tensor, static_cast<T>(step));
auto runner =
const auto& runner =
NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {});
auto stream =
......
......@@ -81,7 +81,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
Tensor value(x->type());
value.mutable_data<T>({1}, place);
FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
auto runner =
const auto& runner =
NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
runner.Run(stream);
scale = &default_scale;
......@@ -95,7 +95,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
Tensor value(x->type());
value.mutable_data<T>({1}, place);
FillNpuTensorWithConstant<T>(&value, static_cast<T>(0));
auto runner =
const auto& runner =
NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}});
runner.Run(stream);
bias = &default_bias;
......@@ -110,7 +110,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
cast_scale.Resize(scale->dims());
cast_scale.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_scale =
const auto& runner_cast_scale =
NpuOpRunner("Cast", {*scale}, {cast_scale},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_scale.Run(stream);
......@@ -125,7 +125,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
cast_bias.Resize(bias->dims());
cast_bias.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_bias =
const auto& runner_cast_bias =
NpuOpRunner("Cast", {*bias}, {cast_bias},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_bias.Run(stream);
......@@ -163,18 +163,18 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
variance->mutable_data<T>(ctx.GetPlace());
}
auto runner = NpuOpRunner("LayerNorm", {*x, cast_scale, cast_bias},
{*y, *tmp_mean, *tmp_variance},
{{"begin_norm_axis", begin_norm_axis},
{"begin_params_axis", begin_norm_axis},
{"epsilon", epsilon}});
const auto& runner = NpuOpRunner("LayerNorm", {*x, cast_scale, cast_bias},
{*y, *tmp_mean, *tmp_variance},
{{"begin_norm_axis", begin_norm_axis},
{"begin_params_axis", begin_norm_axis},
{"epsilon", epsilon}});
runner.Run(stream);
// cast back from FP16 to FP32
if (x->type() == framework::proto::VarType::FP16 &&
mean->type() == framework::proto::VarType::FP32) {
auto dst_dtype = ConvertToNpuDtype(mean->type());
auto runner_cast_mean =
const auto& runner_cast_mean =
NpuOpRunner("Cast", {*tmp_mean}, {*mean},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_mean.Run(stream);
......@@ -183,7 +183,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
if (x->type() == framework::proto::VarType::FP16 &&
variance->type() == framework::proto::VarType::FP32) {
auto dst_dtype = ConvertToNpuDtype(variance->type());
auto runner_cast_variance =
const auto& runner_cast_variance =
NpuOpRunner("Cast", {*tmp_variance}, {*variance},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_variance.Run(stream);
......@@ -250,7 +250,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
Tensor value(x->type());
value.mutable_data<T>({1}, place);
FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
auto runner =
const auto& runner =
NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
runner.Run(stream);
scale = &default_scale;
......@@ -265,7 +265,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
cast_scale.Resize(scale->dims());
cast_scale.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_scale =
const auto& runner_cast_scale =
NpuOpRunner("Cast", {*scale}, {cast_scale},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_scale.Run(stream);
......@@ -280,7 +280,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
cast_mean.Resize(mean->dims());
cast_mean.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_mean =
const auto& runner_cast_mean =
NpuOpRunner("Cast", {*mean}, {cast_mean},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_mean.Run(stream);
......@@ -295,7 +295,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
cast_variance.Resize(variance->dims());
cast_variance.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(x->type());
auto runner_cast_variance =
const auto& runner_cast_variance =
NpuOpRunner("Cast", {*variance}, {cast_variance},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_variance.Run(stream);
......@@ -343,16 +343,16 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
dbias->mutable_data<T>(ctx.GetPlace());
}
auto runner = NpuOpRunner("LayerNormGrad",
{*dy, *x, cast_variance, cast_mean, cast_scale},
{*dx, *tmp_dscale, *tmp_dbias}, {});
const auto& runner = NpuOpRunner(
"LayerNormGrad", {*dy, *x, cast_variance, cast_mean, cast_scale},
{*dx, *tmp_dscale, *tmp_dbias}, {});
runner.Run(stream);
// cast back from FP16 to FP32
if (x->type() == framework::proto::VarType::FP16 &&
dscale->type() == framework::proto::VarType::FP32) {
auto dst_dtype = ConvertToNpuDtype(dscale->type());
auto runner_cast_dscale =
const auto& runner_cast_dscale =
NpuOpRunner("Cast", {*tmp_dscale}, {*dscale},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_dscale.Run(stream);
......@@ -361,7 +361,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
if (x->type() == framework::proto::VarType::FP16 &&
dbias->type() == framework::proto::VarType::FP32) {
auto dst_dtype = ConvertToNpuDtype(dbias->type());
auto runner_cast_dbias =
const auto& runner_cast_dbias =
NpuOpRunner("Cast", {*tmp_dbias}, {*dbias},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_dbias.Run(stream);
......
......@@ -41,7 +41,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
output_t->mutable_data<T>(ctx.GetPlace());
framework::NPUAttributeMap attr_input = {{"validate_indices", false}};
auto runner =
const auto &runner =
NpuOpRunner("Gather", {*table_t, *ids_t}, {*output_t}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
......@@ -65,14 +65,14 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner_zeros =
const auto &runner_zeros =
NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
runner_zeros.Run(stream);
// NOTE(zhiqiu): It seems in cann 20.1, the first input and output
// can be different tensors, but in cann 20.2+ it does an in-place operation.
// Thus, the first input and output should be the same tensor.
auto runner_scatter =
const auto &runner_scatter =
NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
{*table_grad_t}, {{"use_locking", true}});
runner_scatter.Run(stream);
......
......@@ -34,7 +34,7 @@ class MatMulV2NPUKernel : public framework::OpKernel<T> {
if (x->dims().size() == 2) {
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner(
const auto& runner = NpuOpRunner(
"MatMul", {*x, *y}, {*out},
{{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}});
......@@ -46,7 +46,7 @@ class MatMulV2NPUKernel : public framework::OpKernel<T> {
} else if (x->dims().size() > 2) {
out->mutable_data<T>(ctx.GetPlace());
auto runner =
const auto& runner =
NpuOpRunner("BatchMatMul", {*x, *y}, {*out},
{{"adj_x1", transpose_x}, {"adj_x2", transpose_y}});
......@@ -76,7 +76,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
if (transpose_y) {
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx =
const auto& runner_dx =
NpuOpRunner("MatMul", {*dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", false}});
......@@ -84,7 +84,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy =
const auto& runner_dy =
NpuOpRunner("MatMul", {*dout, *x}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}});
......@@ -94,7 +94,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
} else {
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx =
const auto& runner_dx =
NpuOpRunner("MatMul", {*dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", true}});
......@@ -102,7 +102,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy =
const auto& runner_dy =
NpuOpRunner("MatMul", {*x, *dout}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}});
......@@ -113,30 +113,34 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
if (transpose_y) {
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
{{"adj_x1", false}, {"adj_x2", false}});
const auto& runner_dx =
NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
{{"adj_x1", false}, {"adj_x2", false}});
runner_dx.Run(stream);
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy = NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy},
{{"adj_x1", true}, {"adj_x2", false}});
const auto& runner_dy =
NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy},
{{"adj_x1", true}, {"adj_x2", false}});
runner_dy.Run(stream);
}
} else {
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
{{"adj_x1", false}, {"adj_x2", true}});
const auto& runner_dx =
NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
{{"adj_x1", false}, {"adj_x2", true}});
runner_dx.Run(stream);
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy = NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy},
{{"adj_x1", true}, {"adj_x2", false}});
const auto& runner_dy =
NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy},
{{"adj_x1", true}, {"adj_x2", false}});
runner_dy.Run(stream);
}
}
......
......@@ -30,7 +30,7 @@ class MeanNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input);
const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
......@@ -61,7 +61,7 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
// ones
Tensor ones(grad->type());
ones.mutable_data<T>(IG->dims(), context.GetPlace());
auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
runner_ones.Run(stream);
// means
......@@ -75,11 +75,12 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
Tensor mean_ma(grad->type());
mean_ma.Resize(IG->dims());
mean_ma.mutable_data<T>(context.GetPlace());
auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
const auto& runner_mul_1 =
NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
runner_mul_1.Run(stream);
// and mul grad
auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
const auto& runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
runner_mul_2.Run(stream);
}
};
......
......@@ -47,7 +47,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
if (indices->type() != framework::proto::VarType::INT32) {
cast_indices.Resize(indices->dims());
cast_indices.mutable_data<int>(ctx.GetPlace());
auto runner_cast_indices =
const auto& runner_cast_indices =
NpuOpRunner("Cast", {*indices}, {cast_indices},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_indices.Run(stream);
......@@ -57,7 +57,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
if (label->type() != framework::proto::VarType::INT32) {
cast_label.Resize(label->dims());
cast_label.mutable_data<int>(ctx.GetPlace());
auto runner_cast_label =
const auto& runner_cast_label =
NpuOpRunner("Cast", {*label}, {cast_label},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_label.Run(stream);
......@@ -73,7 +73,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
Tensor tmp_equal(framework::proto::VarType::BOOL);
tmp_equal.Resize(inference->dims());
tmp_equal.mutable_data<bool>(ctx.GetPlace());
auto runner_equal =
const auto& runner_equal =
NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {});
runner_equal.Run(stream);
......@@ -81,7 +81,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
Tensor tmp_equal_cast(framework::proto::VarType::FP32);
tmp_equal_cast.Resize(inference->dims());
tmp_equal_cast.mutable_data<float>(ctx.GetPlace());
auto runner_cast_equal = NpuOpRunner(
const auto& runner_cast_equal = NpuOpRunner(
"Cast", {tmp_equal}, {tmp_equal_cast},
{{"dst_type",
static_cast<int>(ConvertToNpuDtype(tmp_equal_cast.type()))}});
......@@ -92,7 +92,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
Tensor tmp_correct_max(framework::proto::VarType::FP32);
tmp_correct_max.Resize(framework::make_ddim({num_samples}));
tmp_correct_max.mutable_data<float>(ctx.GetPlace());
auto runner_reduce_max =
const auto& runner_reduce_max =
NpuOpRunner("ReduceMaxD", {tmp_equal_cast}, {tmp_correct_max},
{{"axes", std::vector<int>{1}}, {"keep_dims", false}});
runner_reduce_max.Run(stream);
......@@ -101,14 +101,14 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
Tensor tmp_correct(framework::proto::VarType::FP32);
tmp_correct.Resize(correct->dims());
tmp_correct.mutable_data<float>(ctx.GetPlace());
auto runner_reduce_sum =
const auto& runner_reduce_sum =
NpuOpRunner("ReduceSumD", {tmp_correct_max}, {tmp_correct},
{{"axes", std::vector<int>{0}}, {"keep_dims", false}});
runner_reduce_sum.Run(stream);
// cast to int
correct->mutable_data<int>(ctx.GetPlace());
auto runner_cast_correct = NpuOpRunner(
const auto& runner_cast_correct = NpuOpRunner(
"Cast", {tmp_correct}, {*correct},
{{"dst_type", static_cast<int>(ConvertToNpuDtype(correct->type()))}});
runner_cast_correct.Run(stream);
......@@ -126,7 +126,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
// [accuracy]
accuracy->mutable_data<float>(ctx.GetPlace());
auto runner_accuracy =
const auto& runner_accuracy =
NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {});
runner_accuracy.Run(stream);
}
......
......@@ -36,7 +36,7 @@ class MulNPUKernel : public framework::OpKernel<T> {
if (x_num_col_dims == 1 && y_num_col_dims == 1) {
if (x->dims().size() == 2 && y->dims().size() == 2) {
out->mutable_data<T>(ctx.GetPlace());
auto runner =
const auto& runner =
NpuOpRunner("MatMul", {*x, *y}, {*out},
{{"transpose_x1", false}, {"transpose_x2", false}});
......@@ -54,7 +54,7 @@ class MulNPUKernel : public framework::OpKernel<T> {
tmp_x.Resize(framework::make_ddim({first_dim, sec_dim}));
out->mutable_data<T>(ctx.GetPlace());
// matmul
auto runner =
const auto& runner =
NpuOpRunner("MatMul", {tmp_x, *y}, {*out},
{{"transpose_x1", false}, {"transpose_x2", false}});
runner.Run(stream);
......@@ -85,7 +85,7 @@ class MulNPUKernel : public framework::OpKernel<T> {
tmp_matmul.Resize(framework::make_ddim({first_dim, y->dims()[1]}));
tmp_matmul.mutable_data<T>(ctx.GetPlace());
auto runner_matmul =
const auto& runner_matmul =
NpuOpRunner("MatMul", {tmp_x, *y}, {tmp_matmul},
{{"transpose_x1", false}, {"transpose_x2", false}});
......@@ -121,7 +121,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
if (x->dims().size() == 2 && y->dims().size() == 2) {
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
auto runner_dx =
const auto& runner_dx =
NpuOpRunner("MatMul", {*dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", true}});
......@@ -130,7 +130,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy =
const auto& runner_dy =
NpuOpRunner("MatMul", {*x, *dout}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}});
......@@ -144,7 +144,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
dx->mutable_data<T>(ctx.GetPlace());
auto dx_dims = dx->dims();
dx->Resize(framework::make_ddim({dout->dims()[0], y->dims()[0]}));
auto runner_matmul =
const auto& runner_matmul =
NpuOpRunner("MatMul", {*dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", true}});
runner_matmul.Run(stream);
......@@ -164,7 +164,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<platform::DeviceContext>(), &tmp_x);
tmp_x.Resize(framework::make_ddim({first_dim, sec_dim}));
dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy =
const auto& runner_dy =
NpuOpRunner("MatMul", {tmp_x, *dout}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}});
......@@ -193,7 +193,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
dx->mutable_data<T>(ctx.GetPlace());
auto dx_dims = dx->dims();
dx->Resize(framework::make_ddim({dout_first_dim, y->dims()[0]}));
auto runner_matmul =
const auto& runner_matmul =
NpuOpRunner("MatMul", {tmp_dout, *y}, {*dx},
{{"transpose_x1", false}, {"transpose_x2", true}});
runner_matmul.Run(stream);
......@@ -213,7 +213,7 @@ class MulGradNPUKernel : public framework::OpKernel<T> {
tmp_x.Resize(framework::make_ddim({first_dim, sec_dim}));
// matmul [6,4] [6,5] => [4,5]
dy->mutable_data<T>(ctx.GetPlace());
auto runner_dy =
const auto& runner_dy =
NpuOpRunner("MatMul", {tmp_x, tmp_dout}, {*dy},
{{"transpose_x1", true}, {"transpose_x2", false}});
runner_dy.Run(stream);
......
......@@ -89,7 +89,21 @@ NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs,
}
NpuOpRunner::~NpuOpRunner() {
// TODO(zhiqiu): handle free
VLOG(5) << "Free NpuOpRunner(" << this << ") of " << op_type_;
// Is it safe to free the descs/buffers after Run is called on the host?
aclopDestroyAttr(attr_); // return void
for (auto desc : input_descs_) {
aclDestroyTensorDesc(desc);
}
for (auto desc : output_descs_) {
aclDestroyTensorDesc(desc);
}
for (auto buffer : input_buffers_) {
PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer));
}
for (auto buffer : output_buffers_) {
PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer));
}
}
const std::string &NpuOpRunner::Type() { return op_type_; }
......@@ -186,6 +200,8 @@ NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) {
}
NpuOpRunner &NpuOpRunner::AddInputs(const std::vector<Tensor> &tensors) {
input_descs_.reserve(tensors.size());
input_buffers_.reserve(tensors.size());
for (auto tensor : tensors) {
// create aclTensorDesc
input_descs_.emplace_back(CreateTensorDesc(tensor));
......@@ -211,6 +227,8 @@ NpuOpRunner &NpuOpRunner::AddInputNames(const std::vector<std::string> &names) {
}
NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector<Tensor> &tensors) {
output_descs_.reserve(tensors.size());
output_buffers_.reserve(tensors.size());
for (auto tensor : tensors) {
// create aclTensorDesc
output_descs_.emplace_back(CreateTensorDesc(tensor));
......@@ -281,12 +299,12 @@ aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
return buffer;
}
void NpuOpRunner::Run(aclrtStream stream) {
void NpuOpRunner::Run(aclrtStream stream) const {
if (!stream) {
VLOG(4) << "Run with default current npu stream: " << stream;
stream = GetCurrentNPUStream();
}
VLOG(5) << "NpuOpRunner(" << this << ") Run:";
VLOG(4) << "op_type: " << op_type_;
VLOG(4) << "input_desc.size: " << input_descs_.size();
VLOG(4) << "output_desc.size: " << output_descs_.size();
......
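
Two details of the runner implementation above are worth noting. First, `Run` becomes const-qualified, which is what lets call sites hold the runner through `const auto&`: a non-const member function cannot be called through a reference to const. A minimal sketch with a stand-in class:

struct Runner {
  void Run() const {}  // callable through a const reference
  void Reset() {}      // non-const: not callable through a const reference
};

int main() {
  const auto& r = Runner();
  r.Run();       // OK: Run() is const
  // r.Reset();  // compile error: r is a reference to const
  return 0;
}

Second, `reserve` is now called on the desc/buffer vectors before the AddInputs/AddOutputs loops, which avoids reallocation during the subsequent emplace_back calls.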
......@@ -41,6 +41,14 @@ class NpuOpRunner {
const std::vector<Tensor> &outputs = {},
const NPUAttributeMap &attrs = {});
// NOTE(zhiqiu): why forbid copy and operator= ?
// Since we will free the tensor_descs and data_buffers in the ~NpuOpRunner,
// if shallow copy is performed on tensor_descs and data_buffers, it may
// result in use-after-free bugs.
NpuOpRunner(const NpuOpRunner &runner) = delete;
NpuOpRunner &operator=(const NpuOpRunner &runner) = delete;
~NpuOpRunner();
const std::string &Type();
......@@ -71,7 +79,7 @@ class NpuOpRunner {
std::vector<aclDataBuffer *> &GetOutputBuffers();
void Run(aclrtStream stream = nullptr);
void Run(aclrtStream stream = nullptr) const;
private:
aclTensorDesc *CreateTensorDesc(Tensor tensor);
......
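
The NOTE above is the crux of the header change: the destructor now releases the aclTensorDesc and aclDataBuffer handles, so an implicitly generated copy would duplicate the raw handles and both destructors would release them — a double free or use-after-free. Deleting the copy constructor and copy assignment turns that mistake into a compile error. A minimal sketch of the hazard, with a plain heap allocation standing in for the ACL handles:

#include <cstdlib>

class Owner {
 public:
  Owner() : p_(std::malloc(16)) {}
  ~Owner() { std::free(p_); }  // releases the resource, like ~NpuOpRunner

  // Without these deletions, the implicit copy would duplicate the raw
  // pointer and two destructors would free the same allocation.
  Owner(const Owner&) = delete;
  Owner& operator=(const Owner&) = delete;

 private:
  void* p_;
};

int main() {
  Owner a;
  // Owner b = a;  // compile error now; previously a latent double free
  return 0;
}
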
......@@ -147,7 +147,7 @@ class AdamNPUKernel : public framework::OpKernel<T> {
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner =
const auto& runner =
NpuOpRunner("ApplyAdamD",
{
*param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr,
......@@ -179,10 +179,10 @@ class AdamNPUKernel : public framework::OpKernel<T> {
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
auto runner_m1 =
const auto& runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
runner_m1.Run(stream);
auto runner_m2 =
const auto& runner_m2 =
NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
runner_m2.Run(stream);
}
......
......@@ -32,7 +32,7 @@ class SGDNPUKernel : public framework::OpKernel<T> {
param_out->mutable_data<T>(ctx.GetPlace());
auto runner =
const auto& runner =
NpuOpRunner("ApplyGradientDescent",
{*param_var, *learning_rate, *grad_var}, {*param_out}, {});
......
......@@ -38,7 +38,7 @@ class ReduceAnyNPUKernel : public framework::OpKernel<T> {
// set attr
NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}};
auto runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr);
const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......
......@@ -51,7 +51,7 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> {
cast_x.Resize(x->dims());
cast_x.mutable_data<float>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
auto runner_cast = NpuOpRunner(
const auto& runner_cast = NpuOpRunner(
"Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);
......@@ -68,20 +68,22 @@ class ReduceSumNPUKernel : public framework::OpKernel<T> {
dim_vec.push_back(i);
}
auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dim_vec}, {"keep_dims", keep_dims}});
const auto& runner =
NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dim_vec}, {"keep_dims", keep_dims}});
runner.Run(stream);
} else {
auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dims}, {"keep_dims", keep_dims}});
const auto& runner =
NpuOpRunner("ReduceSumD", {cast_x}, {cast_out},
{{"axes", dims}, {"keep_dims", keep_dims}});
runner.Run(stream);
}
if (x->type() != framework::proto::VarType::FP32 &&
x->type() != framework::proto::VarType::FP16) {
auto dst_dtype = ConvertToNpuDtype(out->type());
auto runner_cast =
const auto& runner_cast =
NpuOpRunner("Cast", {cast_out}, {*out},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);
......@@ -107,8 +109,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (keep_dims || reduce_all) {
auto runner = NpuOpRunner("BroadcastToD", {*out_grad}, {*x_grad},
{{"shape", framework::vectorize(x->dims())}});
const auto& runner =
NpuOpRunner("BroadcastToD", {*out_grad}, {*x_grad},
{{"shape", framework::vectorize(x->dims())}});
runner.Run(stream);
} else {
framework::DDim out_dims;
......@@ -124,8 +127,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
&out_grad_tmp);
out_grad_tmp.Resize(out_dims);
auto runner = NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad},
{{"shape", framework::vectorize(x->dims())}});
const auto& runner =
NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad},
{{"shape", framework::vectorize(x->dims())}});
runner.Run(stream);
}
}
......
......@@ -38,7 +38,7 @@ class ScaleNPUKernel : public framework::OpKernel<T> {
<< " ,bias_after_scale:" << bias_after_scale;
if (bias_after_scale) {
out->mutable_data<T>(ctx.GetPlace());
auto runner =
const auto& runner =
NpuOpRunner("Power", {*x}, {*out},
{{"power", _power}, {"scale", scale}, {"shift", bias}});
......@@ -47,12 +47,13 @@ class ScaleNPUKernel : public framework::OpKernel<T> {
Tensor tmp_x(x->type());
tmp_x.Resize(x->dims());
tmp_x.mutable_data<T>(ctx.GetPlace());
auto runner_tmp = NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}});
const auto& runner_tmp =
NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}});
runner_tmp.Run(stream);
out->mutable_data<T>(ctx.GetPlace());
float _bias = 0.0;
auto runner =
const auto& runner =
NpuOpRunner("Power", {tmp_x}, {*out},
{{"power", _power}, {"scale", scale}, {"shift", _bias}});
runner.Run(stream);
......
......@@ -53,11 +53,11 @@ class ScatterNPUKernel : public framework::OpKernel<T> {
.stream();
if (overwrite) {
auto runner_update = NpuOpRunner("TensorScatterUpdate",
{*x, *index, *updates}, {*out}, {});
const auto& runner_update = NpuOpRunner(
"TensorScatterUpdate", {*x, *index, *updates}, {*out}, {});
runner_update.Run(stream);
} else {
auto runner_add =
const auto& runner_add =
NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {});
runner_add.Run(stream);
}
......
......@@ -72,8 +72,8 @@ class SliceNPUKernel : public framework::OpKernel<T> {
UpdateAttr(in_dims, axes, starts, ends, &offsets, &size);
auto runner = NpuOpRunner("SliceD", {*input}, {*out},
{{"offsets", offsets}, {"size", size}});
const auto& runner = NpuOpRunner("SliceD", {*input}, {*out},
{{"offsets", offsets}, {"size", size}});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
......@@ -111,7 +111,7 @@ class SliceGradNPUKernel : public framework::OpKernel<T> {
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner =
const auto& runner =
NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}});
runner.Run(stream);
}
......
......@@ -31,7 +31,7 @@ class SoftmaxNPUKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<framework::LoDTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input);
const auto& runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
......@@ -71,8 +71,8 @@ class SoftmaxGradNPUKernel : public framework::OpKernel<T> {
dX->mutable_data<T>(ctx.GetPlace());
framework::NPUAttributeMap attr_input = {};
auto runner = NpuOpRunner(std::string("SoftmaxGrad"), {tmp_out, tmp_dOut},
{*dX}, attr_input);
const auto& runner = NpuOpRunner(std::string("SoftmaxGrad"),
{tmp_out, tmp_dOut}, {*dX}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
......
......@@ -47,7 +47,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
// softmax
softmax->mutable_data<T>(ctx.GetPlace());
auto runner_softmax =
const auto& runner_softmax =
NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, {{"axes", axes}});
runner_softmax.Run(stream);
......@@ -57,7 +57,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
tmp_labels.Resize(labels->dims());
tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32);
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32);
auto runner_cast_label =
const auto& runner_cast_label =
NpuOpRunner("Cast", {*labels}, {tmp_labels},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_label.Run(stream);
......@@ -77,7 +77,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
tmp_onehot.Resize(logits->dims());
tmp_onehot.mutable_data<int>(ctx.GetPlace());
auto runner_onehot =
const auto& runner_onehot =
NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot},
{{"axis", -1}, {"depth", cls_num}});
runner_onehot.Run(stream);
......@@ -87,7 +87,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
cast_onehot.Resize(tmp_onehot.dims());
cast_onehot.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(logits->type());
auto runner_cast_onehot =
const auto& runner_cast_onehot =
NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_onehot.Run(stream);
......@@ -102,8 +102,9 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
// SoftmaxCrossEntropyWithLogits requires loss to be of shape [batch_size]
auto loss_dims = loss->dims();
loss->Resize({loss_dims[0]});
auto runner_s = NpuOpRunner("SoftmaxCrossEntropyWithLogits",
{*logits, cast_onehot}, {*loss, backprop}, {});
const auto& runner_s =
NpuOpRunner("SoftmaxCrossEntropyWithLogits", {*logits, cast_onehot},
{*loss, backprop}, {});
runner_s.Run(stream);
loss->Resize(loss_dims);
}
......@@ -130,7 +131,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
tmp_labels.Resize(labels->dims());
tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32);
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32);
auto runner_cast_label =
const auto& runner_cast_label =
NpuOpRunner("Cast", {*labels}, {tmp_labels},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_label.Run(stream);
......@@ -150,7 +151,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
tmp_onehot.Resize(softmax->dims());
tmp_onehot.mutable_data<int>(ctx.GetPlace());
auto runner_onehot =
const auto& runner_onehot =
NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot},
{{"axis", -1}, {"depth", cls_num}});
runner_onehot.Run(stream);
......@@ -160,7 +161,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
cast_onehot.Resize(tmp_onehot.dims());
cast_onehot.mutable_data<T>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(softmax->type());
auto runner_cast_onehot =
const auto& runner_cast_onehot =
NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_onehot.Run(stream);
......@@ -169,13 +170,13 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
Tensor tmp_sub(softmax->type());
tmp_sub.Resize(softmax->dims());
tmp_sub.mutable_data<T>(ctx.GetPlace());
auto runner_sub =
const auto& runner_sub =
NpuOpRunner("Sub", {*softmax, cast_onehot}, {tmp_sub}, {});
runner_sub.Run(stream);
// mul
logits_grad->mutable_data<T>(ctx.GetPlace());
auto runner_mul =
const auto& runner_mul =
NpuOpRunner("Mul", {*loss_grad, tmp_sub}, {*logits_grad}, {});
runner_mul.Run(stream);
}
......
......@@ -69,7 +69,7 @@ class StackNPUKernel : public framework::OpKernel<T> {
tmp_stack.Resize(framework::make_ddim(vec_dim_tmp));
tmp_stack.mutable_data<T>(ctx.GetPlace());
auto runner =
const auto& runner =
NpuOpRunner("Pack", {x_list}, {tmp_stack}, {{"axis", 0}, {"N", N}});
runner.Run(stream);
......@@ -81,12 +81,12 @@ class StackNPUKernel : public framework::OpKernel<T> {
}
}
auto runner_trans_final =
const auto& runner_trans_final =
NpuOpRunner("TransposeD", {tmp_stack}, {*out}, {{"perm", vec_trans}});
runner_trans_final.Run(stream);
} else {
auto runner =
const auto& runner =
NpuOpRunner("Pack", {x_list}, {*out}, {{"axis", axis}, {"N", N}});
runner.Run(stream);
}
......
......@@ -43,12 +43,12 @@ class SumNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {});
const auto& runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {});
runner.Run(stream);
for (int i = 2; i < n; i++) {
runner = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {});
runner.Run(stream);
const auto& runner1 = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {});
runner1.Run(stream);
}
}
};
......
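
The sum kernel above needed more than the mechanical substitution: the old code reassigned `runner` inside the loop, but a reference cannot be reseated (and the copy assignment is deleted anyway), so each iteration now constructs a fresh runner under a new name. A minimal sketch of the constraint:

struct Runner {
  Runner() = default;
  Runner(const Runner&) = delete;
  Runner& operator=(const Runner&) = delete;
  void Run() const {}
};

int main() {
  const auto& runner = Runner();
  runner.Run();
  // runner = Runner();  // compile error: a reference cannot be reseated,
  //                     // and the copy assignment is deleted anyway
  for (int i = 2; i < 5; ++i) {
    const auto& runner_i = Runner();  // fresh temporary each iteration,
    runner_i.Run();                   // destroyed at the end of the body
  }
  return 0;
}
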
......@@ -67,8 +67,8 @@ class TopkNPUKernel : public framework::OpKernel<T> {
tmp_indices.mutable_data<int>(ctx.GetPlace());
// run ascend
auto runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor},
{*output, tmp_indices}, attr_input);
const auto& runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor},
{*output, tmp_indices}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......@@ -76,7 +76,7 @@ class TopkNPUKernel : public framework::OpKernel<T> {
// cast indices from INT32 to INT64
auto dst_dtype = ConvertToNpuDtype(indices->type());
auto runner_cast_indices =
const auto& runner_cast_indices =
NpuOpRunner("Cast", {tmp_indices}, {*indices},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_indices.Run(stream);
......
......@@ -29,7 +29,7 @@ class TransposeNPUKernel : public framework::OpKernel<T> {
std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
framework::NPUAttributeMap attr_input = {{"perm", axis}};
out->mutable_data<T>(ctx.device_context().GetPlace());
auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
const auto& runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......@@ -52,7 +52,8 @@ class TransposeGradNPUKernel : public framework::OpKernel<T> {
}
x_grad->mutable_data<T>(ctx.GetPlace());
framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}};
auto runner = NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input);
const auto& runner =
NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......
......@@ -59,7 +59,7 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel<T> {
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner(
const auto& runner = NpuOpRunner(
"ParameterizedTruncatedNormal",
{shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor}, {*out},
{{"seed", seed_var}});
......
......@@ -251,8 +251,8 @@ class TestNet(unittest.TestCase):
cpu_pred, cpu_loss = self._test(False)
npu_pred, npu_loss = self._test(True)
self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-4))
self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-4))
self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3))
self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
......@@ -335,8 +335,8 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
cpu_pred, cpu_loss = self._test(False)
npu_pred, npu_loss = self._test(True)
self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-4))
self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-4))
self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3))
self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3))
if __name__ == '__main__':
......