未验证 提交 d523dffb 编写于 作者: L Leo Chen 提交者: GitHub

[NPU] avoid tensor copy in check_finite_and_scale (#33244)

上级 53d3f5eb
......@@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
found_inf->mutable_data<bool>(ctx.GetPlace());
bool found_inf_data = false;
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
// step1: inverse scale(RealDiv)
// step1: inverse scale
Tensor const_tensor;
const_tensor.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));
......@@ -66,7 +64,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
// NOTE(zhiqiu):
Tensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace());
// NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place.
// tmp is only placeholder.
const auto& runner_float_status =
......@@ -81,39 +78,26 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
{{"axes", std::vector<int>{0}}, {"keep_dims", true}});
runner_reduce_sum.Run(stream);
std::vector<float> sum_vec;
TensorToVector(
sum, ctx.template device_context<paddle::platform::NPUDeviceContext>(),
&sum_vec);
found_inf_data = (sum_vec[0] > 1);
VLOG(4) << "found_inf_data:" << found_inf_data;
const auto& runner_greater =
NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {});
runner_greater.Run(stream);
// NOTE(zhiqiu): The normal logic is :
// out = in, if found_inf = true
// out = in/scale, if found_inf = false
// However, on NPU, in order to avoid stream sync, we do not copy the
// found_inf data to cpu to check whether to unscale or not.
// Instead, we do the Mul no matter found_inf or not.
// And, a fact is, only few steps contains nan/inf during training.
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace());
if (!found_inf_data) {
// MatMul
const auto& runner_matmul =
NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
runner_matmul.Run(stream);
}
const auto& runner_mul =
NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
runner_mul.Run(stream);
}
// set found_inf to true
VLOG(4) << "found overflow:" << found_inf_data;
Tensor found_inf_tensor;
found_inf_tensor.Resize({1});
bool* is_found_inf =
found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
*is_found_inf = found_inf_data;
framework::TensorCopy(
found_inf_tensor, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), found_inf);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
const auto& runner_clear_status =
NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
runner_clear_status.Run(stream);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册