From d523dffbf96f24517b6ea8d851b785c96de0fd01 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 4 Jun 2021 13:18:33 +0800 Subject: [PATCH] [NPU] avoid tensor copy in check_finite_and_scale (#33244) --- .../amp/check_finite_and_unscale_op_npu.cc | 46 ++++++------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 53b91f540c..26280cd2bd 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { found_inf->mutable_data(ctx.GetPlace()); - bool found_inf_data = false; - auto stream = ctx.template device_context() .stream(); - // step1: inverse scale(RealDiv) + // step1: inverse scale Tensor const_tensor; const_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); @@ -66,7 +64,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { // NOTE(zhiqiu): Tensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); - // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. // tmp is only placeholder. 
const auto& runner_float_status = @@ -81,39 +78,26 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { {{"axes", std::vector{0}}, {"keep_dims", true}}); runner_reduce_sum.Run(stream); - std::vector sum_vec; - TensorToVector( - sum, ctx.template device_context(), - &sum_vec); - found_inf_data = (sum_vec[0] > 1); - - VLOG(4) << "found_inf_data:" << found_inf_data; - + const auto& runner_greater = + NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {}); + runner_greater.Run(stream); + + // NOTE(zhiqiu): The normal logic is: + // out = in, if found_inf = true + // out = in/scale, if found_inf = false + // However, on NPU, in order to avoid stream sync, we do not copy the + // found_inf data to CPU to check whether to unscale or not. + // Instead, we do the Mul regardless of whether found_inf is set. + // In practice, only a few steps contain nan/inf during training. for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(ctx.GetPlace()); - if (!found_inf_data) { - // MatMul - const auto& runner_matmul = - NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); - runner_matmul.Run(stream); - } + const auto& runner_mul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_mul.Run(stream); } - // set found_inf to true - VLOG(4) << "found overflow:" << found_inf_data; - Tensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool* is_found_inf = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - *is_found_inf = found_inf_data; - - framework::TensorCopy( - found_inf_tensor, ctx.GetPlace(), - ctx.template device_context(), found_inf); - ctx.template device_context().Wait(); - const auto& runner_clear_status = NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); runner_clear_status.Run(stream); -- GitLab