From d523dffbf96f24517b6ea8d851b785c96de0fd01 Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Fri, 4 Jun 2021 13:18:33 +0800
Subject: [PATCH] [NPU] avoid tensor copy in check_finite_and_scale (#33244)

---
 .../amp/check_finite_and_unscale_op_npu.cc    | 46 ++++++-------------
 1 file changed, 15 insertions(+), 31 deletions(-)
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
index 53b91f540c..26280cd2bd 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
@@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
 
     found_inf->mutable_data<bool>(ctx.GetPlace());
 
-    bool found_inf_data = false;
-
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    // step1: inverse scale(RealDiv)
+    // step1: inverse scale
     Tensor const_tensor;
     const_tensor.mutable_data<T>({1}, ctx.GetPlace());
     FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));
@@ -66,7 +64,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
     // NOTE(zhiqiu):
     Tensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
-
     // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place.
     // tmp is only placeholder.
     const auto& runner_float_status =
@@ -81,39 +78,26 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
                     {{"axes", std::vector<int>{0}}, {"keep_dims", true}});
     runner_reduce_sum.Run(stream);
 
-    std::vector<float> sum_vec;
-    TensorToVector(
-        sum, ctx.template device_context<paddle::platform::NPUDeviceContext>(),
-        &sum_vec);
-    found_inf_data = (sum_vec[0] > 1);
-
-    VLOG(4) << "found_inf_data:" << found_inf_data;
-
+    const auto& runner_greater =
+        NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {});
+    runner_greater.Run(stream);
+
+    // NOTE(zhiqiu): The normal logic is :
+    // out = in, if found_inf = true
+    // out = in/scale, if found_inf = false
+    // However, on NPU, in order to avoid stream sync, we do not copy the
+    // found_inf data to cpu to check whether to unscale or not.
+    // Instead, we do the Mul no matter found_inf or not.
+    // And, a fact is, only few steps contains nan/inf during training.
     for (size_t i = 0; i < xs.size(); ++i) {
       const auto* x = xs[i];
       auto* out = outs[i];
       out->mutable_data<T>(ctx.GetPlace());
-      if (!found_inf_data) {
-        // MatMul
-        const auto& runner_matmul =
-            NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
-        runner_matmul.Run(stream);
-      }
+      const auto& runner_mul =
+          NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
+      runner_mul.Run(stream);
     }
 
-    // set found_inf to true
-    VLOG(4) << "found overflow:" << found_inf_data;
-    Tensor found_inf_tensor;
-    found_inf_tensor.Resize({1});
-    bool* is_found_inf =
-        found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
-    *is_found_inf = found_inf_data;
-
-    framework::TensorCopy(
-        found_inf_tensor, ctx.GetPlace(),
-        ctx.template device_context<platform::DeviceContext>(), found_inf);
-    ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-
     const auto& runner_clear_status =
         NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
     runner_clear_status.Run(stream);
-- 
GitLab