未验证 提交 5ac71b36 编写于 作者: Z Zhen Wang 提交者: GitHub

Remove tensor copy in the update_loss_scaling op. (#29426)

* remove tensor copy in the update_loss_scaling op

* not use thrust.

* fix some cuda memory access error.
上级 5fe1f8af
......@@ -135,18 +135,18 @@ class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
};
template <typename T>
class LazyZeroInputs<platform::CPUDeviceContext, T> {
class LazyZeros<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
if (*found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
if (*found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
std::memset(out_data, 0, num * sizeof(T));
}
}
......
......@@ -32,6 +32,17 @@ __global__ void GpuUpdateLossScaling(
updated_loss_scaling_data, good_out_data, bad_out_data);
}
// Fills data[0..num) with `value`, but only when the device-side flag
// *has_inf is true. Reading the flag on the GPU is the point of this
// kernel: it removes the device->host copy (and the stream sync) that a
// host-side check would require. Uses a grid-stride loop, so any 1-D
// launch configuration covers all `num` elements.
template <typename T>
__global__ void FillIf(T* data, const int64_t num, const T value,
                       const bool* has_inf) {
  if (*has_inf) {
    // 64-bit index and stride: `num` is int64_t, so a 32-bit induction
    // variable (and the 32-bit product blockIdx.x * blockDim.x) could
    // overflow for tensors with more than 2^31 elements.
    int64_t tid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
    for (int64_t i = tid; i < num; i += stride) {
      data[i] = value;
    }
  }
}
template <typename T>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
public:
......@@ -50,26 +61,20 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
};
// CUDA specialization: zeroes every output tensor iff the device-side
// flag *found_inf_data is true. The check happens inside FillIf on the
// GPU, so no device->host copy of the flag (and no dev_ctx.Wait()) is
// needed -- this is the host/device sync this commit removes.
template <typename T>
class LazyZeros<platform::CUDADeviceContext, T> {
 public:
  // Launches one FillIf kernel per output on the context's stream.
  // `xs` is only consulted for its size; each `outs[i]` is (lazily)
  // allocated via mutable_data and conditionally zero-filled.
  void operator()(const platform::CUDADeviceContext& dev_ctx,
                  const bool* found_inf_data,
                  const std::vector<const framework::Tensor*>& xs,
                  const std::vector<framework::Tensor*>& outs) const {
    for (size_t i = 0; i < xs.size(); ++i) {
      auto* out = outs[i];
      T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
      int64_t num = out->numel();
      if (num == 0) {
        // A grid dimension of 0 is an invalid launch configuration;
        // nothing to zero anyway.
        continue;
      }
      int block = 1024;
      // Ceil-div in 64-bit before narrowing to the launch config; the
      // grid-stride loop inside FillIf covers any remainder regardless.
      int grid = static_cast<int>((num + block - 1) / block);
      FillIf<<<grid, block, 0, dev_ctx.stream()>>>(
          out_data, num, static_cast<T>(0), found_inf_data);
    }
  }
};
......
......@@ -70,7 +70,7 @@ class UpdateLossScalingFunctor {
};
template <typename DeviceContext, typename T>
class LazyZeroInputs {
class LazyZeros {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
......@@ -115,7 +115,7 @@ class UpdateLossScalingKernel : public framework::OpKernel<T> {
dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
LazyZeros<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
}
};
......
......@@ -35,7 +35,7 @@ class TestUpdateLossScalingOp(OpTest):
}
self.outputs = {
'Out': [('out0', np.zeros_like(x))],
'Out': [('out0', x)],
'LossScaling': self.prev_loss_scaling * self.incr_ratio,
'OutGoodSteps': self.zero_steps,
'OutBadSteps': self.zero_steps
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册