diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index 8bd76a9886c62aa8f655c4607625aae67cecc122..e4d90421513bf2bfd5f0db954d036b682f45a494 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -135,18 +135,18 @@ class UpdateLossScalingFunctor { }; template -class LazyZeroInputs { +class LazyZeros { public: void operator()(const platform::CPUDeviceContext& dev_ctx, const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { - if (*found_inf_data) { - VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --"; - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - int num = out->numel(); + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + int num = out->numel(); + if (*found_inf_data) { + VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --"; std::memset(out_data, 0, num * sizeof(T)); } } diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index 4da45df7ecfdb900cdbcd71fb7de24de93bb3ec4..ee6186e1f9e6f2f9ea7228b480620af27ffbc2ca 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -32,6 +32,17 @@ __global__ void GpuUpdateLossScaling( updated_loss_scaling_data, good_out_data, bad_out_data); } +template +__global__ void FillIf(T* data, const int64_t num, const T value, + const bool* has_inf) { + if (*has_inf) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < num; i += blockDim.x * gridDim.x) { + data[i] = value; + } + } +} + template class UpdateLossScalingFunctor { public: @@ -50,26 +61,20 @@ class UpdateLossScalingFunctor { }; template -class LazyZeroInputs { +class LazyZeros { public: void operator()(const platform::CUDADeviceContext& dev_ctx, const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { - const auto gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); - bool has_inf{false}; - memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data, - sizeof(bool), dev_ctx.stream()); - dev_ctx.Wait(); // wait async copy - if (has_inf) { - VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --"; - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - int num = out->numel(); - cudaMemsetAsync(out_data, 0, num * sizeof(T), dev_ctx.stream()); - } + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + int64_t num = out->numel(); + int block = 1024; + int grid = (block - 1 + num) / block; + FillIf<<>>( + out_data, num, static_cast(0), found_inf_data); } } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h index ca23b72eff0e85ab94c4d1f11e986f69b4e2d776..89de9c645fb0a72d75312814bc3649157e324731 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.h +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -70,7 +70,7 @@ class UpdateLossScalingFunctor { }; template -class LazyZeroInputs { +class LazyZeros { public: void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data, const std::vector& xs, @@ -115,7 +115,7 @@ class UpdateLossScalingKernel : public framework::OpKernel { dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); - LazyZeroInputs{}(dev_ctx, found_inf_data, xs, outs); + LazyZeros{}(dev_ctx, found_inf_data, xs, outs); } }; diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py index fb93334415c3046362090a143f6c15069793709a..56f49f60bde847c2345cb1d003e4ee949c43dce4 100644 --- a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py +++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py @@ -35,7 +35,7 @@ class TestUpdateLossScalingOp(OpTest): } self.outputs = { - 'Out': [('out0', np.zeros_like(x))], + 'Out': [('out0', x)], 'LossScaling': self.prev_loss_scaling * self.incr_ratio, 'OutGoodSteps': self.zero_steps, 'OutBadSteps': self.zero_steps