From 5ac71b36fbbb071058a98b2bb4287e2374cc458a Mon Sep 17 00:00:00 2001
From: Zhen Wang
Date: Thu, 10 Dec 2020 11:03:14 +0800
Subject: [PATCH] Remove tensor copy in the update_loss_scaling op. (#29426)

* remove tensor copy in the update_loss_scaling op

* not use thrust.

* fix some cuda memory access error.
---
 .../operators/amp/update_loss_scaling_op.cc  | 14 ++++----
 .../operators/amp/update_loss_scaling_op.cu  | 35 +++++++++++--------
 .../operators/amp/update_loss_scaling_op.h   |  4 +--
 .../unittests/test_update_loss_scaling_op.py |  2 +-
 4 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
index 8bd76a9886c..e4d90421513 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
@@ -135,18 +135,18 @@ class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
 };
 
 template <typename T>
-class LazyZeroInputs<platform::CPUDeviceContext, T> {
+class LazyZeros<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& dev_ctx,
                   const bool* found_inf_data,
                   const std::vector<const framework::Tensor*>& xs,
                   const std::vector<framework::Tensor*>& outs) const {
-    if (*found_inf_data) {
-      VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
-      for (size_t i = 0; i < xs.size(); ++i) {
-        auto* out = outs[i];
-        T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
-        int num = out->numel();
+    for (size_t i = 0; i < xs.size(); ++i) {
+      auto* out = outs[i];
+      T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+      int num = out->numel();
+      if (*found_inf_data) {
+        VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
         std::memset(out_data, 0, num * sizeof(T));
       }
     }
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
index 4da45df7ecf..ee6186e1f9e 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
@@ -32,6 +32,17 @@ __global__ void GpuUpdateLossScaling(
       updated_loss_scaling_data, good_out_data, bad_out_data);
 }
 
+template <typename T>
+__global__ void FillIf(T* data, const int64_t num, const T value,
+                       const bool* has_inf) {
+  if (*has_inf) {
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    for (int i = tid; i < num; i += blockDim.x * gridDim.x) {
+      data[i] = value;
+    }
+  }
+}
+
 template <typename T>
 class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
  public:
@@ -50,26 +61,20 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
 };
 
 template <typename T>
-class LazyZeroInputs<platform::CUDADeviceContext, T> {
+class LazyZeros<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& dev_ctx,
                   const bool* found_inf_data,
                   const std::vector<const framework::Tensor*>& xs,
                   const std::vector<framework::Tensor*>& outs) const {
-    const auto gpu_place =
-        BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
-    bool has_inf{false};
-    memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data,
-                 sizeof(bool), dev_ctx.stream());
-    dev_ctx.Wait();  // wait async copy
-    if (has_inf) {
-      VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
-      for (size_t i = 0; i < xs.size(); ++i) {
-        auto* out = outs[i];
-        T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
-        int num = out->numel();
-        cudaMemsetAsync(out_data, 0, num * sizeof(T), dev_ctx.stream());
-      }
+    for (size_t i = 0; i < xs.size(); ++i) {
+      auto* out = outs[i];
+      T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+      int64_t num = out->numel();
+      int block = 1024;
+      int grid = (block - 1 + num) / block;
+      FillIf<<<grid, block, 0, dev_ctx.stream()>>>(
+          out_data, num, static_cast<T>(0), found_inf_data);
     }
   }
 };
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h
index ca23b72eff0..89de9c645fb 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op.h
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h
@@ -70,7 +70,7 @@ class UpdateLossScalingFunctor {
 };
 
 template <typename DeviceContext, typename T>
-class LazyZeroInputs {
+class LazyZeros {
  public:
   void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
                   const std::vector<const framework::Tensor*>& xs,
@@ -115,7 +115,7 @@ class UpdateLossScalingKernel : public framework::OpKernel<T> {
         dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
         bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
         decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
-    LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
+    LazyZeros<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
index fb93334415c..56f49f60bde 100644
--- a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
+++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
@@ -35,7 +35,7 @@ class TestUpdateLossScalingOp(OpTest):
         }
 
         self.outputs = {
-            'Out': [('out0', np.zeros_like(x))],
+            'Out': [('out0', x)],
            'LossScaling': self.prev_loss_scaling * self.incr_ratio,
            'OutGoodSteps': self.zero_steps,
            'OutBadSteps': self.zero_steps
-- 
GitLab