未验证 提交 5ac71b36 编写于 作者: Z Zhen Wang 提交者: GitHub

Remove tensor copy in the update_loss_scaling op. (#29426)

* remove tensor copy in the update_loss_scaling op

* do not use thrust.

* fix some CUDA memory access errors.
上级 5fe1f8af
...@@ -135,18 +135,18 @@ class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> { ...@@ -135,18 +135,18 @@ class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
}; };
template <typename T> template <typename T>
class LazyZeroInputs<platform::CPUDeviceContext, T> { class LazyZeros<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& dev_ctx, void operator()(const platform::CPUDeviceContext& dev_ctx,
const bool* found_inf_data, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs, const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const { const std::vector<framework::Tensor*>& outs) const {
if (*found_inf_data) { for (size_t i = 0; i < xs.size(); ++i) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --"; auto* out = outs[i];
for (size_t i = 0; i < xs.size(); ++i) { T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
auto* out = outs[i]; int num = out->numel();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace()); if (*found_inf_data) {
int num = out->numel(); VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
std::memset(out_data, 0, num * sizeof(T)); std::memset(out_data, 0, num * sizeof(T));
} }
} }
......
...@@ -32,6 +32,17 @@ __global__ void GpuUpdateLossScaling( ...@@ -32,6 +32,17 @@ __global__ void GpuUpdateLossScaling(
updated_loss_scaling_data, good_out_data, bad_out_data); updated_loss_scaling_data, good_out_data, bad_out_data);
} }
// Conditionally fills data[0, num) with `value`.
//
// The fill is performed only when the flag pointed to by `has_inf` is true;
// otherwise the kernel leaves `data` untouched. The flag is dereferenced on
// the device, so the decision is made inside the kernel and no host-side
// read of the flag is required before launching.
//
// Parameters:
//   data    - destination buffer, written at indices [0, num).
//   num     - number of elements to fill.
//   value   - value stored into every element when the flag is set.
//   has_inf - pointer to the flag, read by every thread.
//
// Each thread starts at its global index and advances by the total number of
// launched threads, so any 1-D launch configuration covers the whole range.
template <typename T>
__global__ void FillIf(T* data, const int64_t num, const T value,
                       const bool* has_inf) {
  if (*has_inf) {
    // Index and stride are computed in 64 bits: `num` is int64_t, and 32-bit
    // arithmetic on blockIdx.x * blockDim.x (or blockDim.x * gridDim.x) would
    // overflow/wrap for buffers with more than INT_MAX elements.
    const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
    const int64_t first =
        static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    for (int64_t i = first; i < num; i += stride) {
      data[i] = value;
    }
  }
}
template <typename T> template <typename T>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> { class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
public: public:
...@@ -50,26 +61,20 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> { ...@@ -50,26 +61,20 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
}; };
template <typename T> template <typename T>
class LazyZeroInputs<platform::CUDADeviceContext, T> { class LazyZeros<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& dev_ctx, void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs, const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const { const std::vector<framework::Tensor*>& outs) const {
const auto gpu_place = for (size_t i = 0; i < xs.size(); ++i) {
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); auto* out = outs[i];
bool has_inf{false}; T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data, int64_t num = out->numel();
sizeof(bool), dev_ctx.stream()); int block = 1024;
dev_ctx.Wait(); // wait async copy int grid = (block - 1 + num) / block;
if (has_inf) { FillIf<<<grid, block, 0, dev_ctx.stream()>>>(
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --"; out_data, num, static_cast<T>(0), found_inf_data);
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
cudaMemsetAsync(out_data, 0, num * sizeof(T), dev_ctx.stream());
}
} }
} }
}; };
......
...@@ -70,7 +70,7 @@ class UpdateLossScalingFunctor { ...@@ -70,7 +70,7 @@ class UpdateLossScalingFunctor {
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LazyZeroInputs { class LazyZeros {
public: public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data, void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs, const std::vector<const framework::Tensor*>& xs,
...@@ -115,7 +115,7 @@ class UpdateLossScalingKernel : public framework::OpKernel<T> { ...@@ -115,7 +115,7 @@ class UpdateLossScalingKernel : public framework::OpKernel<T> {
dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs); LazyZeros<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
} }
}; };
......
...@@ -35,7 +35,7 @@ class TestUpdateLossScalingOp(OpTest): ...@@ -35,7 +35,7 @@ class TestUpdateLossScalingOp(OpTest):
} }
self.outputs = { self.outputs = {
'Out': [('out0', np.zeros_like(x))], 'Out': [('out0', x)],
'LossScaling': self.prev_loss_scaling * self.incr_ratio, 'LossScaling': self.prev_loss_scaling * self.incr_ratio,
'OutGoodSteps': self.zero_steps, 'OutGoodSteps': self.zero_steps,
'OutBadSteps': self.zero_steps 'OutBadSteps': self.zero_steps
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册