未验证 提交 5ac71b36 编写于 作者: Z Zhen Wang 提交者: GitHub

Remove tensor copy in the update_loss_scaling op. (#29426)

* remove tensor copy in the update_loss_scaling op

* not use thrust.

* fix some cuda memory access error.
上级 5fe1f8af
......@@ -135,18 +135,18 @@ class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
};
template <typename T>
class LazyZeroInputs<platform::CPUDeviceContext, T> {
class LazyZeros<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
if (*found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
if (*found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
std::memset(out_data, 0, num * sizeof(T));
}
}
......
......@@ -32,6 +32,17 @@ __global__ void GpuUpdateLossScaling(
updated_loss_scaling_data, good_out_data, bad_out_data);
}
// Fills data[0..num) with `value`, but only when the device-side flag
// *has_inf is true. Reading the flag on the GPU is the point of this
// kernel: it removes the device->host copy (and the stream sync) that a
// host-side check would require. Uses a grid-stride loop, so any 1-D
// launch configuration covers all `num` elements.
template <typename T>
__global__ void FillIf(T* data, const int64_t num, const T value,
                       const bool* has_inf) {
  if (*has_inf) {
    // 64-bit index and stride: `num` is int64_t, so a 32-bit induction
    // variable (and the 32-bit product blockIdx.x * blockDim.x) could
    // overflow for tensors with more than 2^31 elements.
    int64_t tid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;
    for (int64_t i = tid; i < num; i += stride) {
      data[i] = value;
    }
  }
}
template <typename T>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
public:
......@@ -50,26 +61,20 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
};
// CUDA specialization: zeroes every output tensor iff the device-side
// flag *found_inf_data is true. The check happens inside FillIf on the
// GPU, so no device->host copy of the flag (and no dev_ctx.Wait()) is
// needed -- this is the host/device sync this commit removes.
template <typename T>
class LazyZeros<platform::CUDADeviceContext, T> {
 public:
  // Launches one FillIf kernel per output on the context's stream.
  // `xs` is only consulted for its size; each `outs[i]` is (lazily)
  // allocated via mutable_data and conditionally zero-filled.
  void operator()(const platform::CUDADeviceContext& dev_ctx,
                  const bool* found_inf_data,
                  const std::vector<const framework::Tensor*>& xs,
                  const std::vector<framework::Tensor*>& outs) const {
    for (size_t i = 0; i < xs.size(); ++i) {
      auto* out = outs[i];
      T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
      int64_t num = out->numel();
      if (num == 0) {
        // A grid dimension of 0 is an invalid launch configuration;
        // nothing to zero anyway.
        continue;
      }
      int block = 1024;
      // Ceil-div in 64-bit before narrowing to the launch config; the
      // grid-stride loop inside FillIf covers any remainder regardless.
      int grid = static_cast<int>((num + block - 1) / block);
      FillIf<<<grid, block, 0, dev_ctx.stream()>>>(
          out_data, num, static_cast<T>(0), found_inf_data);
    }
  }
};
......
......@@ -70,7 +70,7 @@ class UpdateLossScalingFunctor {
};
template <typename DeviceContext, typename T>
class LazyZeroInputs {
class LazyZeros {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
......@@ -115,7 +115,7 @@ class UpdateLossScalingKernel : public framework::OpKernel<T> {
dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
LazyZeros<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
}
};
......
......@@ -35,7 +35,7 @@ class TestUpdateLossScalingOp(OpTest):
}
self.outputs = {
'Out': [('out0', np.zeros_like(x))],
'Out': [('out0', x)],
'LossScaling': self.prev_loss_scaling * self.incr_ratio,
'OutGoodSteps': self.zero_steps,
'OutBadSteps': self.zero_steps
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册