Unverified commit 272b32fd authored by Li Min, committed by GitHub

Replacing dropout eval eigen usage by cuda kernel (#40053)

* Replacing dropout eval eigen usage by cuda kernel
Parent a8e02ef1
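For context: in evaluation mode (is_test == true) dropout applies no mask. With upscale_in_train the output is simply a copy of the input, since training already rescaled the kept activations by 1/(1 - p); otherwise the output is the input scaled by (1 - dropout_prob). Below is a minimal, self-contained CUDA sketch of that eval-path computation, matching what the diff below implements with Paddle's own launcher. The names scale_kernel and dropout_eval are illustrative only; the sketch skips Paddle's elementwise launcher and error handling.

```cuda
#include <cuda_runtime.h>

// Elementwise scale: y = x * factor (the downscale_in_infer case).
template <typename T>
__global__ void scale_kernel(const T* x, T* y, T factor, int64_t n) {
  int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = x[i] * factor;
  }
}

// Eval-mode dropout: identity copy when upscale_in_train,
// otherwise scale by the keep probability (1 - dropout_prob).
template <typename T>
void dropout_eval(const T* x, T* y, int64_t n, float dropout_prob,
                  bool upscale_in_train, cudaStream_t stream) {
  if (upscale_in_train) {
    // Training already divided kept values by (1 - p), so inference is a copy.
    cudaMemcpyAsync(y, x, sizeof(T) * n, cudaMemcpyDeviceToDevice, stream);
  } else {
    T factor = static_cast<T>(1.0f - dropout_prob);
    int threads = 256;
    int blocks = static_cast<int>((n + threads - 1) / threads);
    scale_kernel<T><<<blocks, threads, 0, stream>>>(x, y, factor, n);
  }
}
```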
@@ -184,15 +184,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
                             bool is_fix_seed, int seed_val, const Tensor& x,
                             const Tensor* seed, Tensor* mask, Tensor* y) {
-  auto& place = *dev_ctx.eigen_device();
+  int64_t x_numel = x.numel();
+  auto stream = dev_ctx.stream();
+  auto* x_data = x.data<T>();
+  auto* y_data = y->data<T>();
   if (!is_test) {
-    int64_t x_numel = x.numel();
-    auto stream = dev_ctx.stream();
     auto* mask_data = mask->data<uint8_t>();
     size_t size = phi::product(mask->dims());
-    auto* x_data = x.data<T>();
-    auto* y_data = y->data<T>();
     if (dropout_prob == 1.0f) {
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_GPU_SUCCESS(
@@ -254,12 +254,24 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
     }
 #endif
   } else {
-    auto X = EigenMatrix<T>::Reshape(x, 1);
-    auto Y = EigenMatrix<T>::Reshape(*y, 1);
     if (upscale_in_train) {
-      Y.device(place) = X;
+      // todo: can y share data with x directly?
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
+                         hipMemcpyDeviceToDevice, stream));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel,
+                          cudaMemcpyDeviceToDevice, stream));
+#endif
     } else {
-      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
+      T factor = static_cast<T>(1.0f - dropout_prob);
+      std::vector<const framework::Tensor*> ins = {&x};
+      std::vector<framework::Tensor*> outs = {y};
+      auto functor = phi::funcs::ScaleFunctor<T>(factor);
+      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(dev_ctx, ins,
+                                                                &outs, functor);
     }
   }
 }
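Design note: with this change the eval path no longer builds Eigen expressions (EigenMatrix<T>::Reshape plus Y.device(place) = ...), so the driver drops its dev_ctx.eigen_device() handle entirely. The upscale_in_train case becomes a single device-to-device cudaMemcpyAsync/hipMemcpyAsync on the existing stream, and the downscale case reuses Paddle's generic same-dims elementwise launcher with a ScaleFunctor, keeping the whole inference path on plain CUDA/HIP primitives.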