Unverified commit 95d3ebc8, authored by niuliling123, committed by GitHub

Modified dropout Kernel with Kernel Primitive API (#40766)

Parent commit: 17b8335b
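Summary of the change, for orientation before the diff: the forward dropout kernel is rewritten on top of the Kernel Primitive API (kps::ReadData, kps::ElementwiseRandom, kps::OperatorTernary, kps::WriteData), output and mask generation are folded into a single DstMaskGenerator functor, the scalar RandomGenerator fallback and the standalone DropoutGradCUDAKernel disappear from this header, CudaDropoutGradFunctor moves next to the backward driver, and distribution_helper.h gains a generic uniform_distribution primary template so that kReturnsCount is defined for every element type.

The following is a minimal standalone CUDA sketch, not the Paddle implementation: it spells out in raw CUDA the per-thread pattern the new VectorizedRandomGenerator expresses through the KPS wrappers (draw a pack of Philox uniforms, write the outputs and the mask, upscale kept elements by 1 / (1 - dropout_prob)). The kernel name, the float-only element type, and the grid-stride loop over packs of 4 are assumptions made for the sketch.

// Standalone sketch (assumed names; not the Paddle code).
#include <cstdint>
#include <curand_kernel.h>

__global__ void DropoutForwardSketch(size_t n, uint64_t seed, uint64_t increment,
                                     float dropout_prob, bool upscale_in_train,
                                     const float* src, uint8_t* mask, float* dst) {
  constexpr int kPack = 4;  // curand_uniform4 yields 4 uniforms per call
  size_t thread_id = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  curandStatePhilox4_32_10_t state;
  curand_init(seed, thread_id, increment, &state);
  const float factor = 1.0f / (1.0f - dropout_prob);
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x * kPack;
  // Grid-stride loop: every thread owns kPack consecutive elements per step.
  for (size_t start = thread_id * kPack; start < n; start += stride) {
    float4 r4 = curand_uniform4(&state);
    const float r[kPack] = {r4.x, r4.y, r4.z, r4.w};
#pragma unroll
    for (int j = 0; j < kPack; ++j) {
      size_t i = start + j;
      if (i >= n) break;  // boundary-checked tail of the last pack
      if (r[j] < dropout_prob) {
        dst[i] = 0.0f;
        mask[i] = 0;
      } else {
        dst[i] = upscale_in_train ? src[i] * factor : src[i];
        mask[i] = 1;
      }
    }
  }
}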
@@ -35,143 +35,99 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/kernels/funcs/distribution_helper.h"
 #include "paddle/phi/kernels/funcs/functors.h"
 namespace paddle {
 namespace operators {
 
-template <typename T, typename MaskType>
-__global__ void RandomGenerator(const size_t n, uint64_t seed,
-                                const float dropout_prob, const T* src,
-                                MaskType* mask, T* dst,
-                                bool is_upscale_in_train, uint64_t increment) {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-#ifdef PADDLE_WITH_HIP
-  hiprandStatePhilox4_32_10_t state;
-  hiprand_init(seed, idx, increment, &state);
-#else
-  curandStatePhilox4_32_10_t state;
-  curand_init(seed, idx, increment, &state);
-#endif
-
-  MaskType mask_val;
-  T dst_val;
-  MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
-  for (; idx < n; idx += blockDim.x * gridDim.x) {
-    T src_val = src[idx];
-#ifdef PADDLE_WITH_HIP
-    if (hiprand_uniform(&state) < dropout_prob) {
-#else
-    if (curand_uniform(&state) < dropout_prob) {
-#endif
-      mask_val = 0;
-      dst_val = 0;
-    } else {
-      mask_val = 1;
-      dst_val = is_upscale_in_train
-                    ? static_cast<T>(static_cast<MT>(src_val) * factor)
-                    : src_val;
-    }
-    mask[idx] = mask_val;
-    dst[idx] = dst_val;
-  }
-}
+template <typename T1, typename T2 = T1, typename OutT = T1>
+struct DstMaskGenerator {
+  const float dropout_prob_;
+  const bool is_upscale_in_train_;
+  using MT = typename details::MPTypeTrait<T1>::Type;
+  MT factor;
+  HOSTDEVICE inline DstMaskGenerator(const float dropout_prob,
+                                     const bool is_upscale_in_train)
+      : dropout_prob_(dropout_prob), is_upscale_in_train_(is_upscale_in_train) {
+    factor = static_cast<MT>(1.0f / (1.0f - dropout_prob_));
+  }
+
+  HOSTDEVICE inline void operator()(OutT* dst, const T1* src_val,
+                                    const T2* rand, int num) const {
+    static constexpr int kCount =
+        phi::funcs::uniform_distribution<T2>::kReturnsCount;
+    // 0 ~ kCount -1 is dist , kCount ~ 2 * kCount - 1 is mask
+#pragma unroll
+    for (int i = 0; i < kCount; i++) {
+      if (rand[i] < dropout_prob_) {
+        dst[i] = static_cast<T1>(0);
+        dst[i + kCount] = dst[i];
+      } else {
+        dst[i] = is_upscale_in_train_
+                     ? static_cast<T1>(static_cast<MT>(src_val[i]) * factor)
+                     : static_cast<T1>(src_val[i]);
+        dst[i + kCount] = static_cast<T1>(1);
+      }
+    }
+  }
+};
 
-template <typename T, typename MaskType, int VecSize>
-__global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
-                                          const float dropout_prob,
-                                          const T* src, MaskType* mask, T* dst,
-                                          bool is_upscale_in_train,
-                                          uint64_t increment) {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  using LoadT = phi::AlignedVector<T, VecSize>;
-  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
-
-#ifdef PADDLE_WITH_HIP
-  int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
-  hiprandStatePhilox4_32_10_t state;
-  hiprand_init(seed, idx, increment, &state);
-#else
-  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
-  curandStatePhilox4_32_10_t state;
-  curand_init(seed, idx, increment, &state);
-#endif
-
-  MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
-  for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) {
-    LoadT src_val;
-    phi::Load<T, VecSize>(&src[i], &src_val);
-
-#ifdef PADDLE_WITH_HIP
-    float4 rand = hiprand_uniform4(&state);
-#else
-    float4 rand = curand_uniform4(&state);
-#endif
-
-    LoadT dst_val;
-    MaskLoadT mask_val;
-
-#pragma unroll
-    for (int j = 0; j < VecSize; j++) {
-      if ((&rand.x)[j] < dropout_prob) {
-        dst_val[j] = 0;
-        mask_val[j] = 0;
-      } else {
-        dst_val[j] = is_upscale_in_train
-                         ? static_cast<T>(static_cast<MT>(src_val[j]) * factor)
-                         : src_val[j];
-        mask_val[j] = 1;
-      }
-    }
-
-    phi::Store<T, VecSize>(dst_val, &dst[i]);
-    phi::Store<MaskType, VecSize>(mask_val, &mask[i]);
-  }
-}
-
-template <typename T, typename MaskType>
-struct CudaDropoutGradFunctor {
-  using MT = typename details::MPTypeTrait<T>::Type;
-
-  explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
-
-  __device__ __forceinline__ T operator()(const T dout,
-                                          const MaskType mask) const {
-    return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
-                          factor_);
-  }
-
- private:
-  MT factor_;
-};
-
-template <typename T, typename MaskType, int VecSize>
-__global__ void DropoutGradCUDAKernel(
-    const T* dout, const MaskType* mask,
-    const typename details::MPTypeTrait<T>::Type factor, const int64_t size,
-    T* dx) {
-  using MT = typename details::MPTypeTrait<T>::Type;
-  using LoadT = phi::AlignedVector<T, VecSize>;
-  using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
-
-  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
-    LoadT dout_val;
-    phi::Load<T, VecSize>(&dout[i], &dout_val);
-
-    MaskLoadT mask_val;
-    phi::Load<MaskType, VecSize>(&mask[i], &mask_val);
-
-    LoadT dx_val;
-
-#pragma unroll
-    for (int j = 0; j < VecSize; j++) {
-      dx_val[j] = static_cast<T>(static_cast<MT>(dout_val[j]) *
-                                 static_cast<MT>(mask_val[j]) * factor);
-    }
-
-    phi::Store<T, VecSize>(dx_val, &dx[i]);
-  }
-}
+template <typename T, typename MaskType>
+__global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
+                                          const float dropout_prob,
+                                          const T* src, MaskType* mask, T* dst,
+                                          bool is_upscale_in_train,
+                                          uint64_t increment,
+                                          size_t main_offset) {
+  size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
+  static constexpr int kCount =
+      phi::funcs::uniform_distribution<float>::kReturnsCount;
+  size_t stride = BLOCK_NUM_X * GRID_NUM_X * kCount;
+#ifdef PADDLE_WITH_HIP
+  hiprandStatePhilox4_32_10_t state;
+  hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
+  using SType = hiprandStatePhilox4_32_10_t;
+#else
+  curandStatePhilox4_32_10_t state;
+  curand_init(seed, idx + THREAD_ID_X, increment, &state);
+  using SType = curandStatePhilox4_32_10_t;
+#endif
+  T dst_mask[kCount * 2];  // 0 ~ kCount -1 : dst;kCount ~ 2 * kCount - 1: mask
+  float rands[kCount];
+  MaskType mask_result[kCount];
+  using Rand = phi::funcs::uniform_distribution<float>;
+  using Cast = kps::IdentityFunctor<T>;
+  int deal_size = BLOCK_NUM_X * kCount;
+  auto dst_functor =
+      DstMaskGenerator<T, float>(dropout_prob, is_upscale_in_train);
+  size_t fix = idx * kCount;
+  for (; fix < main_offset; fix += stride) {
+    kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
+    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(&rands[0], Rand(),
+                                                          &state);
+    // dst
+    kps::OperatorTernary<T, float, T, DstMaskGenerator<T, float>>(
+        &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
+    kps::WriteData<T, kCount, 1, 1, false>(dst + fix, &dst_mask[0], deal_size);
+    // mask
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+        &mask_result[0], &dst_mask[kCount], Cast());
+    kps::WriteData<MaskType, kCount, 1, 1, false>(mask + fix, &mask_result[0],
+                                                  deal_size);
+  }
+  int remainder = n - fix;
+  if (remainder > 0) {
+    kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
+    kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(&rands[0], Rand(),
+                                                          &state);
+    // dst
+    kps::OperatorTernary<T, float, T, DstMaskGenerator<T, float>>(
+        &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
+    kps::WriteData<T, kCount, 1, 1, true>(dst + fix, &dst_mask[0], remainder);
+    // mask
+    kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
+        &mask_result[0], &dst_mask[kCount], Cast());
+    kps::WriteData<MaskType, kCount, 1, 1, true>(mask + fix, &mask_result[0],
+                                                 remainder);
+  }
+}
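A note on the dst_mask buffer used above: the new kernel keeps the dropout output and the mask in one per-thread array of length 2 * kCount (output in [0, kCount), mask flags in [kCount, 2 * kCount)), so a single kps::OperatorTernary call produces both and the mask half is later split out by the IdentityFunctor cast. The toy below is a standalone host-side illustration of that packing with invented values; it is not Paddle code.

// Host-side toy of the packed dst/mask layout (assumed names and values).
#include <cstdio>

constexpr int kCount = 4;

void DstMaskPack(float* dst_mask, const float* src, const float* rand,
                 float dropout_prob, float factor) {
  for (int i = 0; i < kCount; ++i) {
    if (rand[i] < dropout_prob) {
      dst_mask[i] = 0.0f;             // dropped element
      dst_mask[i + kCount] = 0.0f;    // mask = 0
    } else {
      dst_mask[i] = src[i] * factor;  // kept element, upscale-in-train path
      dst_mask[i + kCount] = 1.0f;    // mask = 1
    }
  }
}

int main() {
  const float src[kCount] = {1.f, 2.f, 3.f, 4.f};
  const float rand[kCount] = {0.1f, 0.9f, 0.4f, 0.7f};
  float dst_mask[2 * kCount];
  DstMaskPack(dst_mask, src, rand, /*dropout_prob=*/0.5f, /*factor=*/2.0f);
  for (int i = 0; i < 2 * kCount; ++i) printf("%g ", dst_mask[i]);
  printf("\n");  // prints: 0 4 0 8 0 1 0 1
}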
@@ -218,42 +174,21 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
     uint64_t seed_data;
     uint64_t increment;
     // VectorizedRandomGenerator use curand_uniform4, so we only support
-    // vec_size is 4;
-    int vec_size = (phi::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
+    // kVecSize is 4;
+    constexpr int kVecSize =
+        phi::funcs::uniform_distribution<float>::kReturnsCount;
     auto gpu_config =
-        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
+        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
     auto offset =
-        ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
+        ((x_numel - 1) / (gpu_config.GetThreadNum() * kVecSize) + 1) * kVecSize;
     GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset,
                             &seed_data, &increment);
-
-#ifdef __HIPCC__
-    if (vec_size == 4 && size % 4 == 0) {
-      hipLaunchKernelGGL(
-          HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>),
-          gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size,
-          seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train,
-          increment);
-    } else {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
-                         gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0,
-                         stream, size, seed_data, dropout_prob, x_data,
-                         mask_data, y_data, upscale_in_train, increment);
-    }
-#else
-    if (vec_size == 4 && size % 4 == 0) {
-      VectorizedRandomGenerator<T, uint8_t, 4><<<
-          gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
-          size, seed_data, dropout_prob, x_data, mask_data, y_data,
-          upscale_in_train, increment);
-    } else {
-      RandomGenerator<T, uint8_t><<<gpu_config.block_per_grid,
-                                    gpu_config.thread_per_block, 0, stream>>>(
-          size, seed_data, dropout_prob, x_data, mask_data, y_data,
-          upscale_in_train, increment);
-    }
-#endif
+    size_t main_offset = size / (gpu_config.GetBlockSize() * kVecSize) *
+                         (gpu_config.GetBlockSize() * kVecSize);
+    VectorizedRandomGenerator<T, uint8_t><<<
+        gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream>>>(
+        size, seed_data, dropout_prob, x_data, mask_data, y_data,
+        upscale_in_train, increment, main_offset);
   } else {
     if (upscale_in_train) {
       // todo: can y share with data with x directly?
@@ -278,6 +213,22 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
   }
 }
 
+template <typename T, typename MaskType>
+struct CudaDropoutGradFunctor {
+  using MT = typename details::MPTypeTrait<T>::Type;
+
+  explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
+
+  __device__ __forceinline__ T operator()(const T dout,
+                                          const MaskType mask) const {
+    return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
+                          factor_);
+  }
+
+ private:
+  MT factor_;
+};
+
 template <typename T>
 void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
                                 const std::string dropout_implementation,
...
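On the main_offset computed by the driver above: it is the largest multiple of GetBlockSize() * kVecSize that does not exceed the element count, so the kernel's unchecked main loop covers main_offset elements and the boundary-checked tail handles the rest. A worked example with assumed numbers (256 threads per block, kVecSize = 4, 10000 elements):

// Worked example of the main_offset split (illustrative numbers only).
#include <cstdio>

int main() {
  const long long size = 10000;  // assumed element count
  const int block = 256;         // assumed gpu_config.GetBlockSize()
  const int kVecSize = 4;        // uniform_distribution<float>::kReturnsCount
  const long long chunk = 1LL * block * kVecSize;      // 1024 elements per block step
  const long long main_offset = size / chunk * chunk;  // 9 * 1024 = 9216
  printf("main_offset=%lld tail=%lld\n", main_offset, size - main_offset);  // 9216, 784
}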
@@ -114,13 +114,19 @@ struct normal_transform {
 namespace kps = phi::kps;
 
 /*********************** Distribution Function *************************/
-template <typename T>
-struct uniform_distribution;
 
 template <typename T>
 struct normal_distribution;
 
 #if defined(__NVCC__)
+template <typename T>
+struct uniform_distribution {
+  __device__ inline T operator()(curandStatePhilox4_32_10_t *state) const {
+    return static_cast<T>(curand_uniform(state));
+  }
+  static constexpr int kReturnsCount = 1;
+};
+
 template <>
 struct uniform_distribution<float> {
   __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
@@ -177,6 +183,14 @@ struct normal_distribution<double> {
 };
 
 #else
+template <typename T>
+struct uniform_distribution {
+  __device__ inline T operator()(hiprandStatePhilox4_32_10_t *state) const {
+    return hiprand_uniform(state);
+  }
+  static constexpr int kReturnsCount = 1;
+};
+
 template <>
 struct uniform_distribution<float> {
   __device__ inline float4 operator()(
...
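Why a primary uniform_distribution template is defined here: DstMaskGenerator reads uniform_distribution<T2>::kReturnsCount for an arbitrary T2, so every element type needs at least a scalar fallback (kReturnsCount = 1), while the float specialization keeps returning four values per curand_uniform4 call. A standalone mock of that dispatch, with assumed names:

// Mock of the kReturnsCount dispatch (not the Paddle header).
#include <cstdio>

template <typename T>
struct uniform_distribution_mock {
  static constexpr int kReturnsCount = 1;  // scalar fallback for generic T
};

template <>
struct uniform_distribution_mock<float> {
  static constexpr int kReturnsCount = 4;  // vectorized path: curand_uniform4
};

int main() {
  printf("%d %d\n", uniform_distribution_mock<float>::kReturnsCount,
         uniform_distribution_mock<int>::kReturnsCount);  // prints: 4 1
}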
@@ -17,11 +17,10 @@
 #include <thrust/reverse.h>
 #include <thrust/scan.h>
 
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/select_impl.cu.h"
 #include "paddle/phi/kernels/masked_select_grad_kernel.h"
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-
 namespace phi {
 
 template <typename MT, typename InT, typename OutT>
@@ -50,7 +49,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
                             const DenseTensor& mask,
                             DenseTensor* x_grad) {
   auto mask_size = mask.numel();
-  auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(x_grad);
   if (mask_size <= 0) return;
   using Functor = MaskedSelectGradFunctor<bool, T, T>;
   phi::funcs::SelectKernel<bool, T, T, 2, Functor>(
...