Unverified commit 65e3fa35, authored by Bo Zhang and committed by GitHub

dropout_nd_optimization (#51479)

* with printf

* add DropOutNdForwardKernel

* PR comment
Parent c74aaf67
......@@ -33,15 +33,68 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/functors.h"
#include "paddle/phi/kernels/primitive/compute_primitives.h"
#include "paddle/phi/kernels/primitive/datamover_primitives.h"
namespace phi {
namespace funcs {
template <typename T1, typename T2 = T1, typename OutT = T1>
template <typename T>
struct DstFunctor {
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
MT factor;
HOSTDEVICE inline DstFunctor(const float retain_prob,
const bool is_upscale_in_train,
const int64_t num)
: retain_prob_(retain_prob),
is_upscale_in_train_(is_upscale_in_train),
num_(num) {
factor = static_cast<MT>(1.0f / retain_prob_);
}
HOSTDEVICE inline T operator()(const T src_val, const uint8_t mask) const {
for (int i = 0; i < num_; i++) {
if (mask == static_cast<uint8_t>(1)) {
return is_upscale_in_train_
? static_cast<T>(static_cast<MT>(src_val) * factor)
: static_cast<T>(src_val);
} else {
return static_cast<T>(0);
}
}
}
private:
const float retain_prob_;
const bool is_upscale_in_train_;
const int64_t num_;
};
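For readers unfamiliar with upscale-in-train dropout, a minimal standalone sketch of the per-element computation DstFunctor performs (plain float math, independent of the phi type traits; ApplyDropoutMask is a hypothetical name):

#include <cstdint>

// Kept elements (mask == 1) are rescaled by 1 / retain_prob when
// upscale-in-train is enabled, so the expected activation matches inference;
// dropped elements become zero.
inline float ApplyDropoutMask(float src_val, uint8_t mask, float retain_prob,
                              bool is_upscale_in_train) {
  const float factor = 1.0f / retain_prob;
  if (mask == static_cast<uint8_t>(1)) {
    return is_upscale_in_train ? src_val * factor : src_val;
  }
  return 0.0f;
}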
template <typename T>
struct MaskFunctor {
const float retain_prob_;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
MT factor;
HOSTDEVICE inline MaskFunctor(const float retain_prob)
: retain_prob_(retain_prob) {
factor = static_cast<MT>(1.0f / retain_prob_);
}
HOSTDEVICE inline void operator()(T* dst, const float* rand, int num) const {
static constexpr int kCount =
phi::funcs::uniform_distribution<float>::kReturnsCount;
// dst[0 .. kCount - 1] holds the generated 0/1 keep mask
#pragma unroll
for (int i = 0; i < kCount; i++) {
dst[i] = rand[i] < retain_prob_ ? static_cast<T>(1) : static_cast<T>(0);
}
}
};
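MaskFunctor only produces the keep mask: every uniform sample below retain_prob marks a kept element. A minimal sketch of that mapping, assuming kCount = 4 (the number of floats curand_uniform4 returns) and a hypothetical MakeKeepMask helper:

// Convert kCount uniform samples into a 0/1 keep mask.
template <typename T>
void MakeKeepMask(T* dst, const float* rand, float retain_prob) {
  constexpr int kCount = 4;  // matches uniform_distribution<float>::kReturnsCount
  for (int i = 0; i < kCount; i++) {
    // rand[i] is uniform in (0, 1]; keep with probability retain_prob.
    dst[i] = rand[i] < retain_prob ? static_cast<T>(1) : static_cast<T>(0);
  }
}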
template <typename T>
struct DstMaskFunctor {
const float retain_prob_;
const bool is_upscale_in_train_;
using MT = typename phi::kps::details::MPTypeTrait<T1>::Type;
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
MT factor;
HOSTDEVICE inline DstMaskFunctor(const float retain_prob,
const bool is_upscale_in_train)
......@@ -49,34 +102,34 @@ struct DstMaskFunctor {
factor = static_cast<MT>(1.0f / retain_prob_);
}
HOSTDEVICE inline void operator()(OutT* dst,
const T1* src_val,
const T2* rand,
HOSTDEVICE inline void operator()(T* dst,
const T* src_val,
const float* rand,
int num) const {
static constexpr int kCount =
phi::funcs::uniform_distribution<T2>::kReturnsCount;
// 0 ~ kCount -1 is dist , kCount ~ 2 * kCount - 1 is mask
phi::funcs::uniform_distribution<float>::kReturnsCount;
// 0 ~ kCount - 1 is dst, kCount ~ 2 * kCount - 1 is mask
#pragma unroll
for (int i = 0; i < kCount; i++) {
if (rand[i] < retain_prob_) {
dst[i] = is_upscale_in_train_
? static_cast<T1>(static_cast<MT>(src_val[i]) * factor)
: static_cast<T1>(src_val[i]);
dst[i + kCount] = static_cast<T1>(1);
? static_cast<T>(static_cast<MT>(src_val[i]) * factor)
: static_cast<T>(src_val[i]);
dst[i + kCount] = static_cast<T>(1);
} else {
dst[i] = static_cast<T1>(0);
dst[i] = static_cast<T>(0);
dst[i + kCount] = dst[i];
}
}
}
};
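DstMaskFunctor fuses the two computations above: one pass over kCount random values fills both the dropout output (slots 0 .. kCount - 1) and the keep mask (slots kCount .. 2 * kCount - 1) of a single register array, which is what lets VectorizedRandomGenerator below write dst and mask without re-reading the input. A minimal sketch with the layout spelled out (FusedDstAndMask is a hypothetical name, plain float math):

void FusedDstAndMask(float* dst_mask, const float* src, const float* rand,
                     float retain_prob, bool is_upscale_in_train) {
  constexpr int kCount = 4;
  const float factor = 1.0f / retain_prob;
  for (int i = 0; i < kCount; i++) {
    if (rand[i] < retain_prob) {
      // Kept: slot i gets the (optionally rescaled) value, slot i + kCount gets 1.
      dst_mask[i] = is_upscale_in_train ? src[i] * factor : src[i];
      dst_mask[i + kCount] = 1.0f;
    } else {
      // Dropped: both the output slot and the mask slot are 0.
      dst_mask[i] = 0.0f;
      dst_mask[i + kCount] = 0.0f;
    }
  }
}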
template <typename T, typename MaskType>
template <typename T>
__global__ void VectorizedRandomGenerator(const size_t n,
uint64_t seed,
const float dropout_prob,
const T* src,
MaskType* mask,
uint8_t* mask,
T* dst,
bool is_upscale_in_train,
uint64_t increment,
......@@ -94,9 +147,10 @@ __global__ void VectorizedRandomGenerator(const size_t n,
curand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = curandStatePhilox4_32_10_t;
#endif
T dst_mask[kCount * 2]; // 0 ~ kCount -1 : dst;kCount ~ 2 * kCount - 1: mask
T dst_mask[kCount *
2]; // 0 ~ kCount - 1 : dst, kCount ~ 2 * kCount - 1: mask
float rands[kCount];
MaskType mask_result[kCount];
uint8_t mask_result[kCount];
using Rand = phi::funcs::uniform_distribution<float>;
using Cast = kps::IdentityFunctor<T>;
int deal_size = BLOCK_NUM_X * kCount;
......@@ -104,19 +158,19 @@ __global__ void VectorizedRandomGenerator(const size_t n,
size_t fix = idx * kCount;
auto dst_functor =
DstMaskFunctor<T, float>(1.0f - dropout_prob, is_upscale_in_train);
DstMaskFunctor<T>(1.0f - dropout_prob, is_upscale_in_train);
for (; fix < main_offset; fix += stride) {
kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
kps::ElementwiseRandom<SType, float, kCount, Rand>(
&rands[0], Rand(), &state);
// dst
kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
kps::OperatorTernary<T, float, T, DstMaskFunctor<T>>(
&dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
kps::WriteData<T, kCount, 1, false>(dst + fix, &dst_mask[0], deal_size);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
kps::ElementwiseUnary<T, uint8_t, kCount, 1, Cast>(
&mask_result[0], &dst_mask[kCount], Cast());
kps::WriteData<MaskType, kCount, 1, false>(
kps::WriteData<uint8_t, kCount, 1, false>(
mask + fix, &mask_result[0], deal_size);
if (fix > idx * kCount + 1) {
__syncthreads();
......@@ -128,82 +182,33 @@ __global__ void VectorizedRandomGenerator(const size_t n,
kps::ElementwiseRandom<SType, float, kCount, Rand>(
&rands[0], Rand(), &state);
// dst
kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
kps::OperatorTernary<T, float, T, DstMaskFunctor<T>>(
&dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
kps::WriteData<T, kCount, 1, true>(dst + fix, &dst_mask[0], remainder);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
kps::ElementwiseUnary<T, uint8_t, kCount, 1, Cast>(
&mask_result[0], &dst_mask[kCount], Cast());
kps::WriteData<MaskType, kCount, 1, true>(
kps::WriteData<uint8_t, kCount, 1, true>(
mask + fix, &mask_result[0], remainder);
__syncthreads();
}
}
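The kernel above is a vectorized grid-stride loop: each thread owns a Philox counter-based RNG stream seeded with (seed, thread id, increment), each iteration consumes the four floats from curand_uniform4 to handle kCount = 4 elements, and main_offset separates the full tiles from the bounds-checked tail. A heavily simplified CUDA sketch of that structure, using per-element indexing instead of the kps block primitives (SimpleVectorizedDropout and its parameter names are hypothetical):

#include <curand_kernel.h>

__global__ void SimpleVectorizedDropout(const float* src, float* dst,
                                        uint8_t* mask, size_t n,
                                        float keep_prob, bool upscale,
                                        uint64_t seed, uint64_t increment) {
  constexpr int kCount = 4;  // curand_uniform4 yields 4 floats per call
  const float factor = 1.0f / keep_prob;
  size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  curandStatePhilox4_32_10_t state;
  curand_init(seed, tid, increment, &state);  // independent stream per thread
  size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x * kCount;
  for (size_t start = tid * kCount; start < n; start += stride) {
    float4 r = curand_uniform4(&state);
    float rands[kCount] = {r.x, r.y, r.z, r.w};
    for (int i = 0; i < kCount; i++) {
      size_t idx = start + i;
      if (idx >= n) break;
      bool keep = rands[i] < keep_prob;
      mask[idx] = keep ? 1 : 0;
      dst[idx] = keep ? (upscale ? src[idx] * factor : src[idx]) : 0.0f;
    }
  }
}

The real kernel instead tiles BLOCK_NUM_X * kCount elements per block and reuses DstMaskFunctor, but the RNG setup and the main/remainder split follow the same pattern.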
template <typename T1, typename T2 = T1, typename OutT = T1>
struct MaskFunctor {
const float retain_prob_;
using MT = typename phi::kps::details::MPTypeTrait<T1>::Type;
MT factor;
HOSTDEVICE inline MaskFunctor(const float retain_prob)
: retain_prob_(retain_prob) {
factor = static_cast<MT>(1.0f / retain_prob_);
}
HOSTDEVICE inline void operator()(OutT* dst, const T2* rand, int num) const {
static constexpr int kCount =
phi::funcs::uniform_distribution<T2>::kReturnsCount;
// 0 ~ kCount -1 is dist , kCount ~ 2 * kCount - 1 is mask
#pragma unroll
for (int i = 0; i < kCount; i++) {
if (rand[i] < retain_prob_) {
dst[i] = static_cast<T1>(1);
} else {
dst[i] = static_cast<T1>(0);
}
}
}
};
template <typename T, typename MaskType>
struct DstFunctor {
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
MT factor;
HOSTDEVICE inline DstFunctor(const float retain_prob,
const bool is_upscale_in_train,
const int64_t num)
: retain_prob_(retain_prob),
is_upscale_in_train_(is_upscale_in_train),
num_(num) {
factor = static_cast<MT>(1.0f / retain_prob_);
}
HOSTDEVICE inline T operator()(const T src_val, const MaskType mask) const {
for (int i = 0; i < num_; i++) {
if (mask == static_cast<MaskType>(1)) {
return is_upscale_in_train_
? static_cast<T>(static_cast<MT>(src_val) * factor)
: static_cast<T>(src_val);
} else {
return static_cast<T>(0);
}
}
}
private:
const float retain_prob_;
const bool is_upscale_in_train_;
const int64_t num_;
};
template <typename T, typename MaskType>
__global__ void VectorizedGeneratorMask(const size_t n,
uint64_t seed,
const float dropout_prob,
const T* src,
MaskType* mask,
uint64_t increment,
size_t main_offset) {
template <typename T>
__global__ void DropOutNdForwardKernel(
const size_t n,
uint64_t seed,
const float dropout_prob,
const T* src,
uint8_t* mask,
uint64_t increment,
size_t main_offset,
DstFunctor<T> dst_functor,
T* y,
int64_t N,
kps::details::BroadcastConfig broadcast_config) {
// Vectorized mask generation
// kCount is 4 because curand_uniform4 is used
constexpr int kCount = phi::funcs::uniform_distribution<float>::kReturnsCount;
size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
size_t stride = BLOCK_NUM_X * GRID_NUM_X * kCount;
......@@ -216,28 +221,28 @@ __global__ void VectorizedGeneratorMask(const size_t n,
curand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = curandStatePhilox4_32_10_t;
#endif
T dst_mask[kCount]; // 0 ~ kCount -1 : dst;kCount ~ 2 * kCount - 1: mask
T dst_mask[kCount]; // 0 ~ kCount - 1 : dst, kCount ~ 2 * kCount - 1: mask
float rands[kCount];
MaskType mask_result[kCount];
uint8_t mask_result[kCount];
using Rand = phi::funcs::uniform_distribution<float>;
using Cast = kps::IdentityFunctor<T>;
int deal_size = BLOCK_NUM_X * kCount;
size_t fix = idx * kCount;
auto mask_functor = MaskFunctor<T, float>(1.0f - dropout_prob);
auto mask_functor = MaskFunctor<T>(1.0f - dropout_prob);
for (; fix < main_offset; fix += stride) {
kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
kps::ElementwiseRandom<SType, float, kCount, Rand>(
&rands[0], Rand(), &state);
// dst
kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
kps::OperatorBinary<float, T, MaskFunctor<T>>(
&dst_mask[0], &rands[0], mask_functor, kCount);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
kps::ElementwiseUnary<T, uint8_t, kCount, 1, Cast>(
&mask_result[0], &dst_mask[0], Cast());
kps::WriteData<MaskType, kCount, 1, false>(
kps::WriteData<uint8_t, kCount, 1, false>(
mask + fix, &mask_result[0], deal_size);
if (fix > idx * kCount + 1) {
__syncthreads();
......@@ -249,28 +254,30 @@ __global__ void VectorizedGeneratorMask(const size_t n,
kps::ElementwiseRandom<SType, float, kCount, Rand>(
&rands[0], Rand(), &state);
// dst
kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
kps::OperatorBinary<float, T, MaskFunctor<T>>(
&dst_mask[0], &rands[0], mask_functor, kCount);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
kps::ElementwiseUnary<T, uint8_t, kCount, 1, Cast>(
&mask_result[0], &dst_mask[0], Cast());
kps::WriteData<MaskType, kCount, 1, true>(
kps::WriteData<uint8_t, kCount, 1, true>(
mask + fix, &mask_result[0], remainder);
__syncthreads();
}
}
inline void CalcBroadcastedMask(const phi::GPUContext& dev_ctx,
const phi::DenseTensor& mask,
phi::DenseTensor* broadcasted_mask) {
// The broadcast of mask can be combined to the following ElementwiseKernel
// when the BroadcastKernel supports different input types.
dev_ctx.template Alloc<uint8_t>(broadcasted_mask);
std::vector<const phi::DenseTensor*> ins = {&mask};
std::vector<phi::DenseTensor*> outs = {broadcasted_mask};
phi::funcs::BroadcastKernel<phi::ElementwiseType::kUnary, uint8_t, uint8_t>(
dev_ctx, ins, &outs, -1, kps::IdentityFunctor<uint8_t>());
// Broadcast mask data and do elementwise operation with DstFunctor
CUDA_KERNEL_LOOP(i, N) {
uint32_t offset = 0u;
uint32_t idx = i;
// Use the (j < phi::DDim::kMaxRank) condition rather than
// (j < broadcast_config.rank) so that #pragma unroll can be applied
#pragma unroll
for (int j = 0; j < phi::DDim::kMaxRank; ++j) {
if (j == broadcast_config.rank) break;
auto fast_divmoder = broadcast_config.divmoders[j].Divmod(idx);
idx = fast_divmoder.val[0];
offset += broadcast_config.strides[j] * fast_divmoder.val[1];
}
y[i] = dst_functor(src[i], mask[offset]);
}
}
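This tail loop is what replaces the old CalcBroadcastedMask + ElementwiseKernel pair: rather than materializing a broadcasted mask tensor, each output element decodes its linear index into per-dimension coordinates with the precomputed fast-divmod tables and accumulates the mask strides (zero on broadcast dimensions) to locate its mask byte. A minimal CPU sketch of the same index arithmetic, using ordinary division in place of FastDivMod (BroadcastOffset is a hypothetical helper, not the phi API):

#include <cstdint>
#include <vector>

// dims: output dims, fastest-varying first; strides: mask strides in the same
// order, 0 wherever the mask has extent 1 (i.e. is broadcast along that dim).
uint32_t BroadcastOffset(uint32_t linear_idx,
                         const std::vector<uint32_t>& dims,
                         const std::vector<uint32_t>& strides) {
  uint32_t offset = 0;
  for (size_t j = 0; j < dims.size(); ++j) {
    uint32_t coord = linear_idx % dims[j];  // divmoders[j].Divmod(idx) in the kernel
    linear_idx /= dims[j];
    offset += strides[j] * coord;
  }
  return offset;
}

With illustrative shapes only, x of shape [2, 3, 4] and a mask of shape [2, 1, 1] give dims = {4, 3, 2} and strides = {0, 0, 1}, so output index 17 (coordinates 1, 1, 1, fastest-varying first) maps to mask offset 1.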
template <typename T, typename MT>
......@@ -285,17 +292,19 @@ void ScaleByDropoutFactor(const phi::GPUContext& dev_ctx,
}
template <typename T>
void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx,
bool is_test,
float dropout_prob,
bool upscale_in_train,
bool is_fix_seed,
int seed_val,
const phi::DenseTensor& x,
const phi::DenseTensor* seed,
phi::DenseTensor* mask,
phi::DenseTensor* y,
bool is_dropout_nd = false) {
void DropoutFwGPUKernelDriver(
const phi::GPUContext& dev_ctx,
bool is_test,
float dropout_prob,
bool upscale_in_train,
bool is_fix_seed,
int seed_val,
const phi::DenseTensor& x,
const phi::DenseTensor* seed,
phi::DenseTensor* mask,
phi::DenseTensor* y,
bool is_dropout_nd = false,
const std::vector<int>& axis = std::vector<int>()) {
int64_t x_numel = x.numel();
auto stream = dev_ctx.stream();
auto* x_data = x.data<T>();
......@@ -344,26 +353,32 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx,
size / (block_size * kVecSize) * (block_size * kVecSize);
if (is_dropout_nd) {
VectorizedGeneratorMask<T, uint8_t>
auto dst_functor =
DstFunctor<T>(1.0f - dropout_prob, upscale_in_train, x_numel);
auto input_x_dims = x.dims();
auto mask_dims = mask->dims();
std::vector<int64_t> out_dims = phi::vectorize<int64_t>(input_x_dims);
std::vector<int64_t> in_dims = phi::vectorize<int64_t>(mask_dims);
reverse(out_dims.begin(), out_dims.end());
reverse(in_dims.begin(), in_dims.end());
kps::details::BroadcastConfig broadcast_config(
out_dims, in_dims, x.dims().size());
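// Illustrative shapes only (not taken from the tests): if x has shape
// [2, 3, 4] and the mask has shape [2, 1, 1], phi::vectorize gives
// {2, 3, 4} and {2, 1, 1}; after reverse() the dims are listed
// fastest-varying first, out_dims = {4, 3, 2} and in_dims = {1, 1, 2},
// which is the order the divmod decode in DropOutNdForwardKernel walks,
// innermost dimension first.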
DropOutNdForwardKernel<T>
<<<grid_size, block_size, 0, stream>>>(size,
seed_data,
dropout_prob,
x_data,
mask_data,
increment,
main_offset);
phi::DenseTensor broadcasted_mask;
broadcasted_mask.Resize(x.dims());
CalcBroadcastedMask(dev_ctx, *mask, &broadcasted_mask);
auto dst_functor = DstFunctor<T, uint8_t>(
1.0f - dropout_prob, upscale_in_train, x_numel);
std::vector<const phi::DenseTensor*> ins = {&x, &broadcasted_mask};
std::vector<phi::DenseTensor*> outs = {y};
phi::funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, dst_functor);
main_offset,
dst_functor,
y_data,
y->numel(),
broadcast_config);
} else {
#define PD_DROPOUT_KERNEL_NAME VectorizedRandomGenerator<T, uint8_t>
#define PD_DROPOUT_KERNEL_NAME VectorizedRandomGenerator<T>
PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL(!is_fix_seed,
PD_DROPOUT_KERNEL_NAME,
grid_size,
......@@ -397,14 +412,14 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx,
}
}
template <typename T, typename MaskType>
template <typename T>
struct CudaDropoutGradFunctor {
using MT = typename phi::kps::details::MPTypeTrait<T>::Type;
explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
__device__ __forceinline__ T operator()(const T dout,
const MaskType mask) const {
const uint8_t mask) const {
return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
factor_);
}
......@@ -433,7 +448,17 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
phi::DenseTensor broadcasted_mask;
if (is_dropout_nd) {
broadcasted_mask.Resize(grad_y.dims());
CalcBroadcastedMask(dev_ctx, mask, &broadcasted_mask);
dev_ctx.template Alloc<uint8_t>(&broadcasted_mask);
std::vector<const phi::DenseTensor*> broadcast_ins = {&mask};
std::vector<phi::DenseTensor*> broadcast_outs = {&broadcasted_mask};
phi::funcs::BroadcastKernel<phi::ElementwiseType::kUnary,
uint8_t,
uint8_t>(dev_ctx,
broadcast_ins,
&broadcast_outs,
-1,
kps::IdentityFunctor<uint8_t>());
}
std::vector<const phi::DenseTensor*> ins = {
......@@ -449,12 +474,12 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
} else {
MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
phi::funcs::ElementwiseKernel<T>(
dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
dev_ctx, ins, &outs, CudaDropoutGradFunctor<T>(factor));
}
} else {
MT factor = static_cast<MT>(1.0f);
phi::funcs::ElementwiseKernel<T>(
dev_ctx, ins, &outs, CudaDropoutGradFunctor<T, uint8_t>(factor));
dev_ctx, ins, &outs, CudaDropoutGradFunctor<T>(factor));
}
}
}
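The gradient path reduces to an elementwise product: each upstream gradient is multiplied by its mask byte and a scale factor, 1 / (1 - dropout_prob) when the forward pass used upscale-in-train rescaling and 1 otherwise. A minimal sketch of the per-element math with a worked value (DropoutGrad is a hypothetical name, plain float math):

#include <cstdint>

// Dropped positions (mask == 0) receive zero gradient; kept positions are
// passed through, rescaled by the same factor the forward pass used.
inline float DropoutGrad(float dout, uint8_t mask, float factor) {
  return dout * static_cast<float>(mask) * factor;
}

// Example: dropout_prob = 0.25 with upscale-in-train gives factor = 1 / 0.75,
// so a kept gradient of 0.3 becomes 0.4 and a dropped gradient becomes 0.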
......
......@@ -45,8 +45,7 @@ void DropoutRawKernel(const Context& dev_ctx,
x,
seed_tensor.get_ptr(),
mask,
out,
false);
out);
}
template <typename T, typename Context>
......@@ -76,7 +75,8 @@ void DropoutNdKernel(const Context& dev_ctx,
seed_tensor.get_ptr(),
mask,
out,
true);
true,
axis);
}
} // namespace phi
......