Unverified commit f5166284, authored by limingshu, committed by GitHub

Adjust wrapper of gpu_launch_config (#38654)

* first commit

* fix wrong filename

* fix the wrong spelling

* fix gpu config wrapper

* modify according to PR review comments

* fix GpuLaunchConfig1D API bugs

* change the config for dropout grad

* fix bugs

* modifications according to PR review comments

* modifications according to PR review comments
Parent 0d8d1e0e
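For orientation, the signature change of the 1D wrapper can be sketched as follows; both prototypes are reconstructed from the gpu_launch_config.h hunk further down, so treat this as a summary rather than the verbatim header.

    // Before: the caller passed an upper bound on threads per block.
    // GpuLaunchConfig GetGpuLaunchConfig1D(
    //     const platform::CUDADeviceContext& context, int64_t element_count,
    //     int max_threads = 1024);  // default is 256 under HIP

    // After: the caller passes the vectorization width instead; the block size
    // is derived from PREDEFINED_BLOCK_SIZE, the SM count and the element count.
    // GpuLaunchConfig GetGpuLaunchConfig1D(
    //     const platform::CUDADeviceContext& context, int64_t numel,
    //     int vec_size = 1);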
......@@ -472,8 +472,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
grid_sizes.gw = gw;
grid_sizes.input_chans = input_chans;
platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(
ctx.cuda_device_context(), grid_count, 512);
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count);
BilateralSliceCudaGridGradKernel<
T><<<config.block_per_grid, config.thread_per_block, 0,
......@@ -481,8 +481,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes,
has_offset, grid_count, output_chans);
config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(),
guide_count, 512);
config =
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count);
BilateralSliceCudaGuideGradKernel<
T><<<config.block_per_grid, config.thread_per_block, 0,
......@@ -490,8 +490,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
guide_grad_data, output_grad_data, grid_data, guide_data, input_data,
grid_sizes, has_offset, guide_count, output_chans);
config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(),
input_count, 512);
config =
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count);
BilateralSliceCudaInputGradKernel<
T><<<config.block_per_grid, config.thread_per_block, 0,
......
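A hedged before/after sketch of one of the bilateral_slice call sites above; the kernel name and `grid_count` come from the diff, while `stream` stands in for the context's CUDA stream and the kernel arguments are elided.

    // Before: the third argument pinned the block size to at most 512 threads.
    // platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(
    //     ctx.cuda_device_context(), grid_count, 512);

    // After: the wrapper chooses the block size itself; the optional third
    // argument now means vectorization width (default 1), so it is dropped.
    platform::GpuLaunchConfig config =
        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count);
    BilateralSliceCudaGridGradKernel<T>
        <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
            /* ...unchanged kernel arguments... */);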
......@@ -193,12 +193,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
// VectorizedRandomGenerator uses curand_uniform4, so we only support
// vec_size == 4.
int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
int block_size = pten::funcs::GetThreadsConfig(dev_ctx, x_numel, vec_size);
int grid_size =
((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size;
auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
auto offset =
((x_numel - 1) / (grid_size * block_size * vec_size) + 1) * vec_size;
((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset,
&seed_data, &increment);
......@@ -206,23 +203,25 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
#ifdef __HIPCC__
if (vec_size == 4 && size % 4 == 0) {
hipLaunchKernelGGL(
HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>), grid_size,
block_size, 0, stream, size, seed_data, dropout_prob, x_data,
mask_data, y_data, upscale_in_train, increment);
HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>),
gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size,
seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train,
increment);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
grid_size, block_size, 0, stream, size, seed_data,
dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0,
stream, size, seed_data, dropout_prob, x_data,
mask_data, y_data, upscale_in_train, increment);
}
#else
if (vec_size == 4 && size % 4 == 0) {
VectorizedRandomGenerator<T, uint8_t,
4><<<grid_size, block_size, 0, stream>>>(
VectorizedRandomGenerator<T, uint8_t, 4><<<
gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
size, seed_data, dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
} else {
RandomGenerator<T, uint8_t><<<grid_size, block_size, 0, stream>>>(
RandomGenerator<T, uint8_t><<<gpu_config.block_per_grid,
gpu_config.thread_per_block, 0, stream>>>(
size, seed_data, dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
}
......@@ -265,7 +264,7 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
auto stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, size);
platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size);
DropoutGradCUDAKernel<
T, uint8_t,
4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
......
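To make the new offset expression in DropoutFwGPUKernelDriver concrete, here is a small worked example; the launch shape (256 blocks of 512 threads) and element count are assumed values, not numbers from the patch.

    // Assumed launch: 256 blocks x 512 threads -> gpu_config.GetThreadNum() = 131072.
    int64_t x_numel        = 1000000;    // elements needing a dropout mask
    int     vec_size       = 4;          // curand_uniform4 path
    int64_t launch_threads = 512 * 256;  // gpu_config.GetThreadNum()
    // Each thread runs ceil(x_numel / (launch_threads * vec_size)) vectorized
    // iterations and draws vec_size random numbers per iteration, so the Philox
    // offset must advance by that iteration count times vec_size per thread:
    int64_t offset = ((x_numel - 1) / (launch_threads * vec_size) + 1) * vec_size;
    // -> (999999 / 524288 + 1) * 4 = (1 + 1) * 4 = 8 random numbers per thread.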
......@@ -128,10 +128,10 @@ elementwise_add_grad(const framework::ExecutionContext& ctx,
} else if (dx_data != dout_data && dy_data != dout_data) {
auto size = x->numel();
int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3(((size + vec_size - 1) / vec_size + ELEMENTWISE_BLOCK_SIZE - 1) /
ELEMENTWISE_BLOCK_SIZE,
dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
PREDEFINED_BLOCK_SIZE,
1);
SimpleElemwiseAddGradCUDAKernel<
T><<<grid_size, block_size, 0,
......
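For reference, the launch arithmetic of elementwise_add_grad above, with assumed numbers (T = float on a CUDA build, where PREDEFINED_BLOCK_SIZE is 512; it is 256 under HIP):

    int     vec_size = sizeof(float4) / sizeof(float);  // = 4
    int64_t size     = 10000;                           // x->numel(), assumed
    int     threads  = 512;                             // PREDEFINED_BLOCK_SIZE (CUDA)
    int64_t blocks   = ((size + vec_size - 1) / vec_size + threads - 1) / threads;
    // ceil(10000 / 4) = 2500 vectorized work items, ceil(2500 / 512) = 5 blocks,
    // so the kernel runs 5 x 512 threads, one float4-wide chunk per thread.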
......@@ -73,10 +73,10 @@ default_elementwise_sub_grad(const framework::ExecutionContext& ctx,
auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout->dims()) {
if (dy_data != dout_data) {
dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
auto size = dy->numel();
dim3 grid_size = dim3(
(size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
dim3 grid_size =
dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
SimpleElemwiseSubGradCUDAKernel<T><<<
grid_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
......@@ -100,10 +100,10 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
auto size = x->numel();
dim3 grid_size =
dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
SimpleElemwiseSubGradCUDAKernel<
T><<<grid_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
......
......@@ -23,6 +23,7 @@ namespace cub = hipcub;
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace paddle {
namespace operators {
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/index_sample_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace paddle {
namespace operators {
......
......@@ -16,17 +16,10 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/math/pooling.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/fast_divmod.h"
#ifdef __HIPCC__
#define POOLING_BLOCK_SIZE 256
#else
#define POOLING_BLOCK_SIZE 512
#endif
namespace paddle {
namespace operators {
namespace math {
......@@ -97,22 +90,6 @@ __device__ void OffsetPreparationFor4Dimension(
}
}
int GetThreadsPerBlock(const platform::CUDADeviceContext& ctx,
int threads_per_block, int64_t numel) {
int sm_count = ctx.GetSMCount();
if (numel / (sm_count << 1) < threads_per_block) {
// Round up threads number into an exponential multiple of 2, while number
// of acitve blocks is about twice of SM, to acquire better performance.
threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 1));
} else if (numel / (sm_count << 2) < threads_per_block) {
// Round up threads number into an exponential multiple of 2, while number
// of acitve blocks is about 4 times of SM, to acquire better performance.
threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 2));
}
// Number of threads per block shall be larger than 64.
return std::max(64, threads_per_block);
}
template <typename PoolProcess, typename T>
__global__ void KernelPool2D(
const int nthreads, const T* input_data, const int channels,
......@@ -491,14 +468,13 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
int grids = (nthreads + blocks - 1) / blocks;
auto pool_divmods = FastDivModForPoolingWithMoreStaff(
input_channels, input_width, input_height, ksize_width, ksize_height,
stride_width, stride_height);
KernelPool2DGrad<T, PoolProcess><<<grids, blocks, 0, context.stream()>>>(
auto config = GetGpuLaunchConfig1D(context, nthreads);
KernelPool2DGrad<T, PoolProcess><<<
config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
nthreads, input_data, output_data, output_grad_data, output_width,
output_height, input_width, input_height, ksize_width, ksize_height,
stride_width, stride_height, padding_width, padding_height,
......@@ -541,14 +517,13 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
int grids = (nthreads + blocks - 1) / blocks;
auto pool_divmods = FastDivModForPoolingWithMoreStaff(
input_channels, input_width, input_height, ksize_width, ksize_height,
stride_width, stride_height);
KernelPool2DGrad<T, PoolProcess><<<grids, blocks, 0, context.stream()>>>(
auto config = GetGpuLaunchConfig1D(context, nthreads);
KernelPool2DGrad<T, PoolProcess><<<
config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
nthreads, input_data, output_data, output_grad_data, output_width,
output_height, input_width, input_height, ksize_width, ksize_height,
stride_width, stride_height, padding_width, padding_height,
......
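The pooling change removes the file-local heuristic rather than the heuristic itself; roughly, the old names map onto the new config as sketched below (kernel arguments elided).

    // Old: int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
    //      int grids  = (nthreads + blocks - 1) / blocks;
    // New: the same SM-count-based rounding now happens inside the wrapper;
    //      `blocks` corresponds to config.thread_per_block and `grids` to
    //      config.block_per_grid.
    auto config = GetGpuLaunchConfig1D(context, nthreads);
    KernelPool2DGrad<T, PoolProcess><<<
        config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
        /* ...same arguments as before... */);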
......@@ -26,22 +26,6 @@ namespace platform {
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
return 1024;
} else if (dim > 256) {
return 512;
} else if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \
case (dim): { \
constexpr auto kPowerOfTwoDim = (dim); \
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Used for compute gpu launch parameter
// Used to compute the GPU launch parameter config
#pragma once
......@@ -30,11 +30,36 @@
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#ifdef __HIPCC__
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
/* CUDA performs better when the number of
threads per block is in [64, 512]. */
#define PREDEFINED_BLOCK_SIZE 512
#endif
namespace paddle {
namespace platform {
inline int DivUp(int a, int b) { return (a + b - 1) / b; }
/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
for rounding an integer value up to the next highest power of 2. */
static inline int RoundToPowerOfTwo(int n) {
n--;
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
#ifdef __HIPCC__
return std::min(256, std::max(32, (n + 1)));
#else
return std::min(1024, std::max(32, (n + 1)));
#endif
}
#ifdef WITH_NV_JETSON
// The number of threads cannot be set to 1024 in some cases when the device
// is Jetson Nano or TX2.
......@@ -48,54 +73,64 @@ inline void ChangeThreadNum(const platform::CUDADeviceContext& context,
#endif
struct GpuLaunchConfig {
dim3 theory_thread_count = dim3(1, 1, 1);
public:
GpuLaunchConfig() {}
size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); }
size_t GetGridSize() const {
return block_per_grid.x * block_per_grid.y * block_per_grid.z;
}
size_t GetBlockSize() const {
return thread_per_block.x * thread_per_block.y * thread_per_block.z;
}
int compute_capability = 0;
dim3 thread_per_block = dim3(1, 1, 1);
dim3 block_per_grid = dim3(1, 1, 1);
int compute_capability = 0;
};
/* According to NVIDIA, CUDA performs better when the number of threads per
* block is 64/128/256/512, and the number of blocks should be greater (at
* least 2x~4x) than the number of SMs. Hence, the SM count is taken into
* account within this function to determine the right number of threads per
* block. */
inline GpuLaunchConfig GetGpuLaunchConfig1D(
const platform::CUDADeviceContext& context, int64_t element_count,
#ifdef PADDLE_WITH_HIP
// HIP will throw GPU memory access fault if threads > 256
int max_threads = 256) {
#else
int max_threads = 1024) {
#endif
PADDLE_ENFORCE_GT(element_count, 0,
platform::errors::InvalidArgument(
"element count should be greater than 0,"
const platform::CUDADeviceContext& context, int64_t numel,
int vec_size = 1) {
PADDLE_ENFORCE_GT(numel, 0, platform::errors::InvalidArgument(
"element quantity should be greater than 0,"
" but received value is: %d.",
element_count));
const int theory_thread_count = element_count;
// Get Max threads in all SM
int max_physical_threads = context.GetMaxPhysicalThreadCount();
int sm = context.GetSMCount();
// Compute physical threads we need, should small than max sm threads
const int physical_thread_count =
(std::min)(max_physical_threads, theory_thread_count);
numel));
// Get compute_capability
const int capability = context.GetComputeCapability();
/* If the thread number per block is 64/128/256/512, CUDA performs better. */
int limit_threads =
std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
#ifdef WITH_NV_JETSON
if (capability == 53 || capability == 62) {
max_threads = 512;
limit_threads = 512;
}
#endif
// Need get from device
const int thread_per_block =
(std::min)(max_threads, context.GetMaxThreadsPerBlock());
const int block_count =
(std::min)(DivUp(physical_thread_count, thread_per_block), sm);
int threads = limit_threads;
int sm_count = context.GetSMCount();
int active_threads_num = numel / vec_size;
if (active_threads_num / (sm_count << 1) < limit_threads) {
// Round the thread count up to the next power of 2 while keeping the number
// of active blocks at about twice the SM count, for better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
} else if (active_threads_num / (sm_count << 2) < limit_threads) {
// Round the thread count up to the next power of 2 while keeping the number
// of active blocks at about 4 times the SM count, for better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
}
// The number of threads per block shall be no fewer than 64.
threads = std::max(64, threads);
int blocks = DivUp(DivUp(numel, vec_size), threads);
GpuLaunchConfig config;
config.theory_thread_count.x = theory_thread_count;
config.thread_per_block.x = thread_per_block;
config.block_per_grid.x = block_count;
config.thread_per_block.x = threads;
config.block_per_grid.x = blocks;
config.compute_capability = capability;
return config;
}
......@@ -120,7 +155,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(
GpuLaunchConfig config;
// Note: the block size is not aligned to 32; align it yourself if needed.
config.theory_thread_count = dim3(x_dim, y_dim, 1);
config.thread_per_block = dim3(block_cols, block_rows, 1);
int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
......
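To make the new heuristic in GetGpuLaunchConfig1D tangible, here is a hedged walk-through with made-up device numbers (CUDA build, 80 SMs, 1024 max threads per block); `MyKernel`, `dev_ctx` and `numel` are placeholders rather than names from the patch.

    // Large problem: numel = 1 << 20, vec_size = 4.
    //   limit_threads      = min(PREDEFINED_BLOCK_SIZE, 1024) = 512
    //   active_threads_num = numel / vec_size = 262144
    //   262144 / (80 << 1) = 1638 and 262144 / (80 << 2) = 819, both >= 512,
    //   so threads stays at 512 (the heuristic only shrinks the block when the
    //   problem is too small to keep ~2x-4x blocks per SM busy).
    //   blocks = DivUp(DivUp(1 << 20, 4), 512) = DivUp(262144, 512) = 512.
    //
    // Small problem: numel = 20000, vec_size = 1.
    //   20000 / (80 << 1) = 125 < 512 -> threads = RoundToPowerOfTwo(125) = 128,
    //   kept above the 64-thread floor; blocks = DivUp(20000, 128) = 157.
    auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, /*vec_size=*/4);
    MyKernel<<<config.block_per_grid, config.thread_per_block, 0,
               dev_ctx.stream()>>>(/* ... */);
    // GetBlockSize(), GetGridSize() and GetThreadNum() expose the flattened
    // block, grid and total thread counts of the chosen configuration.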
......@@ -24,19 +24,6 @@ namespace platform {
#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
inline static int RoundToPowerOfTwo(int dim) {
// HIP results in error or nan if > 256
if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \
case (dim): { \
constexpr auto kPowerOfTwoDim = (dim); \
......
......@@ -16,9 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/funcs/cuda_kernel_config.h"
namespace pten {
......@@ -239,18 +239,15 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
VecSize><<<grid_size, block_size, 0, stream>>>(
ins_data, outs_data, numel, main_offset, func);
#else
int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize);
int grid_size =
((numel + VecSize - 1) / VecSize + block_size - 1) / block_size;
int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize);
int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
gpu_config.GetBlockSize();
auto stream = ctx.stream();
VectorizedElementwiseKernel<InT,
OutT,
Functor,
Arity,
NumOuts,
VecSize><<<grid_size, block_size, 0, stream>>>(
ins_data, outs_data, numel, main_offset, func);
VectorizedElementwiseKernel<InT, OutT, Functor, Arity, NumOuts, VecSize><<<
gpu_config.block_per_grid,
gpu_config.thread_per_block,
0,
stream>>>(ins_data, outs_data, numel, main_offset, func);
#endif
}
......
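Finally, what main_offset controls in ElementwiseCudaKernel, with assumed numbers; the variable names mirror the diff, the concrete values are illustrative only.

    int64_t numel       = 10000;
    int     vec_size    = 4;                     // VecSize
    int     block_size  = 256;                   // gpu_config.GetBlockSize(), assumed
    int     stride      = vec_size * block_size; // 1024 elements per block per pass
    int64_t main_offset = (numel / stride) * stride;  // 9 * 1024 = 9216
    // Elements [0, main_offset) are handled with fully vectorized loads/stores;
    // the remaining 784 elements are left to the kernel's tail handling.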