Unverified commit f5166284, authored by limingshu, committed by GitHub

Adjust wrapper of gpu_launch_config (#38654)

* first commit

* fix wrong filename

* fix the wrong spelling

* fix gpu config wrapper

* modify according to PR advice

* fix GpuLaunchConfig1D API bugs

* change the config for dropout grad

* fix bugs

* modifications according to PR advice

* modifications according to PR advice
Parent 0d8d1e0e
@@ -472,8 +472,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
     grid_sizes.gw = gw;
     grid_sizes.input_chans = input_chans;
-    platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(
-        ctx.cuda_device_context(), grid_count, 512);
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count);
     BilateralSliceCudaGridGradKernel<
         T><<<config.block_per_grid, config.thread_per_block, 0,
@@ -481,8 +481,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
         grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes,
         has_offset, grid_count, output_chans);
-    config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(),
-                                            guide_count, 512);
+    config =
+        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count);
     BilateralSliceCudaGuideGradKernel<
         T><<<config.block_per_grid, config.thread_per_block, 0,
@@ -490,8 +490,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
         guide_grad_data, output_grad_data, grid_data, guide_data, input_data,
         grid_sizes, has_offset, guide_count, output_chans);
-    config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(),
-                                            input_count, 512);
+    config =
+        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count);
     BilateralSliceCudaInputGradKernel<
         T><<<config.block_per_grid, config.thread_per_block, 0,
......
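All three launches above switch to the same pattern: the explicit 512 block-size argument is dropped and both grid and block dimensions are read from the returned config object. The following standalone sketch only illustrates that launch pattern; the LaunchConfig struct, MakeConfig1D helper, and FillKernel are simplified placeholders, not Paddle's definitions.

```cu
// Minimal sketch of the "launch from a config object" pattern, assuming a toy
// kernel and a naive config builder (not Paddle's GetGpuLaunchConfig1D).
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

struct LaunchConfig {
  dim3 thread_per_block{1, 1, 1};
  dim3 block_per_grid{1, 1, 1};
};

LaunchConfig MakeConfig1D(int64_t n, int block = 256) {
  LaunchConfig cfg;
  cfg.thread_per_block.x = block;
  cfg.block_per_grid.x = static_cast<unsigned>((n + block - 1) / block);
  return cfg;
}

__global__ void FillKernel(float* out, int64_t n, float value) {
  int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
  if (i < n) out[i] = value;
}

int main() {
  const int64_t n = 1 << 20;
  float* d_out = nullptr;
  cudaMalloc(&d_out, n * sizeof(float));

  LaunchConfig config = MakeConfig1D(n);
  FillKernel<<<config.block_per_grid, config.thread_per_block>>>(d_out, n, 1.f);
  cudaDeviceSynchronize();
  cudaFree(d_out);

  std::printf("launched %u blocks of %u threads\n", config.block_per_grid.x,
              config.thread_per_block.x);
  return 0;
}
```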
@@ -193,12 +193,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
     // VectorizedRandomGenerator use curand_uniform4, so we only support
     // vec_size is 4;
     int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
-    int block_size = pten::funcs::GetThreadsConfig(dev_ctx, x_numel, vec_size);
-    int grid_size =
-        ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size;
+    auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
     auto offset =
-        ((x_numel - 1) / (grid_size * block_size * vec_size) + 1) * vec_size;
+        ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
     GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset,
                             &seed_data, &increment);
@@ -206,23 +203,25 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
 #ifdef __HIPCC__
     if (vec_size == 4 && size % 4 == 0) {
       hipLaunchKernelGGL(
-          HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>), grid_size,
-          block_size, 0, stream, size, seed_data, dropout_prob, x_data,
-          mask_data, y_data, upscale_in_train, increment);
+          HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>),
+          gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size,
+          seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train,
+          increment);
     } else {
       hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
-                         grid_size, block_size, 0, stream, size, seed_data,
-                         dropout_prob, x_data, mask_data, y_data,
-                         upscale_in_train, increment);
+                         gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0,
+                         stream, size, seed_data, dropout_prob, x_data,
+                         mask_data, y_data, upscale_in_train, increment);
     }
 #else
     if (vec_size == 4 && size % 4 == 0) {
-      VectorizedRandomGenerator<T, uint8_t,
-                                4><<<grid_size, block_size, 0, stream>>>(
+      VectorizedRandomGenerator<T, uint8_t, 4><<<
+          gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
           size, seed_data, dropout_prob, x_data, mask_data, y_data,
           upscale_in_train, increment);
     } else {
-      RandomGenerator<T, uint8_t><<<grid_size, block_size, 0, stream>>>(
+      RandomGenerator<T, uint8_t><<<gpu_config.block_per_grid,
+                                    gpu_config.thread_per_block, 0, stream>>>(
          size, seed_data, dropout_prob, x_data, mask_data, y_data,
          upscale_in_train, increment);
     }
@@ -265,7 +264,7 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
     auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
     auto stream = dev_ctx.stream();
     platform::GpuLaunchConfig config =
-        platform::GetGpuLaunchConfig1D(dev_ctx, size);
+        platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size);
     DropoutGradCUDAKernel<
         T, uint8_t,
         4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
......
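In the forward driver, the hand-rolled block_size/grid_size arithmetic is replaced by the config object, and the Philox offset is now derived from GetThreadNum() (grid size times block size). A host-only, hedged sketch of that offset arithmetic follows; the numbers are made up for illustration and do not come from the commit.

```cpp
// Sketch of the offset computation above, assuming example launch dimensions.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t x_numel = 1 << 20;     // hypothetical element count
  const int vec_size = 4;              // curand_uniform4 path
  const int64_t grid_size = 256;       // stands in for config.block_per_grid.x
  const int64_t block_size = 512;      // stands in for config.thread_per_block.x
  const int64_t thread_num = grid_size * block_size;  // GetThreadNum()

  // Each thread consumes vec_size random numbers per pass; round the
  // per-thread workload up so the random-state offset covers every element.
  const int64_t offset =
      ((x_numel - 1) / (thread_num * vec_size) + 1) * vec_size;
  std::printf("offset per thread = %lld\n", static_cast<long long>(offset));
  return 0;
}
```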
@@ -128,10 +128,10 @@ elementwise_add_grad(const framework::ExecutionContext& ctx,
   } else if (dx_data != dout_data && dy_data != dout_data) {
     auto size = x->numel();
     int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
-    dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
+    dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
     dim3 grid_size =
-        dim3(((size + vec_size - 1) / vec_size + ELEMENTWISE_BLOCK_SIZE - 1) /
-                 ELEMENTWISE_BLOCK_SIZE,
+        dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
+                 PREDEFINED_BLOCK_SIZE,
              1);
     SimpleElemwiseAddGradCUDAKernel<
         T><<<grid_size, block_size, 0,
......
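In this hunk only the macro changes name: the operator-local ELEMENTWISE_BLOCK_SIZE becomes the shared PREDEFINED_BLOCK_SIZE that gpu_launch_config.h defines later in this diff (512 on CUDA, 256 on HIP). The grid computation itself is unchanged; the host-only sketch below just walks through that arithmetic with example values.

```cpp
// Illustration of the vectorized grid-size computation above; 512 is assumed
// here as the CUDA value of PREDEFINED_BLOCK_SIZE, and the numel is made up.
#include <cstdint>
#include <cstdio>

constexpr int kPredefinedBlockSize = 512;

int main() {
  const int64_t size = 1000000;                      // hypothetical numel
  const int vec_size = 4;                            // e.g. float via float4
  const int64_t vectorized = (size + vec_size - 1) / vec_size;
  const int64_t grid_x =
      (vectorized + kPredefinedBlockSize - 1) / kPredefinedBlockSize;
  std::printf("grid = %lld blocks of %d threads\n",
              static_cast<long long>(grid_x), kPredefinedBlockSize);
  return 0;
}
```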
@@ -73,10 +73,10 @@ default_elementwise_sub_grad(const framework::ExecutionContext& ctx,
   auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
   if (dy->dims() == dout->dims()) {
     if (dy_data != dout_data) {
-      dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
+      dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
       auto size = dy->numel();
-      dim3 grid_size = dim3(
-          (size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
+      dim3 grid_size =
+          dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
       SimpleElemwiseSubGradCUDAKernel<T><<<
           grid_size, block_size, 0,
           ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
@@ -100,10 +100,10 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx,
                      const framework::Tensor* out,
                      const framework::Tensor* dout, framework::Tensor* dx,
                      framework::Tensor* dy) {
-  dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
+  dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
   auto size = x->numel();
   dim3 grid_size =
-      dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
+      dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
   SimpleElemwiseSubGradCUDAKernel<
       T><<<grid_size, block_size, 0,
            ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
......
@@ -23,6 +23,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 namespace paddle {
 namespace operators {
......
@@ -15,7 +15,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/index_sample_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 namespace paddle {
......
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/beam_search.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
+#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 namespace paddle {
 namespace operators {
......
@@ -16,17 +16,10 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/fast_divmod.h"
-#ifdef __HIPCC__
-#define POOLING_BLOCK_SIZE 256
-#else
-#define POOLING_BLOCK_SIZE 512
-#endif
 namespace paddle {
 namespace operators {
 namespace math {
@@ -97,22 +90,6 @@ __device__ void OffsetPreparationFor4Dimension(
   }
 }
-int GetThreadsPerBlock(const platform::CUDADeviceContext& ctx,
-                       int threads_per_block, int64_t numel) {
-  int sm_count = ctx.GetSMCount();
-  if (numel / (sm_count << 1) < threads_per_block) {
-    // Round up threads number into an exponential multiple of 2, while number
-    // of acitve blocks is about twice of SM, to acquire better performance.
-    threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 1));
-  } else if (numel / (sm_count << 2) < threads_per_block) {
-    // Round up threads number into an exponential multiple of 2, while number
-    // of acitve blocks is about 4 times of SM, to acquire better performance.
-    threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 2));
-  }
-  // Number of threads per block shall be larger than 64.
-  return std::max(64, threads_per_block);
-}
 template <typename PoolProcess, typename T>
 __global__ void KernelPool2D(
     const int nthreads, const T* input_data, const int channels,
@@ -491,14 +468,13 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
     int nthreads = batch_size * input_channels * input_height * input_width;
-    int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
-    int grids = (nthreads + blocks - 1) / blocks;
     auto pool_divmods = FastDivModForPoolingWithMoreStaff(
         input_channels, input_width, input_height, ksize_width, ksize_height,
         stride_width, stride_height);
-    KernelPool2DGrad<T, PoolProcess><<<grids, blocks, 0, context.stream()>>>(
+    auto config = GetGpuLaunchConfig1D(context, nthreads);
+    KernelPool2DGrad<T, PoolProcess><<<
+        config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
         nthreads, input_data, output_data, output_grad_data, output_width,
         output_height, input_width, input_height, ksize_width, ksize_height,
         stride_width, stride_height, padding_width, padding_height,
@@ -541,14 +517,13 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
     int nthreads = batch_size * input_channels * input_height * input_width;
-    int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
-    int grids = (nthreads + blocks - 1) / blocks;
    auto pool_divmods = FastDivModForPoolingWithMoreStaff(
        input_channels, input_width, input_height, ksize_width, ksize_height,
        stride_width, stride_height);
-    KernelPool2DGrad<T, PoolProcess><<<grids, blocks, 0, context.stream()>>>(
+    auto config = GetGpuLaunchConfig1D(context, nthreads);
+    KernelPool2DGrad<T, PoolProcess><<<
+        config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
        nthreads, input_data, output_data, output_grad_data, output_width,
        output_height, input_width, input_height, ksize_width, ksize_height,
        stride_width, stride_height, padding_width, padding_height,
......
@@ -26,22 +26,6 @@ namespace platform {
 #define CREATE_SHFL_MASK(mask, predicate) \
   mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-inline static int RoundToPowerOfTwo(int dim) {
-  if (dim > 512) {
-    return 1024;
-  } else if (dim > 256) {
-    return 512;
-  } else if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
-  }
-}
 #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \
   case (dim): {                           \
     constexpr auto kPowerOfTwoDim = (dim); \
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// Used for compute gpu launch parameter
+// Used for compute gpu launch parameter config
 #pragma once
@@ -30,11 +30,36 @@
 #include <vector>
 #include "paddle/fluid/platform/device_context.h"
+#ifdef __HIPCC__
+// HIP results in error or nan if > 256
+#define PREDEFINED_BLOCK_SIZE 256
+#else
+/* CUDA performs better as thread_per_block
+   num is between [64, 512] */
+#define PREDEFINED_BLOCK_SIZE 512
+#endif
 namespace paddle {
 namespace platform {
 inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+   for round integer value into next highest power of 2. */
+static inline int RoundToPowerOfTwo(int n) {
+  n--;
+  n |= (n >> 1);
+  n |= (n >> 2);
+  n |= (n >> 4);
+  n |= (n >> 8);
+  n |= (n >> 16);
+#ifdef __HIPCC__
+  return std::min(256, std::max(32, (n + 1)));
+#else
+  return std::min(1024, std::max(32, (n + 1)));
+#endif
+}
 #ifdef WITH_NV_JETSON
 // The number of threads cannot be assigned 1024 in some cases when the device
 // is nano or tx2 .
@@ -48,54 +73,64 @@ inline void ChangeThreadNum(const platform::CUDADeviceContext& context,
 #endif
 struct GpuLaunchConfig {
-  dim3 theory_thread_count = dim3(1, 1, 1);
+ public:
+  GpuLaunchConfig() {}
+  size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); }
+  size_t GetGridSize() const {
+    return block_per_grid.x * block_per_grid.y * block_per_grid.z;
+  }
+  size_t GetBlockSize() const {
+    return thread_per_block.x * thread_per_block.y * thread_per_block.z;
+  }
+  int compute_capability = 0;
   dim3 thread_per_block = dim3(1, 1, 1);
   dim3 block_per_grid = dim3(1, 1, 1);
-  int compute_capability = 0;
 };
+/* According to NVIDIA, if number of threads per block is 64/128/256/512,
+ * cuda performs better. And number of blocks should be greater (at least
+ * 2x~4x) than number of SMs. Hence, SM count is took into account within
+ * this function to determine the right number of threads per block. */
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
-    const platform::CUDADeviceContext& context, int64_t element_count,
-#ifdef PADDLE_WITH_HIP
-    // HIP will throw GPU memory access fault if threads > 256
-    int max_threads = 256) {
-#else
-    int max_threads = 1024) {
-#endif
-  PADDLE_ENFORCE_GT(element_count, 0,
-                    platform::errors::InvalidArgument(
-                        "element count should be greater than 0,"
-                        " but received value is: %d.",
-                        element_count));
-  const int theory_thread_count = element_count;
-  // Get Max threads in all SM
-  int max_physical_threads = context.GetMaxPhysicalThreadCount();
-  int sm = context.GetSMCount();
-  // Compute physical threads we need, should small than max sm threads
-  const int physical_thread_count =
-      (std::min)(max_physical_threads, theory_thread_count);
+    const platform::CUDADeviceContext& context, int64_t numel,
+    int vec_size = 1) {
+  PADDLE_ENFORCE_GT(numel, 0, platform::errors::InvalidArgument(
+                                  "element quantity should be greater than 0,"
+                                  " but received value is: %d.",
+                                  numel));
   // Get compute_capability
   const int capability = context.GetComputeCapability();
+  /* If thread number per block is 64/128/256/512, cuda performs better.*/
+  int limit_threads =
+      std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
 #ifdef WITH_NV_JETSON
   if (capability == 53 || capability == 62) {
-    max_threads = 512;
+    limit_threads = 512;
   }
 #endif
-  // Need get from device
-  const int thread_per_block =
-      (std::min)(max_threads, context.GetMaxThreadsPerBlock());
-  const int block_count =
-      (std::min)(DivUp(physical_thread_count, thread_per_block), sm);
+  int threads = limit_threads;
+  int sm_count = context.GetSMCount();
+  int active_threads_num = numel / vec_size;
+  if (active_threads_num / (sm_count << 1) < limit_threads) {
+    // Round up threads number into an exponential multiple of 2, while number
+    // of acitve blocks is about twice of SM, to acquire better performance.
+    threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
+  } else if (active_threads_num / (sm_count << 2) < limit_threads) {
+    // Round up threads number into an exponential multiple of 2, while number
+    // of acitve blocks is about 4 times of SM, to acquire better performance.
+    threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
+  }
+  // Number of threads per block shall be larger than 64.
+  threads = std::max(64, threads);
+  int blocks = DivUp(DivUp(numel, vec_size), threads);
   GpuLaunchConfig config;
-  config.theory_thread_count.x = theory_thread_count;
-  config.thread_per_block.x = thread_per_block;
-  config.block_per_grid.x = block_count;
+  config.thread_per_block.x = threads;
+  config.block_per_grid.x = blocks;
   config.compute_capability = capability;
   return config;
 }
@@ -120,7 +155,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(
   GpuLaunchConfig config;
   // Noticed, block size is not align to 32, if needed do it yourself.
-  config.theory_thread_count = dim3(x_dim, y_dim, 1);
   config.thread_per_block = dim3(block_cols, block_rows, 1);
   int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
......
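This is the core of the change: GetGpuLaunchConfig1D now takes a vec_size, caps the block size at PREDEFINED_BLOCK_SIZE (or the device limit), shrinks it via the bit-twiddling RoundToPowerOfTwo when the active thread count is small relative to 2x or 4x the SM count, enforces a 64-thread floor, and derives the grid with DivUp. The sketch below is a host-only re-implementation of that heuristic for illustration only; the SM count, block-size cap, and sample sizes are hard-coded assumptions standing in for the real device queries, and the CUDA clamp of 1024 is assumed rather than the HIP clamp of 256.

```cpp
// Host-only sketch of the launch-config heuristic added in the hunk above.
#include <algorithm>
#include <cstdint>
#include <cstdio>

inline int DivUp(int64_t a, int64_t b) {
  return static_cast<int>((a + b - 1) / b);
}

// Next power of two, clamped to [32, 1024] (CUDA path assumed).
inline int RoundToPowerOfTwo(int n) {
  n--;
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  return std::min(1024, std::max(32, n + 1));
}

void Pick1DConfig(int64_t numel, int vec_size, int sm_count, int limit_threads,
                  int* threads_out, int* blocks_out) {
  int threads = limit_threads;
  int64_t active = numel / vec_size;
  if (active / (sm_count << 1) < limit_threads) {
    // Small workload: aim for roughly 2 active blocks per SM.
    threads = RoundToPowerOfTwo(static_cast<int>(active / (sm_count << 1)));
  } else if (active / (sm_count << 2) < limit_threads) {
    // Medium workload: aim for roughly 4 active blocks per SM.
    threads = RoundToPowerOfTwo(static_cast<int>(active / (sm_count << 2)));
  }
  threads = std::max(64, threads);                    // never below 64 threads
  *threads_out = threads;
  *blocks_out = DivUp(DivUp(numel, vec_size), threads);
}

int main() {
  int threads = 0, blocks = 0;
  // Example: 1M elements, vec_size 4, 80 SMs, 512-thread cap.
  Pick1DConfig(1 << 20, 4, 80, 512, &threads, &blocks);
  std::printf("threads per block = %d, blocks = %d\n", threads, blocks);
  return 0;
}
```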
@@ -24,19 +24,6 @@ namespace platform {
 #define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
-inline static int RoundToPowerOfTwo(int dim) {
-  // HIP results in error or nan if > 256
-  if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
-  }
-}
 #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \
   case (dim): {                           \
     constexpr auto kPowerOfTwoDim = (dim); \
......
@@ -16,9 +16,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
 #include "paddle/fluid/platform/aligned_vector.h"
+#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/function_traits.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/funcs/cuda_kernel_config.h"
 namespace pten {
@@ -239,18 +239,15 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
              VecSize><<<grid_size, block_size, 0, stream>>>(
       ins_data, outs_data, numel, main_offset, func);
 #else
-  int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize);
-  int grid_size =
-      ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size;
-  int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
+  auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize);
+  int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
+                    gpu_config.GetBlockSize();
   auto stream = ctx.stream();
-  VectorizedElementwiseKernel<InT,
-                              OutT,
-                              Functor,
-                              Arity,
-                              NumOuts,
-                              VecSize><<<grid_size, block_size, 0, stream>>>(
-      ins_data, outs_data, numel, main_offset, func);
+  VectorizedElementwiseKernel<InT, OutT, Functor, Arity, NumOuts, VecSize><<<
+      gpu_config.block_per_grid,
+      gpu_config.thread_per_block,
+      0,
+      stream>>>(ins_data, outs_data, numel, main_offset, func);
 #endif
 }
......
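The pten element-wise launcher now takes both grid and block from the config object, and main_offset, the boundary up to which fully vectorized passes can run without bounds checks, is computed from GetBlockSize(). A host-only, hedged sketch of that computation with example values (not taken from the commit):

```cpp
// Sketch of the main_offset computation above, assuming an example block size.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t numel = 1000003;   // hypothetical, deliberately not a multiple of 4
  const int VecSize = 4;
  const int64_t block_size = 256;  // stands in for gpu_config.GetBlockSize()

  // Elements below main_offset are covered by whole vectorized block passes;
  // the remaining tail is handled by the kernel's boundary path.
  const int64_t main_offset =
      (numel / (VecSize * block_size)) * VecSize * block_size;
  std::printf("main_offset = %lld, tail elements = %lld\n",
              static_cast<long long>(main_offset),
              static_cast<long long>(numel - main_offset));
  return 0;
}
```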