Unverified commit 0b54d54f, authored by Chen Weihang, committed by GitHub

Fix index overflow bug of the CUDA kernel loop increment (#25435)

* fix softmax_with_cross_entropy cuda kernel overflow bug, test=develop

* replace old macro & for condition, test=develop

* polish details, test=develop
Parent 0b2ec49f
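
The root cause and the fix, as a minimal sketch (the kernel below is illustrative and not taken from the patch; only the loop pattern matches the code changed in this commit):

    #include <cstdint>

    // Risky pattern removed by this commit: `i` is a 32-bit int, so with a very
    // large grid the increment `i += blockDim.x * gridDim.x` can push it past
    // INT_MAX. The overflowed value is negative, `i < n` still holds, and the
    // kernel indexes out of bounds.
    __global__ void ScaleOld(float* data, int n) {
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += blockDim.x * gridDim.x) {
        data[i] *= 2.0f;
      }
    }

    // Pattern introduced by the CUDA_KERNEL_LOOP macro added in the platform
    // header below: the loop-carried counter is a 64-bit integer, so the
    // increment cannot wrap around.
    __global__ void ScaleFixed(float* data, int n) {
      for (int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
           idx += blockDim.x * gridDim.x) {
        data[idx] *= 2.0f;
      }
    }
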
@@ -23,9 +23,6 @@
 namespace paddle {
 namespace framework {
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <size_t EMBEDX_DIM, size_t EXPAND_EMBED_DIM>
 __global__ void PullCopy(
...
@@ -22,10 +22,7 @@
 #include "paddle/fluid/platform/place.h"
 __global__ void test(size_t* a, int size) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
-       i += blockDim.x * gridDim.x) {
-    a[i] *= 2;
-  }
+  CUDA_KERNEL_LOOP(i, size) { a[i] *= 2; }
 }
 TEST(LoD, data) {
...
@@ -24,10 +24,6 @@ namespace paddle {
 namespace operators {
 using framework::Tensor;
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 const int CUDA_NUM_THREADS = 1024;
 static inline int GET_BLOCKS(const int N) {
   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
...
@@ -24,14 +24,10 @@ namespace operators {
 using Tensor = framework::Tensor;
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __global__ void GPUBCELossForward(const T* x_data, const T* label_data,
                                   T* out_data, const int in_numel) {
-  CUDA_1D_KERNEL_LOOP(i, in_numel) {
+  CUDA_KERNEL_LOOP(i, in_numel) {
     T x = x_data[i];
     T label = label_data[i];
     T one = static_cast<T>(1.);
@@ -48,7 +44,7 @@ template <typename T>
 __global__ void GPUBCELossBackward(const T* x_data, const T* label_data,
                                    const T* dout_data, T* dx_data,
                                    const int in_numel) {
-  CUDA_1D_KERNEL_LOOP(i, in_numel) {
+  CUDA_KERNEL_LOOP(i, in_numel) {
     T x = x_data[i];
     T label = label_data[i];
     T dout = dout_data[i];
...
@@ -25,10 +25,6 @@ using platform::PADDLE_CUDA_NUM_THREADS;
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __global__ void CvmComputeKernel(const bool use_cvm, const int64_t item_width,
                                  const T* X, T* Y, int64_t numel) {
...
@@ -30,10 +30,6 @@ using LoDTensor = framework::LoDTensor;
 using DataLayout = framework::DataLayout;
 using platform::PADDLE_CUDA_NUM_THREADS;
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 inline int GET_BLOCKS(const int N) {
   return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
 }
...
@@ -40,10 +40,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 const int CUDA_NUM_THREADS = 1024;
 static inline int GET_BLOCKS(const int N) {
   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
...
@@ -24,8 +24,7 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
                            const int width, const T offset) {
   int num_anchors = as_num * ar_num;
   int box_num = height * width * num_anchors;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, box_num) {
     int h_idx = i / (num_anchors * width);
     int w_idx = (i / num_anchors) % width;
     T stride_width = stride[0];
@@ -64,10 +63,7 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
 template <typename T>
 __global__ void SetVariance(T* out, const T* var, const int vnum,
                             const int num) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    out[i] = var[i % vnum];
-  }
+  CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; }
 }
 template <typename T>
...
@@ -40,8 +40,7 @@ static inline int NumBlocks(const int N) {
 static __global__ void GetLengthLoD(const int nthreads, const int* batch_ids,
                                     int* length_lod) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (nthreads);
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
     platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
   }
 }
...
@@ -31,10 +31,6 @@ using LoDTensor = framework::LoDTensor;
 static constexpr int kNumCUDAThreads = 64;
 static constexpr int kNumMaxinumNumBlocks = 4096;
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 int const BBoxSize = 4;
 static inline int NumBlocks(const int N) {
@@ -48,7 +44,7 @@ __global__ void GPUDistFpnProposalsHelper(
     const int refer_level, const int refer_scale, const int max_level,
     const int min_level, int* roi_batch_id_data, int* sub_lod_list,
     int* target_lvls) {
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
     const T* offset_roi = rois + i * BBoxSize;
     int roi_batch_ind = roi_batch_id_data[i];
     // get the target level of current rois
...
@@ -33,9 +33,6 @@ using LoDTensor = framework::LoDTensor;
 namespace {
 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 int const kThreadsPerBlock = sizeof(uint64_t) * 8;
@@ -155,7 +152,7 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
   int cnt = 0;
   __shared__ int keep_index[BlockSize];
-  CUDA_1D_KERNEL_LOOP(i, num) {
+  CUDA_KERNEL_LOOP(i, num) {
     keep_index[threadIdx.x] = -1;
     __syncthreads();
...
@@ -32,8 +32,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
                             bool min_max_aspect_ratios_order) {
   int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
   int box_num = height * width * num_priors;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, box_num) {
     int h = i / (num_priors * width);
     int w = (i / num_priors) % width;
     int p = i % num_priors;
@@ -87,10 +86,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
 template <typename T>
 __global__ void SetVariance(T* out, const T* var, const int vnum,
                             const int num) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    out[i] = var[i % vnum];
-  }
+  CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; }
 }
 template <typename T>
...
@@ -30,10 +30,6 @@ namespace operators {
 #define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2)
 #define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1)
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __device__ bool GT_E(T a, T b) {
   return (a > b) || Eigen::numext::abs(a - b) < 1e-4;
@@ -284,7 +280,7 @@ __global__ void RoiTransformKernel(const float* input_data,
                                    int* mask, T* transform_matrix) {
   int output_size =
       num_rois * transformed_height * transformed_width * channels;
-  CUDA_1D_KERNEL_LOOP(index, output_size) {
+  CUDA_KERNEL_LOOP(index, output_size) {
     // (n, c, out_h, out_w) is an element in the transformed output
     int out_w = idx4_4(index, num_rois, channels, transformed_height,
                        transformed_width);
@@ -463,7 +459,7 @@ __global__ void RoiTransformGradKernel(int out_size, const int* out2in_idx_data,
                                        const T* out2in_w_data,
                                        const T* out_grad_data,
                                        T* in_grad_data) {
-  CUDA_1D_KERNEL_LOOP(index, out_size * 4) {
+  CUDA_KERNEL_LOOP(index, out_size * 4) {
     int in_idx = out2in_idx_data[index];
     if (in_idx >= 0) {
       int out_idx = index / 4;
...
@@ -30,10 +30,6 @@ static inline int NumBlocks(const int N) {
                       kNumMaxinumNumBlocks);
 }
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __global__ void GPUSigmoidFocalLossForward(const T *x_data,
                                            const int *label_data,
@@ -41,7 +37,7 @@ __global__ void GPUSigmoidFocalLossForward(const T *x_data,
                                            const T gamma, const T alpha,
                                            const int num_classes,
                                            const int limit, T *out_data) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
     T x = x_data[i];
     int a = i / num_classes;  // current sample
     int d = i % num_classes;  // current class
@@ -79,7 +75,7 @@ __global__ void GPUSigmoidFocalLossBackward(
     const T *x_data, const int *label_data, const int *fg_num_data,
     const T gamma, const T alpha, const int num_classes, const T *dout_data,
     const int limit, T *dx_data) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
     T x = x_data[i];
     T dout = dout_data[i];
...
@@ -27,15 +27,11 @@ namespace operators {
 using framework::Tensor;
 using platform::DeviceContext;
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T, typename IndexT = int>
 __global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
                                  T* output, size_t index_size,
                                  size_t slice_size) {
-  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, index_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT gather_i = indices[indices_i];
@@ -49,7 +45,7 @@ __global__ void GatherNdCUDAKernel(const T* input, const int* input_dims,
                                    const IndexT* indices, T* output,
                                    size_t remain_size, size_t slice_size,
                                    size_t end_size) {
-  CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT gather_i = 0;
...
@@ -19,15 +19,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __global__ void GatherTree(const T *ids_data, const T *parents_data,
                            T *out_data, const int64_t max_length,
                            const int64_t batch_size, const int64_t beam_size) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size * beam_size) {
+  CUDA_KERNEL_LOOP(i, batch_size * beam_size) {
     int batch = i / beam_size;
     int beam = i % beam_size;
     auto idx =
...
@@ -27,10 +27,6 @@ using IndexType = int64_t;
 using Tensor = framework::Tensor;
 using platform::PADDLE_CUDA_NUM_THREADS;
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 inline int GET_BLOCKS(const int N) {
   return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
 }
...
@@ -35,8 +35,7 @@ using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 template <typename T>
 static __global__ void repeat_param(const T *input, T *output,
                                     const int repeat_num, const int C) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < repeat_num * C;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, repeat_num * C) {
     int index = i % C;
     output[i] = input[index];
   }
...
@@ -19,13 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
-  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+  CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
 }
 template <typename T>
...
@@ -24,10 +24,6 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
 namespace paddle {
 namespace operators {
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename Dtype>
 __device__ Dtype cuda_sigmoid(const Dtype x) {
   return Dtype(1) / (Dtype(1) + exp(-x));
@@ -42,7 +38,7 @@ template <typename T>
 __global__ void LSTMUnitKernel(const int nthreads, const int dim,
                                const T* C_prev, const T* X, T* C, T* H,
                                const T forget_bias) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     const int n = index / dim;
     const int d = index % dim;
@@ -65,7 +61,7 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
                                        const T* C_diff, const T* H_diff,
                                        T* C_prev_diff, T* X_diff,
                                        const T forget_bias) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     const int n = index / dim;
     const int d = index % dim;
     const T* X_offset = X + 4 * dim * n;
...
@@ -25,8 +25,7 @@ template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                    const int N, const int D,
                                    const int ignore_index) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, N) {
     PADDLE_ENFORCE(label[i] >= 0 && label[i] < D || label[i] == ignore_index,
                    "label[%d] expected >= 0 and < %ld, or == %ld, but got "
                    "%ld. Please check input value.",
...
@@ -75,8 +75,7 @@ template <typename T>
 __global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
                                  int num) {
   T tmp = 1.0 / width;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, num) {
     int h = i * tmp;
     int w = i - h * width;
     c[i] = a[i] + b[w];
...
@@ -23,10 +23,6 @@ namespace operators {
 using platform::PADDLE_CUDA_NUM_THREADS;
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __global__ void CountCUDAKernel(const int num_classes, const int count,
                                 const T* predictions, const T* labels,
@@ -42,7 +38,7 @@ __global__ void CountCUDAKernel(const int num_classes, const int count,
   T pred;
   T label;
-  CUDA_1D_KERNEL_LOOP(i, count) {
+  CUDA_KERNEL_LOOP(i, count) {
     pred = predictions[i];
     label = labels[i];
     if (pred == label) {
@@ -68,7 +64,7 @@ __global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
     valid_count_c = 0;
   }
   __syncthreads();
-  CUDA_1D_KERNEL_LOOP(i, num_classes) {
+  CUDA_KERNEL_LOOP(i, num_classes) {
     int wrong_n = wrong[i];
     int correct_n = correct[i];
     int denominator = wrong_n + correct_n;
...
@@ -23,9 +23,6 @@ namespace operators {
 using platform::PADDLE_CUDA_NUM_THREADS;
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 __global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg,
                                         const int bucket_length,
...
@@ -31,10 +31,6 @@ static inline int NumBlocks(const int N) {
                       kNumMaxinumNumBlocks);
 }
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
                                               const int64_t* label_data,
@@ -42,7 +38,7 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
                                               const int64_t batch_size,
                                               const int64_t n_classes,
                                               const int64_t ignore_index) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size) {
+  CUDA_KERNEL_LOOP(i, batch_size) {
     const int64_t cur_label = label_data[i];
     if (cur_label == ignore_index) {
       out_data[i] = 0;
@@ -191,7 +187,7 @@ __global__ void GPUNLLLossForward2D_no_reduce(
   const int64_t map_size = in_dim2 * in_dim3;
   const int64_t sample_size = n_classes * map_size;
   const int64_t out_numel = batch_size * map_size;
-  CUDA_1D_KERNEL_LOOP(i, out_numel) {
+  CUDA_KERNEL_LOOP(i, out_numel) {
     const int64_t b = i % batch_size;
     const int64_t h = (i / batch_size) % in_dim2;
     const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;
@@ -261,7 +257,7 @@ __global__ void GPUNLLLossBackward1D_no_reduce(
     T* dx_data, const int64_t* label_data, const T* weight_data,
     const T* dout_data, const int64_t batch_size, const int64_t n_classes,
     const int64_t ignore_index) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size) {
+  CUDA_KERNEL_LOOP(i, batch_size) {
     const int64_t cur_label = label_data[i];
     if (cur_label == ignore_index) {
       continue;
@@ -299,7 +295,7 @@ __global__ void GPUNLLLossBackward2D_no_reduce(
   const int64_t map_size = in_dim2 * in_dim3;
   const int64_t sample_size = n_classes * map_size;
   const int64_t out_numel = batch_size * map_size;
-  CUDA_1D_KERNEL_LOOP(i, out_numel) {
+  CUDA_KERNEL_LOOP(i, out_numel) {
     const int64_t b = i % batch_size;
     const int64_t h = (i / batch_size) % in_dim2;
     const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;
...
@@ -26,8 +26,7 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
                                    const T* g_norm, T* p_out, T* v_out) {
   T lr = learning_rate[0];
   T local_lr = learning_rate[0];
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, num) {
     if (p_norm[0] > 0 && g_norm[0] > 0) {
       local_lr = lr * lars_coeff * p_norm[0] /
                  (g_norm[0] + lars_weight_decay * p_norm[0]);
...
@@ -25,8 +25,7 @@ template <typename T>
 __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
                           const int num, T* p_out) {
   T lr = learning_rate[0];
-  int grid_size = blockDim.x * gridDim.x;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) {
+  CUDA_KERNEL_LOOP(i, num) {
     T g_data = g[i];
     T p_data = p[i];
     p_out[i] = p_data - lr * g_data;
...
@@ -23,10 +23,6 @@ namespace operators {
 using platform::PADDLE_CUDA_NUM_THREADS;
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 using framework::Tensor;
 template <typename T>
@@ -36,7 +32,7 @@ __global__ void Pad2DConstNCHW(const int nthreads, const T* in_data,
                                const int out_height, const int out_width,
                                const int pad_top, const int pad_left, T value,
                                T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int nc = index / out_width;
     const int out_w = index % out_width;
     const int out_h = nc % out_height;
@@ -57,7 +53,7 @@ __global__ void Pad2DConstNHWC(const int nthreads, const T* in_data,
                                const int out_height, const int out_width,
                                const int pad_top, const int pad_left, T value,
                                T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int n = index / channels;
     const int c = index % channels;
     const int out_w = n % out_width;
@@ -81,7 +77,7 @@ __global__ void Pad2DReflectNCHW(const int nthreads, const T* in_data,
                                  const int out_height, const int out_width,
                                  const int pad_top, const int pad_left,
                                  T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int nc = index / out_width;
     const int out_w = index % out_width;
     const int out_h = nc % out_height;
@@ -103,7 +99,7 @@ __global__ void Pad2DReflectNHWC(const int nthreads, const T* in_data,
                                  const int out_height, const int out_width,
                                  const int pad_top, const int pad_left,
                                  T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int n = index / channels;
     const int c = index % channels;
     const int out_w = n % out_width;
@@ -128,7 +124,7 @@ __global__ void Pad2DEdgeNCHW(const int nthreads, const T* in_data,
                               const int out_height, const int out_width,
                               const int pad_top, const int pad_left,
                               T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int nc = index / out_width;
     const int out_w = index % out_width;
     const int out_h = nc % out_height;
@@ -146,7 +142,7 @@ __global__ void Pad2DEdgeNHWC(const int nthreads, const T* in_data,
                               const int out_height, const int out_width,
                               const int pad_top, const int pad_left,
                               T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int n = index / channels;
     const int c = index % channels;
     const int out_w = n % out_width;
@@ -167,7 +163,7 @@ __global__ void Pad2DGradConstNCHW(const int in_size, T* d_in_data,
                                    const int out_height, const int out_width,
                                    const int pad_top, const int pad_left,
                                    const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(in_index, in_size) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
     int nc = in_index / in_width;
     const int out_w = in_index % in_width + pad_left;
     const int out_h = nc % in_height + pad_top;
@@ -184,7 +180,7 @@ __global__ void Pad2DGradConstNHWC(const int in_size, T* d_in_data,
                                    const int out_height, const int out_width,
                                    const int pad_top, const int pad_left,
                                    const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(in_index, in_size) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
     int n = in_index / channels;
     const int c = in_index % channels;
     const int out_w = n % in_width + pad_left;
@@ -204,7 +200,7 @@ __global__ void Pad2DGradReflectNCHW(const int out_size, T* d_in_data,
                                      const int out_height, const int out_width,
                                      const int pad_top, const int pad_left,
                                      const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
     int nc = out_index / out_width;
     const int out_w = out_index % out_width;
     const int out_h = nc % out_height;
@@ -228,7 +224,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size, T* d_in_data,
                                      const int out_height, const int out_width,
                                      const int pad_top, const int pad_left,
                                      const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
     const int c = out_index % channels;
     int n = out_index / channels;
     const int out_w = n % out_width;
@@ -254,7 +250,7 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size, T* d_in_data,
                                   const int out_height, const int out_width,
                                   const int pad_top, const int pad_left,
                                   const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
     int nc = out_index / out_width;
     const int out_w = out_index % out_width;
     const int out_h = nc % out_height;
@@ -274,7 +270,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
                                   const int out_height, const int out_width,
                                   const int pad_top, const int pad_left,
                                   const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
     const int c = out_index % channels;
     int n = out_index / channels;
     const int out_w = n % out_width;
...
@@ -25,11 +25,6 @@ using Tensor = framework::Tensor;
 #define CUDA_NUM_THREADS 1024
-// CUDA: grid stride looping
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 inline static int PADDLE_GET_BLOCKS(const int N) {
   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
 }
...
@@ -19,13 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __global__ void RangeKernel(T start, T step, int64_t size, T* out) {
-  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+  CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
 }
 template <typename T>
...
@@ -19,10 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 const int CUDA_NUM_THREADS = 1024;
 static inline int GET_BLOCKS(const int N) {
   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
...
@@ -31,10 +31,6 @@ static inline int NumBlocks(const int N) {
                       kNumMaxinumNumBlocks);
 }
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <class T>
 __device__ T BilinearInterpolate(const T* input_data, const int height,
                                  const int width, T y, T x) {
@@ -110,7 +106,7 @@ __global__ void GPUROIAlignForward(
     const float spatial_scale, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int sampling_ratio, int* roi_batch_id_data, T* output_data) {
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
     int pw = i % pooled_width;
     int ph = (i / pooled_width) % pooled_height;
     int c = (i / pooled_width / pooled_height) % channels;
@@ -165,7 +161,7 @@ __global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
                                     const int pooled_width,
                                     const int sampling_ratio,
                                     int* roi_batch_id_data, T* input_grad) {
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
     int pw = i % pooled_width;
     int ph = (i / pooled_width) % pooled_height;
     int c = (i / pooled_width / pooled_height) % channels;
...
@@ -26,14 +26,11 @@ namespace operators {
 using Tensor = framework::Tensor;
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T, typename IndexT = int>
 __global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output,
                                       size_t index_size, size_t slice_size,
                                       bool overwrite) {
-  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, index_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT scatter_i = indices[indices_i];
@@ -46,7 +43,7 @@ template <typename T, typename IndexT = int>
 __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices,
                                   T* output, size_t index_size,
                                   size_t slice_size, bool overwrite) {
-  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, index_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT scatter_i = indices[indices_i];
@@ -64,7 +61,7 @@ __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,
                                     T* output, const int* output_dims,
                                     size_t remain_size, size_t slice_size,
                                     size_t end_size) {
-  CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT gather_i = 0;
...
@@ -31,15 +31,11 @@ static inline int NumBlocks(const int N) {
                       kNumMaxinumNumBlocks);
 }
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T>
 __global__ void GPUSigmoidForward(const T *x_data, const T *label_data,
                                   const int ignore_index, const int limit,
                                   T *out_data, T *counts) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
     T x = x_data[i];
     T label = label_data[i];
     T eps = static_cast<T>(1e-5);
@@ -77,14 +73,14 @@ __global__ void Sum(const T *counts, int num, const T eps, T *sum) {
 template <typename T>
 __global__ void Div(T *loss, const int num, const T *norm) {
-  CUDA_1D_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; }
+  CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; }
 }
 template <typename T>
 __global__ void GPUSigmoidBackward(const T *x_data, const T *label_data,
                                    const int ignore_index, const T *dout_data,
                                    const int limit, T *dx_data, T *counts) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
     T x = x_data[i];
     T label = label_data[i];
     T dout = dout_data[i];
...
@@ -24,24 +24,22 @@ template <typename T>
 __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
                                  const int n, const int d, const int remain,
                                  const int ignore_index) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n * remain;
-       i += blockDim.x * gridDim.x) {
-    int idx_n = i / remain;
-    int idx_remain = i % remain;
-    int idx = idx_n * d + labels[i] * remain + idx_remain;
+  CUDA_KERNEL_LOOP(index, n * remain) {
+    int idx_n = index / remain;
+    int idx_remain = index % remain;
+    int idx = idx_n * d + labels[index] * remain + idx_remain;
     logit_grad[idx] -=
-        ignore_index == labels[i] ? static_cast<T>(0.) : static_cast<T>(1.);
+        ignore_index == labels[index] ? static_cast<T>(0.) : static_cast<T>(1.);
   }
 }
 template <typename T>
 __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
                       const int d, const int remain) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    int idx_n = i / d;
-    int idx_remain = i % remain;
-    logit_grad[i] *= loss_grad[idx_n * remain + idx_remain];
+  CUDA_KERNEL_LOOP(index, num) {
+    int idx_n = index / d;
+    int idx_remain = index % remain;
+    logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
   }
 }
...
@@ -29,10 +29,6 @@ using Tensor = framework::Tensor;
 using Dim3 = framework::Dim3;
 using Index3 = framework::Index3;
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 struct EqualTo {
   constexpr bool operator()(int a, int b) const { return a == b; }
 };
@@ -464,7 +460,7 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input,
   output_dims[pos1] = input_dims[1];
   output_dims[pos2] = input_dims[2];
-  CUDA_1D_KERNEL_LOOP(output_index, nthreads) {
+  CUDA_KERNEL_LOOP(output_index, nthreads) {
     Index3 output_tensor_index = ConvertTensorIndex(output_index, output_dims);
     Index3 input_tensor_index;
...
@@ -17,6 +17,7 @@
 #include <mutex>  // NOLINT
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 #if CUDA_VERSION < 9000
@@ -26,6 +27,54 @@ enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
 namespace paddle {
 namespace platform {
+/*
+ * Summary: Grid-stride looping macro for CUDA kernels
+ *
+ * [ Why is this macro needed? ]
+ *
+ * The original loop in a CUDA kernel is:
+ *
+ * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ *      i += blockDim.x * gridDim.x)`
+ *
+ * This loop condition is risky. The value of `blockIdx.x * blockDim.x`
+ * may be large, for example over one billion. The first iteration is fine,
+ * but once `i += blockDim.x * gridDim.x` is executed, the value of i can
+ * exceed INT_MAX and overflow to a negative value. At that point the loop
+ * condition `i < (n)` is still satisfied, so the kernel performs an
+ * illegal access to CUDA memory.
+ *
+ * Here is a real case from ERNIE that triggers the error above.
+ * The related data are:
+ *   - blockIdx.x = 2172938
+ *   - blockDim.x = 512
+ *   - blockIdx.x * blockDim.x = 1112543864
+ *   - INT_MAX = 2147483647
+ *
+ * So we polish the loop as follows: the int64_t __index__ prevents the
+ * loop increment from overflowing.
+ *
+ * Parameters:
+ *   - i: loop index
+ *   - num: total number of elements
+ *
+ * Example:
+ *   template <typename T>
+ *   __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
+ *                         const int d, const int remain) {
+ *     CUDA_KERNEL_LOOP(index, num) {
+ *       int idx_n = index / d;
+ *       int idx_remain = index % remain;
+ *       logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
+ *     }
+ *   }
+ */
+#define CUDA_KERNEL_LOOP(i, num)                             \
+  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
+  for (int i = __index__; __index__ < (num);                 \
+       __index__ += blockDim.x * gridDim.x, i = __index__)
 class CublasHandleHolder {
  public:
   CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
...
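A note on the shape of the macro above (a reading of its definition, not additional behavior from the patch): the loop-carried counter __index__ is an int64_t, while the user-visible index i stays a plain int, so existing kernel bodies compile unchanged. For a hypothetical kernel body, the expansion looks roughly like this:

    // CUDA_KERNEL_LOOP(i, n) { out[i] = in[i]; }  expands approximately to:
    int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = __index__; __index__ < (n);
         __index__ += blockDim.x * gridDim.x, i = __index__) {
      out[i] = in[i];
    }
    // The comparison and the increment operate on the 64-bit __index__, so
    // they cannot overflow; i is refreshed from __index__ on every iteration.
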
@@ -25,13 +25,14 @@
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/cuda_helper.h"
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 using paddle::platform::float16;
 template <typename T>
 __global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, num) {
     paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
   }
 }
@@ -191,10 +192,7 @@ __forceinline__ __device__ T BlockReduce(T val) {
 template <typename T>
 __global__ void DeviceReduceSum(T* in, T* out, size_t N) {
   T sum(0);
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    sum += in[i];
-  }
+  CUDA_KERNEL_LOOP(i, N) { sum += in[i]; }
   sum = BlockReduce<T>(sum);
   __syncthreads();
   if (threadIdx.x == 0) out[blockIdx.x] = sum;
...