Unverified Commit e947d11e authored by Chen Weihang, committed by GitHub

Fix index overflow bug of the CUDA kernel loop increment (#25435) (#25727)

* fix softmax_with_cross_entropy cuda kernel overflow bug, test=develop

* replace old macro & for condition, test=develop

* polish details, test=develop
Parent 54100355
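The change below is mechanical: every file-local grid-stride macro (CUDA_KERNEL_LOOP / CUDA_1D_KERNEL_LOOP) and hand-written for loop is replaced by a single overflow-safe macro defined in paddle/fluid/platform/cuda_helper.h. For orientation, the two forms are shown side by side here, copied from the diff; the _OLD suffix is added only for this comparison and is not part of the commit.

// Old per-file form: i is a 32-bit int, so the increment
// i += blockDim.x * gridDim.x can push it past INT_MAX; the wrapped
// (negative) value still satisfies i < (n) and the kernel reads out of bounds.
#define CUDA_KERNEL_LOOP_OLD(i, n)                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);     \
       i += blockDim.x * gridDim.x)

// New shared form (added to cuda_helper.h by this commit): the running
// index lives in an int64_t, so the increment cannot overflow; i is
// refreshed from it on every iteration.
#define CUDA_KERNEL_LOOP(i, num)                                   \
  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x;       \
  for (int i = __index__; __index__ < (num);                       \
       __index__ += blockDim.x * gridDim.x, i = __index__)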
......@@ -23,9 +23,6 @@
namespace paddle {
namespace framework {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
__global__ void PullCopy(float** dest, const boxps::FeatureValueGpu* src,
const int64_t* len, int hidden, int slot_num,
......
......@@ -22,10 +22,7 @@
#include "paddle/fluid/platform/place.h"
__global__ void test(size_t* a, int size) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
i += blockDim.x * gridDim.x) {
a[i] *= 2;
}
CUDA_KERNEL_LOOP(i, size) { a[i] *= 2; }
}
TEST(LoD, data) {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cublas.h>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/batch_fc_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using framework::Tensor;
const int CUDA_NUM_THREADS = 1024;
static inline int GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
template <typename T>
__global__ void add_bias_kernel(T* data, int slot_pairs_num, int ins_num,
int out_dim, const T* bias) {
CUDA_KERNEL_LOOP(idx, slot_pairs_num * ins_num * out_dim) {
int block_len = ins_num * out_dim;
int slot_index = idx / block_len;
int out_dim_index = (idx % block_len) % out_dim;
T temp = data[idx] + bias[slot_index * out_dim + out_dim_index];
data[idx] = temp;
}
}
template <typename T>
void add_bias(cudaStream_t stream, T* data, int slot_pairs_num, int ins_num,
int out_dim, const T* bias) {
add_bias_kernel<<<GET_BLOCKS(slot_pairs_num * ins_num * out_dim),
CUDA_NUM_THREADS, 0, stream>>>(data, slot_pairs_num,
ins_num, out_dim, bias);
}
template <typename T>
__global__ void add_bias_grad_kernel(const T* dout_data, int slot_pairs_num,
int ins_num, int out_dim, T* db_data) {
CUDA_KERNEL_LOOP(idx, slot_pairs_num * out_dim) {
int row = idx / out_dim;
int col = idx % out_dim;
T temp = static_cast<T>(0);
for (int i = 0; i < ins_num; ++i) {
int select_indx = ((row + 1) * i + 1) * col;
temp += dout_data[select_indx];
}
db_data[idx] += temp;
}
}
template <typename T>
void add_bias_grad(cudaStream_t stream, const T* dout_data, int slot_pairs_num,
int ins_num, int out_dim, T* db_data) {
add_bias_grad_kernel<<<GET_BLOCKS(slot_pairs_num * out_dim), CUDA_NUM_THREADS,
0, stream>>>(dout_data, slot_pairs_num, ins_num,
out_dim, db_data);
}
template <typename DeviceContext, typename T>
class BatchFCCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// X.dim = slot_pairs_num * ins_num * in_dim
// W.dim = slot_pairs_num * in_dim * out_dim
// b.dim = slot_pairs_num * out_dim
// output.dim = slot_pairs_num * ins_num * out_dim
auto* input = ctx.Input<framework::LoDTensor>("Input");
auto* w = ctx.Input<Tensor>("W");
auto* bias = ctx.Input<Tensor>("Bias");
auto* output = ctx.Output<framework::LoDTensor>("Out");
auto input_dims = input->dims();
auto w_dims = w->dims();
auto slot_pairs_num = input_dims[0];
auto ins_num = input_dims[1];
auto in_dim = input_dims[2];
auto out_dim = w_dims[2];
// get data ptr
const T* in_data = input->data<T>();
const T* w_data = w->data<T>();
const T* bias_data = bias->data<T>();
output->Resize({slot_pairs_num, ins_num, out_dim});
T* out_data = output->mutable_data<T>(ctx.GetPlace());
// initialize
auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
.eigen_device();
out_eigen.device(place) = out_eigen.constant(static_cast<T>(0));
CBLAS_TRANSPOSE transA = CblasNoTrans;
CBLAS_TRANSPOSE transB = CblasNoTrans;
T alpha = 1;
T beta = 0;
int64_t strideA = ins_num * in_dim;
int64_t strideB = in_dim * out_dim;
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
blas.BatchedGEMM(transA, transB, ins_num, out_dim, in_dim, alpha, in_data,
w_data, beta, out_data, slot_pairs_num, strideA, strideB);
add_bias<T>(ctx.cuda_device_context().stream(), out_data, slot_pairs_num,
ins_num, out_dim, bias_data);
}
};
template <typename DeviceContext, typename T>
class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("Input");
auto* w = ctx.Input<Tensor>("W");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto* dw = ctx.Output<Tensor>(framework::GradVarName("W"));
auto* db = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto input_dims = input->dims();
auto w_dims = w->dims();
auto slot_pairs_num = input_dims[0];
auto ins_num = input_dims[1];
auto in_dim = input_dims[2];
auto out_dim = w_dims[2];
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
.eigen_device();
// initialize
dx->mutable_data<T>(ctx.GetPlace());
auto dx_eigen = framework::EigenVector<T>::Flatten(*dx);
dx_eigen.device(place) = dx_eigen.constant(static_cast<T>(0));
dw->mutable_data<T>(ctx.GetPlace());
auto dw_eigen = framework::EigenVector<T>::Flatten(*dw);
dw_eigen.device(place) = dw_eigen.constant(static_cast<T>(0));
// get data ptr
const T* x_data = input->data<T>();
const T* w_data = w->data<T>();
const T* dout_data = dout->data<T>();
T* dx_data = dx->data<T>();
T* dw_data = dw->data<T>();
db->mutable_data<T>(ctx.GetPlace());
auto db_eigen = framework::EigenVector<T>::Flatten(*db);
db_eigen.device(place) = db_eigen.constant(static_cast<T>(0));
T* db_data = db->data<T>();
add_bias_grad<T>(ctx.cuda_device_context().stream(), dout_data,
slot_pairs_num, ins_num, out_dim, db_data);
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
T alpha = 1;
T beta = 0;
// dx = dout_data * y^T
blas.BatchedGEMM(CblasNoTrans, CblasTrans, ins_num, in_dim, out_dim, alpha,
dout_data, w_data, beta, dx_data, slot_pairs_num,
ins_num * out_dim, out_dim * in_dim);
// dy = x^T * dout_data
blas.BatchedGEMM(CblasTrans, CblasNoTrans, in_dim, out_dim, ins_num, alpha,
x_data, dout_data, beta, dw_data, slot_pairs_num,
in_dim * ins_num, ins_num * out_dim);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using GPUCtx = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(batch_fc, ops::BatchFCCUDAKernel<GPUCtx, float>,
ops::BatchFCCUDAKernel<GPUCtx, double>);
REGISTER_OP_CUDA_KERNEL(batch_fc_grad,
ops::BatchFCGradOpCUDAKernel<GPUCtx, float>,
ops::BatchFCGradOpCUDAKernel<GPUCtx, double>);
......@@ -24,14 +24,10 @@ namespace operators {
using Tensor = framework::Tensor;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GPUBCELossForward(const T* x_data, const T* label_data,
T* out_data, const int in_numel) {
CUDA_1D_KERNEL_LOOP(i, in_numel) {
CUDA_KERNEL_LOOP(i, in_numel) {
T x = x_data[i];
T label = label_data[i];
T one = static_cast<T>(1.);
......@@ -48,7 +44,7 @@ template <typename T>
__global__ void GPUBCELossBackward(const T* x_data, const T* label_data,
const T* dout_data, T* dx_data,
const int in_numel) {
CUDA_1D_KERNEL_LOOP(i, in_numel) {
CUDA_KERNEL_LOOP(i, in_numel) {
T x = x_data[i];
T label = label_data[i];
T dout = dout_data[i];
......
......@@ -25,10 +25,6 @@ using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void CvmComputeKernel(const bool use_cvm, const int64_t item_width,
const T* X, T* Y, int64_t numel) {
......
......@@ -30,10 +30,6 @@ using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
inline int GET_BLOCKS(const int N) {
return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
}
......
......@@ -40,10 +40,6 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
static inline int GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
......
......@@ -24,8 +24,7 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
const int width, const T offset) {
int num_anchors = as_num * ar_num;
int box_num = height * width * num_anchors;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, box_num) {
int h_idx = i / (num_anchors * width);
int w_idx = (i / num_anchors) % width;
T stride_width = stride[0];
......@@ -64,10 +63,7 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
template <typename T>
__global__ void SetVariance(T* out, const T* var, const int vnum,
const int num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
out[i] = var[i % vnum];
}
CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; }
}
template <typename T>
......
......@@ -40,8 +40,7 @@ static inline int NumBlocks(const int N) {
static __global__ void GetLengthLoD(const int nthreads, const int* batch_ids,
int* length_lod) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (nthreads);
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, nthreads) {
platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
}
}
......
......@@ -31,10 +31,6 @@ using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 64;
static constexpr int kNumMaxinumNumBlocks = 4096;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const BBoxSize = 4;
static inline int NumBlocks(const int N) {
......@@ -48,7 +44,7 @@ __global__ void GPUDistFpnProposalsHelper(
const int refer_level, const int refer_scale, const int max_level,
const int min_level, int* roi_batch_id_data, int* sub_lod_list,
int* target_lvls) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
CUDA_KERNEL_LOOP(i, nthreads) {
const T* offset_roi = rois + i * BBoxSize;
int roi_batch_ind = roi_batch_id_data[i];
// get the target level of current rois
......
......@@ -33,9 +33,6 @@ using LoDTensor = framework::LoDTensor;
namespace {
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const kThreadsPerBlock = sizeof(uint64_t) * 8;
......@@ -155,7 +152,7 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
int cnt = 0;
__shared__ int keep_index[BlockSize];
CUDA_1D_KERNEL_LOOP(i, num) {
CUDA_KERNEL_LOOP(i, num) {
keep_index[threadIdx.x] = -1;
__syncthreads();
......
......@@ -32,8 +32,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
bool min_max_aspect_ratios_order) {
int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
int box_num = height * width * num_priors;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, box_num) {
int h = i / (num_priors * width);
int w = (i / num_priors) % width;
int p = i % num_priors;
......@@ -87,10 +86,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
template <typename T>
__global__ void SetVariance(T* out, const T* var, const int vnum,
const int num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
out[i] = var[i % vnum];
}
CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; }
}
template <typename T>
......
......@@ -30,10 +30,6 @@ namespace operators {
#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2)
#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__device__ bool GT_E(T a, T b) {
return (a > b) || Eigen::numext::abs(a - b) < 1e-4;
......@@ -284,7 +280,7 @@ __global__ void RoiTransformKernel(const float* input_data,
int* mask, T* transform_matrix) {
int output_size =
num_rois * transformed_height * transformed_width * channels;
CUDA_1D_KERNEL_LOOP(index, output_size) {
CUDA_KERNEL_LOOP(index, output_size) {
// (n, c, out_h, out_w) is an element in the transformed output
int out_w = idx4_4(index, num_rois, channels, transformed_height,
transformed_width);
......@@ -463,7 +459,7 @@ __global__ void RoiTransformGradKernel(int out_size, const int* out2in_idx_data,
const T* out2in_w_data,
const T* out_grad_data,
T* in_grad_data) {
CUDA_1D_KERNEL_LOOP(index, out_size * 4) {
CUDA_KERNEL_LOOP(index, out_size * 4) {
int in_idx = out2in_idx_data[index];
if (in_idx >= 0) {
int out_idx = index / 4;
......
......@@ -30,10 +30,6 @@ static inline int NumBlocks(const int N) {
kNumMaxinumNumBlocks);
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GPUSigmoidFocalLossForward(const T *x_data,
const int *label_data,
......@@ -41,7 +37,7 @@ __global__ void GPUSigmoidFocalLossForward(const T *x_data,
const T gamma, const T alpha,
const int num_classes,
const int limit, T *out_data) {
CUDA_1D_KERNEL_LOOP(i, limit) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
int a = i / num_classes; // current sample
int d = i % num_classes; // current class
......@@ -79,7 +75,7 @@ __global__ void GPUSigmoidFocalLossBackward(
const T *x_data, const int *label_data, const int *fg_num_data,
const T gamma, const T alpha, const int num_classes, const T *dout_data,
const int limit, T *dx_data) {
CUDA_1D_KERNEL_LOOP(i, limit) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
T dout = dout_data[i];
......
......@@ -27,15 +27,11 @@ namespace operators {
using framework::Tensor;
using platform::DeviceContext;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T, typename IndexT = int>
__global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
T* output, size_t index_size,
size_t slice_size) {
CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
CUDA_KERNEL_LOOP(i, index_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = indices[indices_i];
......@@ -49,7 +45,7 @@ __global__ void GatherNdCUDAKernel(const T* input, const int* input_dims,
const IndexT* indices, T* output,
size_t remain_size, size_t slice_size,
size_t end_size) {
CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = 0;
......
......@@ -19,15 +19,11 @@ limitations under the License. */
namespace paddle {
namespace operators {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GatherTree(const T *ids_data, const T *parents_data,
T *out_data, const int64_t max_length,
const int64_t batch_size, const int64_t beam_size) {
CUDA_1D_KERNEL_LOOP(i, batch_size * beam_size) {
CUDA_KERNEL_LOOP(i, batch_size * beam_size) {
int batch = i / beam_size;
int beam = i % beam_size;
auto idx =
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/histogram_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/gpu_launch_config.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using IndexType = int64_t;
using Tensor = framework::Tensor;
using platform::PADDLE_CUDA_NUM_THREADS;
inline int GET_BLOCKS(const int N) {
return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
}
template <typename T, typename IndexType>
__device__ static IndexType GetBin(T bVal, T minvalue, T maxvalue,
int64_t nbins) {
IndexType bin =
static_cast<int>((bVal - minvalue) * nbins / (maxvalue - minvalue));
if (bin == nbins) bin -= 1;
return bin;
}
template <typename T, typename IndexType>
__global__ void KernelHistogram(const T* input, const int totalElements,
const int64_t nbins, const T minvalue,
const T maxvalue, int64_t* output) {
CUDA_KERNEL_LOOP(linearIndex, totalElements) {
const IndexType inputIdx = threadIdx.x + blockIdx.x * blockDim.x;
const auto inputVal = input[inputIdx];
if (inputVal >= minvalue && inputVal <= maxvalue) {
const IndexType bin =
GetBin<T, IndexType>(inputVal, minvalue, maxvalue, nbins);
const IndexType outputIdx = bin < nbins - 1 ? bin : nbins - 1;
paddle::platform::CudaAtomicAdd(&output[outputIdx], 1);
}
}
}
template <typename DeviceContext, typename T>
class HistogramCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(context.GetPlace()), true,
platform::errors::InvalidArgument("It must use CUDAPlace."));
const Tensor* input = context.Input<framework::Tensor>("X");
Tensor* output = context.Output<framework::Tensor>("Out");
auto& nbins = context.Attr<int64_t>("bins");
auto& minval = context.Attr<int>("min");
auto& maxval = context.Attr<int>("max");
const T* input_data = input->data<T>();
const int input_numel = input->numel();
T output_min = static_cast<T>(minval);
T output_max = static_cast<T>(maxval);
if (output_min == output_max) {
auto input_x = framework::EigenVector<T>::Flatten(*input);
framework::Tensor input_min_t, input_max_t;
auto* input_min_data =
input_min_t.mutable_data<T>({1}, context.GetPlace());
auto* input_max_data =
input_max_t.mutable_data<T>({1}, context.GetPlace());
auto input_min_scala = framework::EigenScalar<T>::From(input_min_t);
auto input_max_scala = framework::EigenScalar<T>::From(input_max_t);
auto* place =
context.template device_context<DeviceContext>().eigen_device();
input_min_scala.device(*place) = input_x.minimum();
input_max_scala.device(*place) = input_x.maximum();
Tensor input_min_cpu, input_max_cpu;
TensorCopySync(input_min_t, platform::CPUPlace(), &input_min_cpu);
TensorCopySync(input_max_t, platform::CPUPlace(), &input_max_cpu);
output_min = input_min_cpu.data<T>()[0];
output_max = input_max_cpu.data<T>()[0];
}
if (output_min == output_max) {
output_min = output_min - 1;
output_max = output_max + 1;
}
PADDLE_ENFORCE_EQ(
(std::isinf(static_cast<float>(output_min)) ||
std::isnan(static_cast<float>(output_max)) ||
std::isinf(static_cast<float>(output_min)) ||
std::isnan(static_cast<float>(output_max))),
false, platform::errors::OutOfRange("range of min, max is not finite"));
PADDLE_ENFORCE_GE(
output_max, output_min,
platform::errors::InvalidArgument(
"max must be larger or equal to min. If min and max are both zero, "
"the minimum and maximum values of the data are used. "
"But received max is %d, min is %d",
maxval, minval));
int64_t* out_data = output->mutable_data<int64_t>(context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, int64_t>()(
context.template device_context<platform::CUDADeviceContext>(), output,
static_cast<int64_t>(0));
auto stream =
context.template device_context<platform::CUDADeviceContext>().stream();
KernelHistogram<T, IndexType><<<GET_BLOCKS(input_numel),
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
input_data, input_numel, nbins, output_min, output_max, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
histogram,
ops::HistogramCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::HistogramCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::HistogramCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::HistogramCUDAKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -35,8 +35,7 @@ using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T>
static __global__ void repeat_param(const T *input, T *output,
const int repeat_num, const int C) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < repeat_num * C;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, repeat_num * C) {
int index = i % C;
output[i] = input[index];
}
......
......@@ -19,13 +19,9 @@ limitations under the License. */
namespace paddle {
namespace operators {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T>
......
......@@ -24,10 +24,6 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
namespace paddle {
namespace operators {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename Dtype>
__device__ Dtype cuda_sigmoid(const Dtype x) {
return Dtype(1) / (Dtype(1) + exp(-x));
......@@ -42,7 +38,7 @@ template <typename T>
__global__ void LSTMUnitKernel(const int nthreads, const int dim,
const T* C_prev, const T* X, T* C, T* H,
const T forget_bias) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int n = index / dim;
const int d = index % dim;
......@@ -65,7 +61,7 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
const T* C_diff, const T* H_diff,
T* C_prev_diff, T* X_diff,
const T forget_bias) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int n = index / dim;
const int d = index % dim;
const T* X_offset = X + 4 * dim * n;
......
......@@ -25,8 +25,7 @@ template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
const int N, const int D,
const int ignore_index) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, N) {
PADDLE_ENFORCE(label[i] >= 0 && label[i] < D || label[i] == ignore_index,
"label[%d] expected >= 0 and < %ld, or == %ld, but got "
"%ld. Please check input value.",
......
......@@ -75,8 +75,7 @@ template <typename T>
__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
int num) {
T tmp = 1.0 / width;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, num) {
int h = i * tmp;
int w = i - h * width;
c[i] = a[i] + b[w];
......
......@@ -23,10 +23,6 @@ namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void CountCUDAKernel(const int num_classes, const int count,
const T* predictions, const T* labels,
......@@ -42,7 +38,7 @@ __global__ void CountCUDAKernel(const int num_classes, const int count,
T pred;
T label;
CUDA_1D_KERNEL_LOOP(i, count) {
CUDA_KERNEL_LOOP(i, count) {
pred = predictions[i];
label = labels[i];
if (pred == label) {
......@@ -68,7 +64,7 @@ __global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
valid_count_c = 0;
}
__syncthreads();
CUDA_1D_KERNEL_LOOP(i, num_classes) {
CUDA_KERNEL_LOOP(i, num_classes) {
int wrong_n = wrong[i];
int correct_n = correct[i];
int denominator = wrong_n + correct_n;
......
......@@ -23,9 +23,6 @@ namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
__global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg,
const int bucket_length,
......
......@@ -30,10 +30,6 @@ static inline int NumBlocks(const int N) {
kNumMaxinumNumBlocks);
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
const int64_t* label_data,
......@@ -41,7 +37,7 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
const int64_t batch_size,
const int64_t n_classes,
const int64_t ignore_index) {
CUDA_1D_KERNEL_LOOP(i, batch_size) {
CUDA_KERNEL_LOOP(i, batch_size) {
const int64_t cur_label = label_data[i];
if (cur_label == ignore_index) {
out_data[i] = 0;
......@@ -190,7 +186,7 @@ __global__ void GPUNLLLossForward2D_no_reduce(
const int64_t map_size = in_dim2 * in_dim3;
const int64_t sample_size = n_classes * map_size;
const int64_t out_numel = batch_size * map_size;
CUDA_1D_KERNEL_LOOP(i, out_numel) {
CUDA_KERNEL_LOOP(i, out_numel) {
const int64_t b = i % batch_size;
const int64_t h = (i / batch_size) % in_dim2;
const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;
......@@ -260,7 +256,7 @@ __global__ void GPUNLLLossBackward1D_no_reduce(
T* dx_data, const int64_t* label_data, const T* weight_data,
const T* dout_data, const int64_t batch_size, const int64_t n_classes,
const int64_t ignore_index) {
CUDA_1D_KERNEL_LOOP(i, batch_size) {
CUDA_KERNEL_LOOP(i, batch_size) {
const int64_t cur_label = label_data[i];
if (cur_label == ignore_index) {
continue;
......@@ -298,7 +294,7 @@ __global__ void GPUNLLLossBackward2D_no_reduce(
const int64_t map_size = in_dim2 * in_dim3;
const int64_t sample_size = n_classes * map_size;
const int64_t out_numel = batch_size * map_size;
CUDA_1D_KERNEL_LOOP(i, out_numel) {
CUDA_KERNEL_LOOP(i, out_numel) {
const int64_t b = i % batch_size;
const int64_t h = (i / batch_size) % in_dim2;
const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;
......
......@@ -26,8 +26,7 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
const T* g_norm, T* p_out, T* v_out) {
T lr = learning_rate[0];
T local_lr = learning_rate[0];
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, num) {
if (p_norm[0] > 0 && g_norm[0] > 0) {
local_lr = lr * lars_coeff * p_norm[0] /
(g_norm[0] + lars_weight_decay * p_norm[0]);
......
......@@ -25,8 +25,7 @@ template <typename T>
__global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
const int num, T* p_out) {
T lr = learning_rate[0];
int grid_size = blockDim.x * gridDim.x;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) {
CUDA_KERNEL_LOOP(i, num) {
T g_data = g[i];
T p_data = p[i];
p_out[i] = p_data - lr * g_data;
......
......@@ -23,10 +23,6 @@ namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
using framework::Tensor;
template <typename T>
......@@ -36,7 +32,7 @@ __global__ void Pad2DConstNCHW(const int nthreads, const T* in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left, T value,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
int nc = index / out_width;
const int out_w = index % out_width;
const int out_h = nc % out_height;
......@@ -57,7 +53,7 @@ __global__ void Pad2DConstNHWC(const int nthreads, const T* in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left, T value,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
int n = index / channels;
const int c = index % channels;
const int out_w = n % out_width;
......@@ -81,7 +77,7 @@ __global__ void Pad2DReflectNCHW(const int nthreads, const T* in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
int nc = index / out_width;
const int out_w = index % out_width;
const int out_h = nc % out_height;
......@@ -103,7 +99,7 @@ __global__ void Pad2DReflectNHWC(const int nthreads, const T* in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
int n = index / channels;
const int c = index % channels;
const int out_w = n % out_width;
......@@ -128,7 +124,7 @@ __global__ void Pad2DEdgeNCHW(const int nthreads, const T* in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
int nc = index / out_width;
const int out_w = index % out_width;
const int out_h = nc % out_height;
......@@ -146,7 +142,7 @@ __global__ void Pad2DEdgeNHWC(const int nthreads, const T* in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
T* out_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
CUDA_KERNEL_LOOP(index, nthreads) {
int n = index / channels;
const int c = index % channels;
const int out_w = n % out_width;
......@@ -167,7 +163,7 @@ __global__ void Pad2DGradConstNCHW(const int in_size, T* d_in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(in_index, in_size) {
CUDA_KERNEL_LOOP(in_index, in_size) {
int nc = in_index / in_width;
const int out_w = in_index % in_width + pad_left;
const int out_h = nc % in_height + pad_top;
......@@ -184,7 +180,7 @@ __global__ void Pad2DGradConstNHWC(const int in_size, T* d_in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(in_index, in_size) {
CUDA_KERNEL_LOOP(in_index, in_size) {
int n = in_index / channels;
const int c = in_index % channels;
const int out_w = n % in_width + pad_left;
......@@ -204,7 +200,7 @@ __global__ void Pad2DGradReflectNCHW(const int out_size, T* d_in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(out_index, out_size) {
CUDA_KERNEL_LOOP(out_index, out_size) {
int nc = out_index / out_width;
const int out_w = out_index % out_width;
const int out_h = nc % out_height;
......@@ -228,7 +224,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size, T* d_in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(out_index, out_size) {
CUDA_KERNEL_LOOP(out_index, out_size) {
const int c = out_index % channels;
int n = out_index / channels;
const int out_w = n % out_width;
......@@ -254,7 +250,7 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size, T* d_in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(out_index, out_size) {
CUDA_KERNEL_LOOP(out_index, out_size) {
int nc = out_index / out_width;
const int out_w = out_index % out_width;
const int out_h = nc % out_height;
......@@ -274,7 +270,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
const int out_height, const int out_width,
const int pad_top, const int pad_left,
const T* d_out_data) {
CUDA_1D_KERNEL_LOOP(out_index, out_size) {
CUDA_KERNEL_LOOP(out_index, out_size) {
const int c = out_index % channels;
int n = out_index / channels;
const int out_w = n % out_width;
......
......@@ -25,11 +25,6 @@ using Tensor = framework::Tensor;
#define CUDA_NUM_THREADS 1024
// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
inline static int PADDLE_GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
......
......@@ -19,13 +19,9 @@ limitations under the License. */
namespace paddle {
namespace operators {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T>
......
......@@ -19,10 +19,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
static inline int GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
......
......@@ -31,10 +31,6 @@ static inline int NumBlocks(const int N) {
kNumMaxinumNumBlocks);
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <class T>
__device__ T BilinearInterpolate(const T* input_data, const int height,
const int width, T y, T x) {
......@@ -110,7 +106,7 @@ __global__ void GPUROIAlignForward(
const float spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int sampling_ratio, int* roi_batch_id_data, T* output_data) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
CUDA_KERNEL_LOOP(i, nthreads) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
int c = (i / pooled_width / pooled_height) % channels;
......@@ -165,7 +161,7 @@ __global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
const int pooled_width,
const int sampling_ratio,
int* roi_batch_id_data, T* input_grad) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
CUDA_KERNEL_LOOP(i, nthreads) {
int pw = i % pooled_width;
int ph = (i / pooled_width) % pooled_height;
int c = (i / pooled_width / pooled_height) % channels;
......
......@@ -26,14 +26,11 @@ namespace operators {
using Tensor = framework::Tensor;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T, typename IndexT = int>
__global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output,
size_t index_size, size_t slice_size,
bool overwrite) {
CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
CUDA_KERNEL_LOOP(i, index_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT scatter_i = indices[indices_i];
......@@ -46,7 +43,7 @@ template <typename T, typename IndexT = int>
__global__ void ScatterCUDAKernel(const T* params, const IndexT* indices,
T* output, size_t index_size,
size_t slice_size, bool overwrite) {
CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
CUDA_KERNEL_LOOP(i, index_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT scatter_i = indices[indices_i];
......@@ -64,7 +61,7 @@ __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,
T* output, const int* output_dims,
size_t remain_size, size_t slice_size,
size_t end_size) {
CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = 0;
......
......@@ -31,15 +31,11 @@ static inline int NumBlocks(const int N) {
kNumMaxinumNumBlocks);
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GPUSigmoidForward(const T *x_data, const T *label_data,
const int ignore_index, const int limit,
T *out_data, T *counts) {
CUDA_1D_KERNEL_LOOP(i, limit) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
T label = label_data[i];
T eps = static_cast<T>(1e-5);
......@@ -77,14 +73,14 @@ __global__ void Sum(const T *counts, int num, const T eps, T *sum) {
template <typename T>
__global__ void Div(T *loss, const int num, const T *norm) {
CUDA_1D_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; }
CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; }
}
template <typename T>
__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data,
const int ignore_index, const T *dout_data,
const int limit, T *dx_data, T *counts) {
CUDA_1D_KERNEL_LOOP(i, limit) {
CUDA_KERNEL_LOOP(i, limit) {
T x = x_data[i];
T label = label_data[i];
T dout = dout_data[i];
......
......@@ -24,11 +24,10 @@ template <typename T>
__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
const int n, const int d, const int remain,
const int ignore_index) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n * remain;
i += blockDim.x * gridDim.x) {
int idx_n = i / remain;
int idx_remain = i % remain;
int tmp = labels[i];
CUDA_KERNEL_LOOP(index, n * remain) {
int idx_n = index / remain;
int idx_remain = index % remain;
int tmp = labels[index];
if (ignore_index != tmp) {
int idx = idx_n * d + tmp * remain + idx_remain;
logit_grad[idx] -= static_cast<T>(1.);
......@@ -39,11 +38,10 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
template <typename T>
__global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
const int d, const int remain) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
int idx_n = i / d;
int idx_remain = i % remain;
logit_grad[i] *= loss_grad[idx_n * remain + idx_remain];
CUDA_KERNEL_LOOP(index, num) {
int idx_n = index / d;
int idx_remain = index % remain;
logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
}
}
......
......@@ -29,10 +29,6 @@ using Tensor = framework::Tensor;
using Dim3 = framework::Dim3;
using Index3 = framework::Index3;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
struct EqualTo {
constexpr bool operator()(int a, int b) const { return a == b; }
};
......@@ -464,7 +460,7 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input,
output_dims[pos1] = input_dims[1];
output_dims[pos2] = input_dims[2];
CUDA_1D_KERNEL_LOOP(output_index, nthreads) {
CUDA_KERNEL_LOOP(output_index, nthreads) {
Index3 output_tensor_index = ConvertTensorIndex(output_index, output_dims);
Index3 input_tensor_index;
......
......@@ -17,6 +17,7 @@
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#if CUDA_VERSION < 9000
......@@ -26,6 +27,54 @@ enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
namespace paddle {
namespace platform {
/*
* Summary: Grid stride looping macro in CUDA kernel
*
* [ Why is this macro needed? ]
*
* The original looping in CUDA kernel is:
*
* `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
* i += blockDim.x * gridDim.x)`
*
* This for condition is risky. The value of `blockIdx.x * blockDim.x`
* may be large, e.g. over 10^9. The first iteration is fine, but once
* `i += blockDim.x * gridDim.x` is executed, the value of i can exceed
* INT_MAX and overflow to a negative value. At that point the loop
* condition `i < (n)` is still satisfied, so the kernel makes illegal
* accesses to CUDA memory.
*
* Here is a real example from ERNIE that triggers the above error.
* The related data are:
* - blockIdx.x = 2172938
* - blockDim.x = 512
* - blockIdx.x * blockDim.x = 1112544256
* - INT_MAX = 2147483647
*
* So we polish the for condition as follows; the int64_t __index__
* prevents overflow in the loop increment.
*
* Parameters:
* - i: loop index
* - num: total number of elements
*
* Examples:
* template <typename T>
* __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
* const int d, const int remain) {
* CUDA_KERNEL_LOOP(index, num) {
* int idx_n = index / d;
* int idx_remain = index % remain;
* logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
* }
* }
*
*/
#define CUDA_KERNEL_LOOP(i, num) \
int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
for (int i = __index__; __index__ < (num); \
__index__ += blockDim.x * gridDim.x, i = __index__)
class CublasHandleHolder {
public:
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
......
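To make the overflow described in the comment concrete: with blockIdx.x = 2172938 and blockDim.x = 512, the starting index is already above 1.1e9, and since gridDim.x must be larger than blockIdx.x, the stride blockDim.x * gridDim.x is at least as large, so the index after one increment exceeds INT_MAX. The host-side sketch below is not part of the commit and uses an assumed grid size of the right magnitude, since the comment does not record it.

#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
  // Values cited in the cuda_helper.h comment; grid_dim is assumed to be
  // blockIdx.x + 1, the smallest grid that can contain that block index.
  const int64_t block_idx = 2172938, block_dim = 512, grid_dim = 2172939;

  const int64_t first = block_idx * block_dim;          // about 1.11e9, fits in int
  const int64_t second = first + block_dim * grid_dim;  // about 2.23e9, exceeds INT_MAX

  // Storing the second index in a 32-bit int wraps it to a negative value
  // on typical targets, which still satisfies i < (n) in the old macro.
  std::printf("INT_MAX          = %d\n", INT_MAX);
  std::printf("second (int64_t) = %lld\n", static_cast<long long>(second));
  std::printf("second (int32)   = %d\n", static_cast<int>(second));
  return 0;
}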
......@@ -25,13 +25,14 @@
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/cuda_helper.h"
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
using paddle::platform::float16;
template <typename T>
__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
CUDA_KERNEL_LOOP(i, num) {
paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
}
}
......@@ -191,10 +192,7 @@ __forceinline__ __device__ T BlockReduce(T val) {
template <typename T>
__global__ void DeviceReduceSum(T* in, T* out, size_t N) {
T sum(0);
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
sum += in[i];
}
CUDA_KERNEL_LOOP(i, N) { sum += in[i]; }
sum = BlockReduce<T>(sum);
__syncthreads();
if (threadIdx.x == 0) out[blockIdx.x] = sum;
......
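Taken together, the operator files above converge on one pattern: write the kernel body with the shared macro and launch it with a GET_BLOCKS-style helper. A minimal illustrative kernel in that style follows; the names ScaleKernel, GetBlocks, and LaunchScale are invented for this sketch and are not taken from the commit.

#include "paddle/fluid/platform/cuda_helper.h"  // defines CUDA_KERNEL_LOOP

constexpr int kCudaNumThreads = 1024;

// Same rounding-up block count as the GET_BLOCKS helpers in the diff.
inline int GetBlocks(const int64_t n) {
  return static_cast<int>((n + kCudaNumThreads - 1) / kCudaNumThreads);
}

// Element-wise scale; the grid-stride index is kept overflow-safe by the
// shared macro instead of a hand-written for condition.
template <typename T>
__global__ void ScaleKernel(T* data, int64_t n, T factor) {
  CUDA_KERNEL_LOOP(i, n) { data[i] *= factor; }
}

template <typename T>
void LaunchScale(cudaStream_t stream, T* data, int64_t n, T factor) {
  ScaleKernel<T><<<GetBlocks(n), kCudaNumThreads, 0, stream>>>(data, n, factor);
}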