未验证 提交 0c82e3a0 编写于 作者: Y Yang 提交者: GitHub

Fix 32-bit integer overflow in some CUDA random ops (#37670)

上级 b0dff05d
...@@ -61,16 +61,16 @@ class BernoulliOpKernel<platform::CUDADeviceContext, T> ...@@ -61,16 +61,16 @@ class BernoulliOpKernel<platform::CUDADeviceContext, T>
BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
auto seed_offset = gen_cuda->IncrementOffset(1); auto seed_offset = gen_cuda->IncrementOffset(1);
int gen_offset = size * seed_offset.second; int64_t gen_offset = size * seed_offset.second;
platform::Transform<platform::CUDADeviceContext> trans; platform::Transform<platform::CUDADeviceContext> trans;
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<int64_t> index_sequence_begin(0);
auto* context = auto* context =
static_cast<const platform::CUDADeviceContext*>(&ctx.device_context()); static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
trans(*context, index_sequence_begin, index_sequence_begin + size, in_data, trans(*context, index_sequence_begin, index_sequence_begin + size, in_data,
out_data, out_data,
BernoulliCudaFunctor<T>(static_cast<unsigned int>(seed_offset.first), BernoulliCudaFunctor<T>(static_cast<int64_t>(seed_offset.first),
static_cast<unsigned int>(gen_offset))); static_cast<int64_t>(gen_offset)));
} }
}; };
......
...@@ -59,7 +59,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -59,7 +59,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
} }
T mean = static_cast<T>(context.Attr<float>("mean")); T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std")); T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<int64_t> index_sequence_begin(0);
auto shape = GetShape(context); auto shape = GetShape(context);
tensor->Resize(shape); tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
...@@ -72,7 +72,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -72,7 +72,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
if (gen_cuda->GetIsInitPy() && seed_flag) { if (gen_cuda->GetIsInitPy() && seed_flag) {
auto seed_offset = gen_cuda->IncrementOffset(1); auto seed_offset = gen_cuda->IncrementOffset(1);
int gen_offset = size * seed_offset.second; int64_t gen_offset = size * seed_offset.second;
thrust::transform( thrust::transform(
index_sequence_begin, index_sequence_begin + size, index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data), thrust::device_ptr<T>(data),
...@@ -100,7 +100,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { ...@@ -100,7 +100,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
} }
T mean = static_cast<T>(context.Attr<float>("mean")); T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std")); T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<int64_t> index_sequence_begin(0);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
int device_id = int device_id =
...@@ -109,7 +109,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { ...@@ -109,7 +109,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
if (gen_cuda->GetIsInitPy() && seed_flag) { if (gen_cuda->GetIsInitPy() && seed_flag) {
auto seed_offset = gen_cuda->IncrementOffset(1); auto seed_offset = gen_cuda->IncrementOffset(1);
int gen_offset = size * seed_offset.second; int64_t gen_offset = size * seed_offset.second;
thrust::transform(index_sequence_begin, index_sequence_begin + size, thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data), thrust::device_ptr<T>(data),
GaussianGenerator<T>(mean, std, seed_offset.first, GaussianGenerator<T>(mean, std, seed_offset.first,
......
...@@ -129,7 +129,7 @@ struct GumbleNoiseGenerator<platform::CUDADeviceContext, T> { ...@@ -129,7 +129,7 @@ struct GumbleNoiseGenerator<platform::CUDADeviceContext, T> {
int64_t size = size_to_axis * size_from_axis; int64_t size = size_to_axis * size_from_axis;
T* random_data = T* random_data =
random_tensor.mutable_data<T>({size}, platform::CUDAPlace()); random_tensor.mutable_data<T>({size}, platform::CUDAPlace());
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<int64_t> index_sequence_begin(0);
// generate gumbel noise // generate gumbel noise
int device_id = int device_id =
...@@ -137,7 +137,7 @@ struct GumbleNoiseGenerator<platform::CUDADeviceContext, T> { ...@@ -137,7 +137,7 @@ struct GumbleNoiseGenerator<platform::CUDADeviceContext, T> {
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
if (gen_cuda->GetIsInitPy()) { if (gen_cuda->GetIsInitPy()) {
auto seed_offset = gen_cuda->IncrementOffset(1); auto seed_offset = gen_cuda->IncrementOffset(1);
int gen_offset = size * seed_offset.second; int64_t gen_offset = size * seed_offset.second;
thrust::transform( thrust::transform(
index_sequence_begin, index_sequence_begin + size, index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(random_data), thrust::device_ptr<T>(random_data),
......
...@@ -239,7 +239,7 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T> ...@@ -239,7 +239,7 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
auto* rng_data = rng_data_tensor.mutable_data<T>( auto* rng_data = rng_data_tensor.mutable_data<T>(
{num_distributions, num_samples}, ctx.GetPlace()); {num_distributions, num_samples}, ctx.GetPlace());
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<int64_t> index_sequence_begin(0);
platform::Transform<platform::CUDADeviceContext> trans; platform::Transform<platform::CUDADeviceContext> trans;
auto* context = auto* context =
static_cast<const platform::CUDADeviceContext*>(&ctx.device_context()); static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
......
...@@ -97,7 +97,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -97,7 +97,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
} }
T mean = static_cast<T>(context.Attr<float>("mean")); T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std")); T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<int64_t> index_sequence_begin(0);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
int device_id = int device_id =
...@@ -106,7 +106,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -106,7 +106,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
if (gen_cuda->GetIsInitPy() && seed_flag) { if (gen_cuda->GetIsInitPy() && seed_flag) {
auto seed_offset = gen_cuda->IncrementOffset(1); auto seed_offset = gen_cuda->IncrementOffset(1);
int gen_offset = size * seed_offset.second; int64_t gen_offset = size * seed_offset.second;
thrust::transform( thrust::transform(
index_sequence_begin, index_sequence_begin + size, index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data), thrust::device_ptr<T>(data),
......
...@@ -118,14 +118,14 @@ class GPUUniformRandomInplaceKernel : public framework::OpKernel<T> { ...@@ -118,14 +118,14 @@ class GPUUniformRandomInplaceKernel : public framework::OpKernel<T> {
unsigned int diag_step = unsigned int diag_step =
static_cast<unsigned int>(ctx.Attr<int>("diag_step")); static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
T diag_val = static_cast<T>(ctx.Attr<float>("diag_val")); T diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<int64_t> index_sequence_begin(0);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
int device_id = int device_id =
BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
if (gen_cuda->GetIsInitPy() && seed_flag) { if (gen_cuda->GetIsInitPy() && seed_flag) {
auto seed_offset = gen_cuda->IncrementOffset(1); auto seed_offset = gen_cuda->IncrementOffset(1);
int gen_offset = size * seed_offset.second; int64_t gen_offset = size * seed_offset.second;
thrust::transform( thrust::transform(
index_sequence_begin, index_sequence_begin + size, index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data), thrust::device_ptr<T>(data),
......
...@@ -139,14 +139,14 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> { ...@@ -139,14 +139,14 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
unsigned int diag_step = unsigned int diag_step =
static_cast<unsigned int>(context.Attr<int>("diag_step")); static_cast<unsigned int>(context.Attr<int>("diag_step"));
T diag_val = static_cast<T>(context.Attr<float>("diag_val")); T diag_val = static_cast<T>(context.Attr<float>("diag_val"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<int64_t> index_sequence_begin(0);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
int device_id = int device_id =
BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
if (gen_cuda->GetIsInitPy() && seed_flag) { if (gen_cuda->GetIsInitPy() && seed_flag) {
auto seed_offset = gen_cuda->IncrementOffset(1); auto seed_offset = gen_cuda->IncrementOffset(1);
int gen_offset = size * seed_offset.second; int64_t gen_offset = size * seed_offset.second;
thrust::transform( thrust::transform(
index_sequence_begin, index_sequence_begin + size, index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data), thrust::device_ptr<T>(data),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册