fix overflow in some cuda ops (#37670)

0c82e3a0 · Yang · GitHub · b0dff05d · 0c82e3a0 · 0c82e3a0
7 changed file
--- a/paddle/fluid/operators/bernoulli_op.cu
+++ b/paddle/fluid/operators/bernoulli_op.cu
@@ -61,16 +61,16 @@ class BernoulliOpKernel<platform::CUDADeviceContext, T>
        BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
    auto seed_offset = gen_cuda->IncrementOffset(1);
-    int gen_offset = size * seed_offset.second;
+    int64_t gen_offset = size * seed_offset.second;
    platform::Transform<platform::CUDADeviceContext> trans;
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    thrust::counting_iterator<int64_t> index_sequence_begin(0);
    auto* context =
        static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
    trans(*context, index_sequence_begin, index_sequence_begin + size, in_data,
          out_data,
-          BernoulliCudaFunctor<T>(static_cast<unsigned int>(seed_offset.first),
+          BernoulliCudaFunctor<T>(static_cast<int64_t>(seed_offset.first),
-                                  static_cast<unsigned int>(gen_offset)));
+                                  static_cast<int64_t>(gen_offset)));
  }
 };

--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -59,7 +59,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
    }
    T mean = static_cast<T>(context.Attr<float>("mean"));
    T std = static_cast<T>(context.Attr<float>("std"));
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    thrust::counting_iterator<int64_t> index_sequence_begin(0);
    auto shape = GetShape(context);
    tensor->Resize(shape);
    T* data = tensor->mutable_data<T>(context.GetPlace());
@@ -72,7 +72,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
    if (gen_cuda->GetIsInitPy() && seed_flag) {
      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int gen_offset = size * seed_offset.second;
+      int64_t gen_offset = size * seed_offset.second;
      thrust::transform(
          index_sequence_begin, index_sequence_begin + size,
          thrust::device_ptr<T>(data),
@@ -100,7 +100,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
    }
    T mean = static_cast<T>(context.Attr<float>("mean"));
    T std = static_cast<T>(context.Attr<float>("std"));
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    thrust::counting_iterator<int64_t> index_sequence_begin(0);
    int64_t size = tensor->numel();
    int device_id =
@@ -109,7 +109,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
    if (gen_cuda->GetIsInitPy() && seed_flag) {
      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int gen_offset = size * seed_offset.second;
+      int64_t gen_offset = size * seed_offset.second;
      thrust::transform(index_sequence_begin, index_sequence_begin + size,
                        thrust::device_ptr<T>(data),
                        GaussianGenerator<T>(mean, std, seed_offset.first,

--- a/paddle/fluid/operators/gumbel_softmax_op.cu
+++ b/paddle/fluid/operators/gumbel_softmax_op.cu
@@ -129,7 +129,7 @@ struct GumbleNoiseGenerator<platform::CUDADeviceContext, T> {
    int64_t size = size_to_axis * size_from_axis;
    T* random_data =
        random_tensor.mutable_data<T>({size}, platform::CUDAPlace());
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    thrust::counting_iterator<int64_t> index_sequence_begin(0);
    // generate gumbel noise
    int device_id =
@@ -137,7 +137,7 @@ struct GumbleNoiseGenerator<platform::CUDADeviceContext, T> {
    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
    if (gen_cuda->GetIsInitPy()) {
      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int gen_offset = size * seed_offset.second;
+      int64_t gen_offset = size * seed_offset.second;
      thrust::transform(
          index_sequence_begin, index_sequence_begin + size,
          thrust::device_ptr<T>(random_data),

--- a/paddle/fluid/operators/multinomial_op.cu
+++ b/paddle/fluid/operators/multinomial_op.cu
@@ -239,7 +239,7 @@ class MultinomialOpKernel<platform::CUDADeviceContext, T>
    auto* rng_data = rng_data_tensor.mutable_data<T>(
        {num_distributions, num_samples}, ctx.GetPlace());
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    thrust::counting_iterator<int64_t> index_sequence_begin(0);
    platform::Transform<platform::CUDADeviceContext> trans;
    auto* context =
        static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());

--- a/paddle/fluid/operators/truncated_gaussian_random_op.cu
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu
@@ -97,7 +97,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
    }
    T mean = static_cast<T>(context.Attr<float>("mean"));
    T std = static_cast<T>(context.Attr<float>("std"));
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    thrust::counting_iterator<int64_t> index_sequence_begin(0);
    int64_t size = tensor->numel();
    int device_id =
@@ -106,7 +106,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
    if (gen_cuda->GetIsInitPy() && seed_flag) {
      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int gen_offset = size * seed_offset.second;
+      int64_t gen_offset = size * seed_offset.second;
      thrust::transform(
          index_sequence_begin, index_sequence_begin + size,
          thrust::device_ptr<T>(data),

--- a/paddle/fluid/operators/uniform_random_inplace_op.cu
+++ b/paddle/fluid/operators/uniform_random_inplace_op.cu
@@ -118,14 +118,14 @@ class GPUUniformRandomInplaceKernel : public framework::OpKernel<T> {
    unsigned int diag_step =
        static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
    T diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    thrust::counting_iterator<int64_t> index_sequence_begin(0);
    int64_t size = tensor->numel();
    int device_id =
        BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
    if (gen_cuda->GetIsInitPy() && seed_flag) {
      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int gen_offset = size * seed_offset.second;
+      int64_t gen_offset = size * seed_offset.second;
      thrust::transform(
          index_sequence_begin, index_sequence_begin + size,
          thrust::device_ptr<T>(data),

--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -139,14 +139,14 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
    unsigned int diag_step =
        static_cast<unsigned int>(context.Attr<int>("diag_step"));
    T diag_val = static_cast<T>(context.Attr<float>("diag_val"));
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    thrust::counting_iterator<int64_t> index_sequence_begin(0);
    int64_t size = tensor->numel();
    int device_id =
        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId();
    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
    if (gen_cuda->GetIsInitPy() && seed_flag) {
      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int gen_offset = size * seed_offset.second;
+      int64_t gen_offset = size * seed_offset.second;
      thrust::transform(
          index_sequence_begin, index_sequence_begin + size,
          thrust::device_ptr<T>(data),