improve forward performance (#38279)

acef85b2 · Zhang Ting · GitHub · e6c3f64f · acef85b2
隐藏空白更改
内联并排

Showing 1 changed file with 19 additions and 20 deletions

paddle/fluid/operators/dropout_impl.cu.h paddle/fluid/operators/dropout_impl.cu.h +19 -20

未找到文件。
--- a/paddle/fluid/operators/dropout_impl.cu.h
+++ b/paddle/fluid/operators/dropout_impl.cu.h
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/platform/aligned_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h"
 namespace paddle {
 namespace operators {
@@ -180,9 +181,6 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
      return;
    }
-    platform::GpuLaunchConfig config =
-        platform::GetGpuLaunchConfig1D(dev_ctx, size);
    // increment is used to set the args(offset) of curand_init, which defines
    // offset in subsequence.
    // The detail:
@@ -192,11 +190,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
    // same as the previous calls.
    uint64_t seed_data;
    uint64_t increment;
-    int vec_size = platform::GetVectorizedSize<T>(x_data);
+    // VectorizedRandomGenerator uses curand_uniform4, so the only vectorized
-    auto offset = ((x_numel - 1) / (config.block_per_grid.x *
+    // width supported is vec_size == 4; any other width falls back to 1.
-                                    config.thread_per_block.x * vec_size) +
+    int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
-                   1) *
+    int block_size = pten::GetThreadsConfig(dev_ctx, x_numel, vec_size);
-                  vec_size;
+    int grid_size =
+        ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size;
+    auto offset =
+        ((x_numel - 1) / (grid_size * block_size * vec_size) + 1) * vec_size;
    GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset,
                            &seed_data, &increment);
@@ -204,26 +206,23 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
 #ifdef __HIPCC__
    if (vec_size == 4 && size % 4 == 0) {
      hipLaunchKernelGGL(
-          HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>),
+          HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>), grid_size,
-          config.block_per_grid, config.thread_per_block, 0, stream, size,
+          block_size, 0, stream, size, seed_data, dropout_prob, x_data,
-          seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train,
+          mask_data, y_data, upscale_in_train, increment);
-          increment);
    } else {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
-                         config.block_per_grid, config.thread_per_block, 0,
+                         grid_size, block_size, 0, stream, size, seed_data,
-                         stream, size, seed_data, dropout_prob, x_data,
+                         dropout_prob, x_data, mask_data, y_data,
-                         mask_data, y_data, upscale_in_train, increment);
+                         upscale_in_train, increment);
    }
 #else
    if (vec_size == 4 && size % 4 == 0) {
-      VectorizedRandomGenerator<
+      VectorizedRandomGenerator<T, uint8_t,
-          T, uint8_t,
+                                4><<<grid_size, block_size, 0, stream>>>(
-          4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
          size, seed_data, dropout_prob, x_data, mask_data, y_data,
          upscale_in_train, increment);
    } else {
-      RandomGenerator<T, uint8_t><<<config.block_per_grid,
+      RandomGenerator<T, uint8_t><<<grid_size, block_size, 0, stream>>>(
-                                    config.thread_per_block, 0, stream>>>(
          size, seed_data, dropout_prob, x_data, mask_data, y_data,
          upscale_in_train, increment);
    }