Unverified commit f5166284, authored by limingshu, committed by GitHub

Adjust wrapper of gpu_launch_config (#38654)

* first commit

* fix wrong filename

* fix the wrong spelling

* fix gpu config wrapper

* modify according to PR review comments

* fix GpuLaunchConfig1D API bugs

* change the config for dropout grad

* fix bugs

* modifications according to PR review comments

* modifications according to PR review comments
Parent 0d8d1e0e
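For orientation, the signature change of the 1D wrapper can be sketched as follows; both prototypes are reconstructed from the gpu_launch_config.h hunk further down, so treat this as a summary rather than the verbatim header.

    // Before: the caller passed an upper bound on threads per block.
    // GpuLaunchConfig GetGpuLaunchConfig1D(
    //     const platform::CUDADeviceContext& context, int64_t element_count,
    //     int max_threads = 1024);  // default is 256 under HIP

    // After: the caller passes the vectorization width instead; the block size
    // is derived from PREDEFINED_BLOCK_SIZE, the SM count and the element count.
    // GpuLaunchConfig GetGpuLaunchConfig1D(
    //     const platform::CUDADeviceContext& context, int64_t numel,
    //     int vec_size = 1);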
......@@ -472,8 +472,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
grid_sizes.gw = gw;
grid_sizes.input_chans = input_chans;
platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(
ctx.cuda_device_context(), grid_count, 512);
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count);
BilateralSliceCudaGridGradKernel<
T><<<config.block_per_grid, config.thread_per_block, 0,
......@@ -481,8 +481,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes,
has_offset, grid_count, output_chans);
config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(),
guide_count, 512);
config =
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), guide_count);
BilateralSliceCudaGuideGradKernel<
T><<<config.block_per_grid, config.thread_per_block, 0,
......@@ -490,8 +490,8 @@ class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
guide_grad_data, output_grad_data, grid_data, guide_data, input_data,
grid_sizes, has_offset, guide_count, output_chans);
config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(),
input_count, 512);
config =
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_count);
BilateralSliceCudaInputGradKernel<
T><<<config.block_per_grid, config.thread_per_block, 0,
......
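A hedged before/after sketch of one of the bilateral_slice call sites above; the kernel name and `grid_count` come from the diff, while `stream` stands in for the context's CUDA stream and the kernel arguments are elided.

    // Before: the third argument pinned the block size to at most 512 threads.
    // platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(
    //     ctx.cuda_device_context(), grid_count, 512);

    // After: the wrapper chooses the block size itself; the optional third
    // argument now means vectorization width (default 1), so it is dropped.
    platform::GpuLaunchConfig config =
        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), grid_count);
    BilateralSliceCudaGridGradKernel<T>
        <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
            /* ...unchanged kernel arguments... */);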
......@@ -193,12 +193,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
// VectorizedRandomGenerator uses curand_uniform4, so we only support
// vec_size == 4.
int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
int block_size = pten::funcs::GetThreadsConfig(dev_ctx, x_numel, vec_size);
int grid_size =
((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size;
auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
auto offset =
((x_numel - 1) / (grid_size * block_size * vec_size) + 1) * vec_size;
((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset,
&seed_data, &increment);
......@@ -206,23 +203,25 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
#ifdef __HIPCC__
if (vec_size == 4 && size % 4 == 0) {
hipLaunchKernelGGL(
HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>), grid_size,
block_size, 0, stream, size, seed_data, dropout_prob, x_data,
mask_data, y_data, upscale_in_train, increment);
HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>),
gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size,
seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train,
increment);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
grid_size, block_size, 0, stream, size, seed_data,
dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0,
stream, size, seed_data, dropout_prob, x_data,
mask_data, y_data, upscale_in_train, increment);
}
#else
if (vec_size == 4 && size % 4 == 0) {
VectorizedRandomGenerator<T, uint8_t,
4><<<grid_size, block_size, 0, stream>>>(
VectorizedRandomGenerator<T, uint8_t, 4><<<
gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
size, seed_data, dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
} else {
RandomGenerator<T, uint8_t><<<grid_size, block_size, 0, stream>>>(
RandomGenerator<T, uint8_t><<<gpu_config.block_per_grid,
gpu_config.thread_per_block, 0, stream>>>(
size, seed_data, dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
}
......@@ -265,7 +264,7 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
auto factor = static_cast<T>(1.0f / (1.0f - dropout_prob));
auto stream = dev_ctx.stream();
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, size);
platform::GetGpuLaunchConfig1D(dev_ctx, size, vec_size);
DropoutGradCUDAKernel<
T, uint8_t,
4><<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
......
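To make the new offset expression in DropoutFwGPUKernelDriver concrete, here is a small worked example; the launch shape (256 blocks of 512 threads) and element count are assumed values, not numbers from the patch.

    // Assumed launch: 256 blocks x 512 threads -> gpu_config.GetThreadNum() = 131072.
    int64_t x_numel        = 1000000;    // elements needing a dropout mask
    int     vec_size       = 4;          // curand_uniform4 path
    int64_t launch_threads = 512 * 256;  // gpu_config.GetThreadNum()
    // Each thread runs ceil(x_numel / (launch_threads * vec_size)) vectorized
    // iterations and draws vec_size random numbers per iteration, so the Philox
    // offset must advance by that iteration count times vec_size per thread:
    int64_t offset = ((x_numel - 1) / (launch_threads * vec_size) + 1) * vec_size;
    // -> (999999 / 524288 + 1) * 4 = (1 + 1) * 4 = 8 random numbers per thread.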
......@@ -128,10 +128,10 @@ elementwise_add_grad(const framework::ExecutionContext& ctx,
} else if (dx_data != dout_data && dy_data != dout_data) {
auto size = x->numel();
int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3(((size + vec_size - 1) / vec_size + ELEMENTWISE_BLOCK_SIZE - 1) /
ELEMENTWISE_BLOCK_SIZE,
dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
PREDEFINED_BLOCK_SIZE,
1);
SimpleElemwiseAddGradCUDAKernel<
T><<<grid_size, block_size, 0,
......
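For reference, the launch arithmetic of elementwise_add_grad above, with assumed numbers (T = float on a CUDA build, where PREDEFINED_BLOCK_SIZE is 512; it is 256 under HIP):

    int     vec_size = sizeof(float4) / sizeof(float);  // = 4
    int64_t size     = 10000;                           // x->numel(), assumed
    int     threads  = 512;                             // PREDEFINED_BLOCK_SIZE (CUDA)
    int64_t blocks   = ((size + vec_size - 1) / vec_size + threads - 1) / threads;
    // ceil(10000 / 4) = 2500 vectorized work items, ceil(2500 / 512) = 5 blocks,
    // so the kernel runs 5 x 512 threads, one float4-wide chunk per thread.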
......@@ -73,10 +73,10 @@ default_elementwise_sub_grad(const framework::ExecutionContext& ctx,
auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout->dims()) {
if (dy_data != dout_data) {
dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
auto size = dy->numel();
dim3 grid_size = dim3(
(size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
dim3 grid_size =
dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
SimpleElemwiseSubGradCUDAKernel<T><<<
grid_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
......@@ -100,10 +100,10 @@ elementwise_sub_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
auto size = x->numel();
dim3 grid_size =
dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1);
SimpleElemwiseSubGradCUDAKernel<
T><<<grid_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
......
......@@ -23,6 +23,7 @@ namespace cub = hipcub;
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace paddle {
namespace operators {
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/index_sample_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace paddle {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
namespace paddle {
namespace operators {
......
......@@ -16,17 +16,10 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/math/pooling.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/fast_divmod.h"
#ifdef __HIPCC__
#define POOLING_BLOCK_SIZE 256
#else
#define POOLING_BLOCK_SIZE 512
#endif
namespace paddle {
namespace operators {
namespace math {
......@@ -97,22 +90,6 @@ __device__ void OffsetPreparationFor4Dimension(
}
}
int GetThreadsPerBlock(const platform::CUDADeviceContext& ctx,
int threads_per_block, int64_t numel) {
int sm_count = ctx.GetSMCount();
if (numel / (sm_count << 1) < threads_per_block) {
// Round up threads number into an exponential multiple of 2, while number
// of acitve blocks is about twice of SM, to acquire better performance.
threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 1));
} else if (numel / (sm_count << 2) < threads_per_block) {
// Round up threads number into an exponential multiple of 2, while number
// of acitve blocks is about 4 times of SM, to acquire better performance.
threads_per_block = platform::RoundToPowerOfTwo(numel / (sm_count << 2));
}
// Number of threads per block shall be larger than 64.
return std::max(64, threads_per_block);
}
template <typename PoolProcess, typename T>
__global__ void KernelPool2D(
const int nthreads, const T* input_data, const int channels,
......@@ -491,14 +468,13 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
int grids = (nthreads + blocks - 1) / blocks;
auto pool_divmods = FastDivModForPoolingWithMoreStaff(
input_channels, input_width, input_height, ksize_width, ksize_height,
stride_width, stride_height);
KernelPool2DGrad<T, PoolProcess><<<grids, blocks, 0, context.stream()>>>(
auto config = GetGpuLaunchConfig1D(context, nthreads);
KernelPool2DGrad<T, PoolProcess><<<
config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
nthreads, input_data, output_data, output_grad_data, output_width,
output_height, input_width, input_height, ksize_width, ksize_height,
stride_width, stride_height, padding_width, padding_height,
......@@ -541,14 +517,13 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
int grids = (nthreads + blocks - 1) / blocks;
auto pool_divmods = FastDivModForPoolingWithMoreStaff(
input_channels, input_width, input_height, ksize_width, ksize_height,
stride_width, stride_height);
KernelPool2DGrad<T, PoolProcess><<<grids, blocks, 0, context.stream()>>>(
auto config = GetGpuLaunchConfig1D(context, nthreads);
KernelPool2DGrad<T, PoolProcess><<<
config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
nthreads, input_data, output_data, output_grad_data, output_width,
output_height, input_width, input_height, ksize_width, ksize_height,
stride_width, stride_height, padding_width, padding_height,
......
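The pooling change removes the file-local heuristic rather than the heuristic itself; roughly, the old names map onto the new config as sketched below (kernel arguments elided).

    // Old: int blocks = GetThreadsPerBlock(context, POOLING_BLOCK_SIZE, nthreads);
    //      int grids  = (nthreads + blocks - 1) / blocks;
    // New: the same SM-count-based rounding now happens inside the wrapper;
    //      `blocks` corresponds to config.thread_per_block and `grids` to
    //      config.block_per_grid.
    auto config = GetGpuLaunchConfig1D(context, nthreads);
    KernelPool2DGrad<T, PoolProcess><<<
        config.block_per_grid, config.thread_per_block, 0, context.stream()>>>(
        /* ...same arguments as before... */);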
......@@ -26,22 +26,6 @@ namespace platform {
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
return 1024;
} else if (dim > 256) {
return 512;
} else if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \
case (dim): { \
constexpr auto kPowerOfTwoDim = (dim); \
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Used for compute gpu launch parameter
// Used to compute the GPU launch parameter config
#pragma once
......@@ -30,11 +30,36 @@
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#ifdef __HIPCC__
// HIP results in error or nan if > 256
#define PREDEFINED_BLOCK_SIZE 256
#else
/* CUDA performs better when the number of
threads per block is in [64, 512]. */
#define PREDEFINED_BLOCK_SIZE 512
#endif
namespace paddle {
namespace platform {
inline int DivUp(int a, int b) { return (a + b - 1) / b; }
/* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
for rounding an integer value up to the next highest power of 2. */
static inline int RoundToPowerOfTwo(int n) {
n--;
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
#ifdef __HIPCC__
return std::min(256, std::max(32, (n + 1)));
#else
return std::min(1024, std::max(32, (n + 1)));
#endif
}
#ifdef WITH_NV_JETSON
// The number of threads cannot be set to 1024 in some cases when the device
// is Jetson Nano or TX2.
......@@ -48,54 +73,64 @@ inline void ChangeThreadNum(const platform::CUDADeviceContext& context,
#endif
struct GpuLaunchConfig {
dim3 theory_thread_count = dim3(1, 1, 1);
public:
GpuLaunchConfig() {}
size_t GetThreadNum() const { return GetBlockSize() * GetGridSize(); }
size_t GetGridSize() const {
return block_per_grid.x * block_per_grid.y * block_per_grid.z;
}
size_t GetBlockSize() const {
return thread_per_block.x * thread_per_block.y * thread_per_block.z;
}
int compute_capability = 0;
dim3 thread_per_block = dim3(1, 1, 1);
dim3 block_per_grid = dim3(1, 1, 1);
int compute_capability = 0;
};
/* According to NVIDIA, CUDA performs better when the number of threads per
* block is 64/128/256/512, and the number of blocks should be greater (at
* least 2x~4x) than the number of SMs. Hence, the SM count is taken into
* account within this function to determine the right number of threads per
* block. */
inline GpuLaunchConfig GetGpuLaunchConfig1D(
const platform::CUDADeviceContext& context, int64_t element_count,
#ifdef PADDLE_WITH_HIP
// HIP will throw GPU memory access fault if threads > 256
int max_threads = 256) {
#else
int max_threads = 1024) {
#endif
PADDLE_ENFORCE_GT(element_count, 0,
platform::errors::InvalidArgument(
"element count should be greater than 0,"
const platform::CUDADeviceContext& context, int64_t numel,
int vec_size = 1) {
PADDLE_ENFORCE_GT(numel, 0, platform::errors::InvalidArgument(
"element quantity should be greater than 0,"
" but received value is: %d.",
element_count));
const int theory_thread_count = element_count;
// Get Max threads in all SM
int max_physical_threads = context.GetMaxPhysicalThreadCount();
int sm = context.GetSMCount();
// Compute physical threads we need, should small than max sm threads
const int physical_thread_count =
(std::min)(max_physical_threads, theory_thread_count);
numel));
// Get compute_capability
const int capability = context.GetComputeCapability();
/* If the thread number per block is 64/128/256/512, CUDA performs better. */
int limit_threads =
std::min(PREDEFINED_BLOCK_SIZE, context.GetMaxThreadsPerBlock());
#ifdef WITH_NV_JETSON
if (capability == 53 || capability == 62) {
max_threads = 512;
limit_threads = 512;
}
#endif
// Need get from device
const int thread_per_block =
(std::min)(max_threads, context.GetMaxThreadsPerBlock());
const int block_count =
(std::min)(DivUp(physical_thread_count, thread_per_block), sm);
int threads = limit_threads;
int sm_count = context.GetSMCount();
int active_threads_num = numel / vec_size;
if (active_threads_num / (sm_count << 1) < limit_threads) {
// Round the thread count up to the next power of 2 while keeping the number
// of active blocks at about twice the SM count, for better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 1));
} else if (active_threads_num / (sm_count << 2) < limit_threads) {
// Round the thread count up to the next power of 2 while keeping the number
// of active blocks at about 4 times the SM count, for better performance.
threads = RoundToPowerOfTwo(active_threads_num / (sm_count << 2));
}
// The number of threads per block shall be no fewer than 64.
threads = std::max(64, threads);
int blocks = DivUp(DivUp(numel, vec_size), threads);
GpuLaunchConfig config;
config.theory_thread_count.x = theory_thread_count;
config.thread_per_block.x = thread_per_block;
config.block_per_grid.x = block_count;
config.thread_per_block.x = threads;
config.block_per_grid.x = blocks;
config.compute_capability = capability;
return config;
}
......@@ -120,7 +155,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(
GpuLaunchConfig config;
// Note: the block size is not aligned to 32; align it yourself if needed.
config.theory_thread_count = dim3(x_dim, y_dim, 1);
config.thread_per_block = dim3(block_cols, block_rows, 1);
int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
......
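To make the new heuristic in GetGpuLaunchConfig1D tangible, here is a hedged walk-through with made-up device numbers (CUDA build, 80 SMs, 1024 max threads per block); `MyKernel`, `dev_ctx` and `numel` are placeholders rather than names from the patch.

    // Large problem: numel = 1 << 20, vec_size = 4.
    //   limit_threads      = min(PREDEFINED_BLOCK_SIZE, 1024) = 512
    //   active_threads_num = numel / vec_size = 262144
    //   262144 / (80 << 1) = 1638 and 262144 / (80 << 2) = 819, both >= 512,
    //   so threads stays at 512 (the heuristic only shrinks the block when the
    //   problem is too small to keep ~2x-4x blocks per SM busy).
    //   blocks = DivUp(DivUp(1 << 20, 4), 512) = DivUp(262144, 512) = 512.
    //
    // Small problem: numel = 20000, vec_size = 1.
    //   20000 / (80 << 1) = 125 < 512 -> threads = RoundToPowerOfTwo(125) = 128,
    //   kept above the 64-thread floor; blocks = DivUp(20000, 128) = 157.
    auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, /*vec_size=*/4);
    MyKernel<<<config.block_per_grid, config.thread_per_block, 0,
               dev_ctx.stream()>>>(/* ... */);
    // GetBlockSize(), GetGridSize() and GetThreadNum() expose the flattened
    // block, grid and total thread counts of the chosen configuration.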
......@@ -24,19 +24,6 @@ namespace platform {
#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
inline static int RoundToPowerOfTwo(int dim) {
// HIP results in error or nan if > 256
if (dim > 128) {
return 256;
} else if (dim > 64) {
return 128;
} else if (dim > 32) {
return 64;
} else {
return 32;
}
}
#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \
case (dim): { \
constexpr auto kPowerOfTwoDim = (dim); \
......
......@@ -16,9 +16,9 @@ limitations under the License. */
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/funcs/cuda_kernel_config.h"
namespace pten {
......@@ -239,18 +239,15 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
VecSize><<<grid_size, block_size, 0, stream>>>(
ins_data, outs_data, numel, main_offset, func);
#else
int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize);
int grid_size =
((numel + VecSize - 1) / VecSize + block_size - 1) / block_size;
int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
auto gpu_config = GetGpuLaunchConfig1D(ctx, numel, VecSize);
int main_offset = (numel / (VecSize * gpu_config.GetBlockSize())) * VecSize *
gpu_config.GetBlockSize();
auto stream = ctx.stream();
VectorizedElementwiseKernel<InT,
OutT,
Functor,
Arity,
NumOuts,
VecSize><<<grid_size, block_size, 0, stream>>>(
ins_data, outs_data, numel, main_offset, func);
VectorizedElementwiseKernel<InT, OutT, Functor, Arity, NumOuts, VecSize><<<
gpu_config.block_per_grid,
gpu_config.thread_per_block,
0,
stream>>>(ins_data, outs_data, numel, main_offset, func);
#endif
}
......
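Finally, what main_offset controls in ElementwiseCudaKernel, with assumed numbers; the variable names mirror the diff, the concrete values are illustrative only.

    int64_t numel       = 10000;
    int     vec_size    = 4;                     // VecSize
    int     block_size  = 256;                   // gpu_config.GetBlockSize(), assumed
    int     stride      = vec_size * block_size; // 1024 elements per block per pass
    int64_t main_offset = (numel / stride) * stride;  // 9 * 1024 = 9216
    // Elements [0, main_offset) are handled with fully vectorized loads/stores;
    // the remaining 784 elements are left to the kernel's tail handling.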