// gpu_launch_param_config.h
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
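// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.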
// See the License for the specific language governing permissions and
// limitations under the License.

// Helpers for computing GPU kernel launch parameters.

#pragma once


#include <cuda_runtime.h>
#include <stddef.h>
#include <algorithm>
#include <string>
#include <vector>

namespace paddle {
namespace platform {

// Returns the ceiling of a / b for a >= 0 and b > 0.
//
// The quotient and the remainder are evaluated separately instead of using
// (a + b - 1) / b, because the intermediate sum a + b - 1 overflows `int`
// when `a` is close to INT_MAX.
// `b` must be non-zero; a zero divisor is not checked here.
inline int DivUp(int a, int b) {
  const int quotient = a / b;
  const int remainder = a % b;
  // Round up only when a positive remainder is left over.
  return remainder > 0 ? quotient + 1 : quotient;
}

// Launch parameters produced by the GetGpuLaunchConfig* helpers.
//
//   theory_thread_count - logical number of threads the problem asks for
//                         (one per element / per (x, y) position).
//   thread_per_block    - block dimensions to launch with.
//   block_per_grid      - grid dimensions to launch with.
//
// Every component defaults to 1 rather than 0: a launch dimension of 0 is
// not a valid execution configuration, so a field that only has some of its
// components assigned must still be usable as a kernel launch argument.
struct GpuLaunchParamConfig {
  dim3 theory_thread_count = dim3(1, 1, 1);
  dim3 thread_per_block = dim3(1, 1, 1);
  dim3 block_per_grid = dim3(1, 1, 1);
};

// Computes a one-dimensional launch configuration for `element_count`
// elements.
//
// Parameters:
//   context       - device context queried for the maximum number of
//                   physical threads, the SM count and the maximum number of
//                   threads per block.
//   element_count - number of elements to process; must be > 0.
//
// Returns a config whose fields are (x, 1, 1) with:
//   theory_thread_count.x = element_count
//   thread_per_block.x    = min(1024, context.GetMaxThreadsPerBlock())
//   block_per_grid.x      = blocks needed for
//                           min(element_count, max physical threads),
//                           capped at 4 * context.GetSMCount()
//
// Because of the caps, block_per_grid.x * thread_per_block.x can be smaller
// than element_count.
// NOTE(review): kernels launched with this config presumably have to loop
// over the remaining elements themselves - confirm against the callers.
//
// Fails with InvalidArgument (through PADDLE_ENFORCE_GT) when
// element_count <= 0.
inline GpuLaunchParamConfig GetGpuLaunchConfig1D(
    const platform::CUDADeviceContext& context, int element_count) {
  PADDLE_ENFORCE_GT(element_count, 0, platform::errors::InvalidArgument(
                                          "element count should greater than 0,"
                                          " but received value is %d.",
                                          element_count));

  const int theory_thread_count = element_count;
  // Maximum number of threads that can be resident on all SMs.
  const int max_physical_threads = context.GetMaxPhysicalThreadCount();
  const int sm = context.GetSMCount();

  // Physical threads actually needed; never more than the device can hold.
  const int physical_thread_count =
      std::min(max_physical_threads, theory_thread_count);

  // Block size is limited by the device and never exceeds 1024.
  const int thread_per_block = std::min(1024, context.GetMaxThreadsPerBlock());
  // Keep the block count below factor * sm; factor is an empirical value.
  const int factor = 4;
  const int block_count =
      std::min(DivUp(physical_thread_count, thread_per_block), factor * sm);

  GpuLaunchParamConfig config;
  // Build complete dim3 values (y = z = 1) so the fields can be handed to a
  // kernel launch directly, in line with GetGpuLaunchConfig2D.
  config.theory_thread_count = dim3(theory_thread_count, 1, 1);
  config.thread_per_block = dim3(thread_per_block, 1, 1);
  config.block_per_grid = dim3(block_count, 1, 1);
  return config;
}

// Computes a two-dimensional launch configuration for an xdim * ydim problem.
//
// Parameters:
//   context - device context queried for the maximum number of physical
//             threads.
//   xdim    - extent along x; must be > 0.
//   ydim    - extent along y; must be > 0.
//
// Returns a config with:
//   theory_thread_count = (xdim, ydim, 1)
//   thread_per_block    = (block_cols, block_rows, 1), where
//                         block_cols = min(xdim, 256) and
//                         block_rows = max(256 / block_cols, 1)
//   block_per_grid      = (grid_x, grid_y, 1), with grid_x * grid_y limited
//                         to max(max physical threads / 256, 1) blocks
//
// grid_y is derived from ydim / block_rows rounded down and the grid is
// capped, so the launched threads may not cover every (x, y) position.
// NOTE(review): kernels presumably stride over the remainder - confirm
// against the callers.
//
// Fails with InvalidArgument (through PADDLE_ENFORCE_GT) when xdim <= 0 or
// ydim <= 0.
inline GpuLaunchParamConfig GetGpuLaunchConfig2D(
    const platform::CUDADeviceContext& context, int xdim, int ydim) {
  PADDLE_ENFORCE_GT(xdim, 0, platform::errors::InvalidArgument(
                                 "x dim number should greater than 0,"
                                 " but received value is:%d",
                                 xdim));
  PADDLE_ENFORCE_GT(ydim, 0, platform::errors::InvalidArgument(
                                 "y dim number should greater than 0,"
                                 " but received value is:%d",
                                 ydim));

  const int kThreadsPerBlock = 256;
  // Fill the block along x first; leftover threads become rows along y.
  int block_cols = std::min(xdim, kThreadsPerBlock);
  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);

  int max_physical_threads = context.GetMaxPhysicalThreadCount();
  // Upper bound on the total number of blocks; at least one.
  const int max_blocks = std::max(max_physical_threads / kThreadsPerBlock, 1);

  GpuLaunchParamConfig config;
  // Note: the block size is not rounded to a multiple of 32; callers that
  // need that must adjust it themselves.
  config.theory_thread_count = dim3(xdim, ydim, 1);
  config.thread_per_block = dim3(block_cols, block_rows, 1);

  // grid_x <= max_blocks, so max_blocks / grid_x is always at least 1.
  int grid_x = std::min(DivUp(xdim, block_cols), max_blocks);
  int grid_y = std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1));

  config.block_per_grid = dim3(grid_x, grid_y, 1);
  return config;
}

// TODO: a 3D launch configuration helper will be added later.

}  // namespace platform
}  // namespace paddle
