// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Used to compute GPU launch parameters.

#pragma once

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#else
#include <hip/hip_runtime.h>
#endif

#include <stddef.h>
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace platform {

// Integer division rounding up, e.g. DivUp(10, 3) == 4.
inline int DivUp(int a, int b) { return (a + b - 1) / b; }

#ifdef WITH_NV_JETSON
// In some cases the thread count cannot be set to 1024 when the device is a
// Jetson Nano (compute capability 5.3) or TX2 (compute capability 6.2), so
// fall back to a smaller value.
inline void ChangeThreadNum(const platform::CUDADeviceContext& context,
                            int* num_thread, int alternative_num_thread = 512) {
  if (context.GetComputeCapability() == 53 ||
      context.GetComputeCapability() == 62) {
    *num_thread = alternative_num_thread;
  }
}
#endif
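
// A usage sketch for ChangeThreadNum (the variables `ctx` and `threads` are
// hypothetical): start from the usual 1024-thread maximum and let the helper
// lower it on Jetson parts.
//
//   int threads = 1024;
//   ChangeThreadNum(ctx, &threads);  // becomes 512 on Jetson Nano / TX2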

struct GpuLaunchConfig {
  // Logical thread count the problem asks for (typically one per element).
  dim3 theory_thread_count = dim3(1, 1, 1);
  // Threads per block to use for the launch.
  dim3 thread_per_block = dim3(1, 1, 1);
  // Blocks per grid to use for the launch.
  dim3 block_per_grid = dim3(1, 1, 1);
  // Device compute capability, encoded as major * 10 + minor (e.g. 53).
  int compute_capability = 0;
};

inline GpuLaunchConfig GetGpuLaunchConfig1D(
    const platform::CUDADeviceContext& context, int64_t element_count,
#ifdef PADDLE_WITH_HIP
    // HIP will throw a GPU memory access fault if threads > 256.
    int max_threads = 256) {
#else
    int max_threads = 1024) {
#endif
  PADDLE_ENFORCE_GT(element_count, 0,
                    platform::errors::InvalidArgument(
                        "element count should be greater than 0,"
                        " but received value is: %d.",
                        element_count));

  const int theory_thread_count = element_count;
  // Maximum number of threads that can be resident across all SMs.
  int max_physical_threads = context.GetMaxPhysicalThreadCount();
  int sm = context.GetSMCount();

  // Physical threads we actually need; must not exceed the device-wide
  // maximum.
  const int physical_thread_count =
      (std::min)(max_physical_threads, theory_thread_count);

  // Get compute_capability
  const int capability = context.GetComputeCapability();

#ifdef WITH_NV_JETSON
  if (capability == 53 || capability == 62) {
    max_threads = 512;
  }
#endif

  // Cap the block size at the per-block thread limit queried from the device.
  const int thread_per_block =
      (std::min)(max_threads, context.GetMaxThreadsPerBlock());
  const int block_count =
      (std::min)(DivUp(physical_thread_count, thread_per_block), sm);

  GpuLaunchConfig config;
  config.theory_thread_count.x = theory_thread_count;
  config.thread_per_block.x = thread_per_block;
  config.block_per_grid.x = block_count;
  config.compute_capability = capability;
  return config;
}
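
// A usage sketch (not part of this header): because block_per_grid is capped
// at the SM count above, the launched grid may hold fewer threads than there
// are elements, so the kernel uses a grid-stride loop to cover them all.
// `ScaleKernel`, `dev_ctx`, `data`, and `n` are hypothetical names.
//
//   __global__ void ScaleKernel(float* data, int64_t n) {
//     for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
//          i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
//       data[i] *= 2.0f;
//     }
//   }
//
//   auto config = GetGpuLaunchConfig1D(dev_ctx, n);
//   ScaleKernel<<<config.block_per_grid, config.thread_per_block, 0,
//                 dev_ctx.stream()>>>(data, n);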

inline GpuLaunchConfig GetGpuLaunchConfig2D(
    const platform::CUDADeviceContext& context, int x_dim, int y_dim) {
  PADDLE_ENFORCE_GT(x_dim, 0, platform::errors::InvalidArgument(
                                  "x dim number should be greater than 0,"
                                  " but received value is: %d",
                                  x_dim));
  PADDLE_ENFORCE_GT(y_dim, 0, platform::errors::InvalidArgument(
                                  "y dim number should be greater than 0,"
                                  " but received value is: %d",
                                  y_dim));

  const int kThreadsPerBlock = 256;
  int block_cols = (std::min)(x_dim, kThreadsPerBlock);
  int block_rows = (std::max)(kThreadsPerBlock / block_cols, 1);

  int max_physical_threads = context.GetMaxPhysicalThreadCount();
  const int max_blocks = (std::max)(max_physical_threads / kThreadsPerBlock, 1);

  GpuLaunchConfig config;
  // Note: the block size is not aligned to a multiple of 32 (the warp size);
  // align it yourself if needed.
  config.theory_thread_count = dim3(x_dim, y_dim, 1);
  config.thread_per_block = dim3(block_cols, block_rows, 1);

  int grid_x = (std::min)(DivUp(x_dim, block_cols), max_blocks);
  int grid_y =
      (std::min)(max_blocks / grid_x, (std::max)(y_dim / block_rows, 1));

  config.block_per_grid = dim3(grid_x, grid_y, 1);
  return config;
}
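
// A usage sketch (not part of this header): the grid computed above may not
// cover every (x, y) element, so the kernel strides over both dimensions.
// `AddBiasKernel`, `dev_ctx`, `out`, `bias`, `width`, and `height` are
// hypothetical names.
//
//   __global__ void AddBiasKernel(float* out, const float* bias, int width,
//                                 int height) {
//     for (int y = blockIdx.y * blockDim.y + threadIdx.y; y < height;
//          y += gridDim.y * blockDim.y) {
//       for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < width;
//            x += gridDim.x * blockDim.x) {
//         out[y * width + x] += bias[x];
//       }
//     }
//   }
//
//   auto config = GetGpuLaunchConfig2D(dev_ctx, width, height);
//   AddBiasKernel<<<config.block_per_grid, config.thread_per_block, 0,
//                   dev_ctx.stream()>>>(out, bias, width, height);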

// TODO(wangchaochaohu): a 3D config will be added later.

}  // namespace platform
}  // namespace paddle

#endif