/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/gpu_info.h"
#include <cstdlib>
#include <mutex>
#include <vector>

#include "gflags/gflags.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/miopen.h"
#else
#include "paddle/fluid/platform/cuda_graph.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"

DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb);

constexpr static float fraction_reserve_gpu_memory = 0.05f;

static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props;

USE_GPU_MEM_STAT;
namespace paddle {
namespace platform {

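// CudnnVersion() reports the DNN library version as a single integer. On the
// CUDA build it returns the raw value of cudnnGetVersion(); on the HIP build
// it packs the MIOpen version as major * 100 + minor * 10 + patch. It returns
// -1 when the library cannot be loaded.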
int CudnnVersion() {
  if (!dynload::HasCUDNN()) return -1;

#ifdef PADDLE_WITH_HIP
  size_t version_major, version_minor, version_patch;
  PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
      &version_major, &version_minor, &version_patch));
  return version_major * 100 + version_minor * 10 + version_patch;
#else
  return dynload::cudnnGetVersion();
#endif
}
static int GetCUDADeviceCountImpl() {
  int driverVersion = 0;
#ifdef PADDLE_WITH_HIP
  hipError_t status = hipDriverGetVersion(&driverVersion);
#else
  cudaError_t status = cudaDriverGetVersion(&driverVersion);
#endif

  if (!(status == gpuSuccess && driverVersion != 0)) {
    // No GPU driver
    VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!";
    return 0;
  }

#ifdef PADDLE_WITH_HIP
  const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES");
#else
  const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
#endif
  if (cuda_visible_devices != nullptr) {
    std::string cuda_visible_devices_str(cuda_visible_devices);
    if (!cuda_visible_devices_str.empty()) {
      cuda_visible_devices_str.erase(
          0, cuda_visible_devices_str.find_first_not_of('\''));
      cuda_visible_devices_str.erase(
          cuda_visible_devices_str.find_last_not_of('\'') + 1);
      cuda_visible_devices_str.erase(
          0, cuda_visible_devices_str.find_first_not_of('\"'));
      cuda_visible_devices_str.erase(
          cuda_visible_devices_str.find_last_not_of('\"') + 1);
    }
    if (std::all_of(cuda_visible_devices_str.begin(),
                    cuda_visible_devices_str.end(),
                    [](char ch) { return ch == ' '; })) {
      VLOG(2) << "CUDA_VISIBLE_DEVICES or HIP_VISIBLE_DEVICES is set to be "
                 "empty. No GPU detected.";
      return 0;
    }
  }
  int count;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDeviceCount(&count));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count));
#endif
  return count;
}

int GetCUDADeviceCount() {
  // cache the count
  static auto dev_cnt = GetCUDADeviceCountImpl();
  return dev_cnt;
}

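// Illustrative usage (hypothetical caller): since the device count is cached,
// it is cheap to loop over all visible devices, e.g.
//   for (int dev_id = 0; dev_id < GetCUDADeviceCount(); ++dev_id) {
//     VLOG(3) << "found device " << dev_id;
//   }
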
/* Here is a very simple CUDA "pro tip": cudaDeviceGetAttribute() is a much
faster way to query device properties than cudaGetDeviceProperties(). See
https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
*/
int GetCUDAComputeCapability(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int major, minor;

#ifdef PADDLE_WITH_HIP
  auto major_error_code = hipDeviceGetAttribute(
      &major, hipDeviceAttributeComputeCapabilityMajor, id);
  auto minor_error_code = hipDeviceGetAttribute(
      &minor, hipDeviceAttributeComputeCapabilityMinor, id);
#else
  auto major_error_code =
      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
  auto minor_error_code =
      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code);
  PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code);
#ifdef PADDLE_WITH_HIP
  return major * 100 + minor;
#else
  return major * 10 + minor;
#endif
}

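// Note on encodings: the CUDA branch above returns major * 10 + minor (e.g.
// 70 for compute capability 7.0), while the HIP branch returns major * 100 +
// minor. Callers comparing against CUDA-style thresholds (such as
// TensorCoreAvailable() below) must account for this asymmetry.
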
dim3 GetGpuMaxGridDimSize(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  dim3 ret;
  int size;
#ifdef PADDLE_WITH_HIP
  auto error_code_x =
      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id);
#else
  auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x);
  ret.x = size;

#ifdef PADDLE_WITH_HIP
  auto error_code_y =
      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id);
#else
  auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y);
  ret.y = size;

#ifdef PADDLE_WITH_HIP
  auto error_code_z =
      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id);
#else
  auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z);
  ret.z = size;
  return ret;
}

int GetCUDARuntimeVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int runtime_version = 0;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipRuntimeGetVersion(&runtime_version));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
#endif
  return runtime_version;
}

int GetCUDADriverVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int driver_version = 0;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipDriverGetVersion(&driver_version));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version));
#endif
  return driver_version;
}

bool TensorCoreAvailable() {
#if !defined(PADDLE_WITH_HIP) && CUDA_VERSION >= 9000
  int device = GetCurrentDeviceId();
  int compute_capability = GetCUDAComputeCapability(device);
  return compute_capability >= 70;
#else
  return false;
#endif
}

int GetCUDAMultiProcessors(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int count;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(
      hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
#endif
  return count;
}

int GetCUDAMaxThreadsPerMultiProcessor(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int count;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceGetAttribute(
      &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute(
      &count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
#endif
  return count;
}

int GetCUDAMaxThreadsPerBlock(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int count;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(
      hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
#endif
  return count;
}

int GetCurrentDeviceId() {
  int device_id;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDevice(&device_id));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id));
#endif
  return device_id;
}

//! Get the list of device ids from FLAGS_selected_gpus, or use all devices.
std::vector<int> GetSelectedDevices() {
  // Use the user-specified GPUs in single-node multi-process mode.
  std::vector<int> devices;
  if (!FLAGS_selected_gpus.empty()) {
    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
    for (auto id : devices_str) {
      devices.push_back(atoi(id.c_str()));
    }
  } else {
    int count = GetCUDADeviceCount();
    for (int i = 0; i < count; ++i) {
      devices.push_back(i);
    }
  }
  return devices;
}

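// For example, running with --selected_gpus=0,2 yields {0, 2}, while leaving
// the flag empty on a machine with four visible GPUs yields {0, 1, 2, 3}.
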
const gpuDeviceProp &GetDeviceProperties(int id) {
  std::call_once(g_device_props_size_init_flag, [&] {
    int gpu_num = 0;
    gpu_num = platform::GetCUDADeviceCount();
    g_device_props_init_flags.resize(gpu_num);
    g_device_props.resize(gpu_num);
    for (int i = 0; i < gpu_num; ++i) {
      g_device_props_init_flags[i] = std::make_unique<std::once_flag>();
    }
  });

  if (id == -1) {
    id = platform::GetCurrentDeviceId();
  }

  if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
    PADDLE_THROW(platform::errors::OutOfRange(
        "The device id %d is out of range [0, %d), where %d is the number of "
        "devices on this machine. Because the device id should be greater than "
        "or equal to zero and smaller than the number of gpus. Please input "
        "appropriate device again!",
        id, static_cast<int>(g_device_props.size()),
        static_cast<int>(g_device_props.size())));
  }

  std::call_once(*(g_device_props_init_flags[id]), [&] {
#ifdef PADDLE_WITH_CUDA
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaGetDeviceProperties(&g_device_props[id], id));
#else
    PADDLE_ENFORCE_CUDA_SUCCESS(
      hipGetDeviceProperties(&g_device_props[id], id));
#endif
  });

  return g_device_props[id];
}

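// Note: GetDeviceProperties(-1) queries the current device. Each device's
// properties are fetched once and then served from the g_device_props cache,
// guarded by its per-device std::once_flag.
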
void SetDeviceId(int id) {
  // TODO(qijun): find a better way to cache the cuda device count
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
#ifdef PADDLE_WITH_HIP
  PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id));
#else
  PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
#endif
}

void GpuMemoryUsage(size_t *available, size_t *total) {
  size_t actual_available, actual_total;
  RecordedCudaMemGetInfo(available, total, &actual_available, &actual_total,
                         platform::GetCurrentDeviceId());
}

size_t GpuAvailableMemToAlloc() {
  size_t total = 0;
  size_t available = 0;
  GpuMemoryUsage(&available, &total);
  size_t reserving =
      static_cast<size_t>(fraction_reserve_gpu_memory * available);
  // If available size is less than minimum chunk size, no usable memory exists
  size_t available_to_alloc = available - reserving;
  size_t min_chunk_size = GpuMinChunkSize();
  if (available_to_alloc < min_chunk_size) {
    available_to_alloc = 0;
  }
  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
  return available_to_alloc;
}

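// Worked example: if 10000 MiB is reported as available and
// fraction_reserve_gpu_memory is 0.05, about 500 MiB is reserved and roughly
// 9500 MiB is reported as available to allocate.
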
size_t GpuMaxAllocSize() {
  return std::max(GpuInitAllocSize(), GpuReallocSize());
}

static size_t GpuAllocSize(bool realloc) {
  size_t available_to_alloc = GpuAvailableMemToAlloc();
  PADDLE_ENFORCE_GT(
      available_to_alloc, 0,
      platform::errors::ResourceExhausted("Not enough available GPU memory."));
  // If the flag (FLAGS_initial_gpu_memory_in_mb or
  // FLAGS_reallocate_gpu_memory_in_mb) is 0, the allocation size is computed
  // from FLAGS_fraction_of_gpu_memory_to_use instead.
  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                           : FLAGS_initial_gpu_memory_in_mb;
  size_t alloc_bytes =
      (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
                                           FLAGS_fraction_of_gpu_memory_to_use);
  PADDLE_ENFORCE_GE(
      available_to_alloc, alloc_bytes,
      platform::errors::ResourceExhausted("Not enough available GPU memory."));
  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
           << " MiB, is it Re-alloc: " << realloc;
  return alloc_bytes;
}

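// Worked example (illustrative flag values): with 8000 MiB available to
// allocate, FLAGS_initial_gpu_memory_in_mb = 0 and
// FLAGS_fraction_of_gpu_memory_to_use = 0.5, the initial chunk is 4000 MiB;
// setting FLAGS_initial_gpu_memory_in_mb = 500 bypasses the fraction and
// allocates 500 MiB instead.
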
size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }

size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }

size_t GpuMinChunkSize() {
  // The minimum chunk size that can be allocated is 256 bytes.
  return 1 << 8;
}

size_t GpuMaxChunkSize() {
  size_t max_chunk_size = GpuMaxAllocSize();
  VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
  return max_chunk_size;
}

#ifdef PADDLE_WITH_HIP
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum hipMemcpyKind kind, hipStream_t stream) {
  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream));
}
#else
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind, cudaStream_t stream) {
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
}
#endif

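// Note: GpuMemcpyAsync only enqueues the copy on `stream`; callers must
// synchronize (e.g. with GpuStreamSync(stream) below) before reading the
// destination or reusing a pageable host buffer involved in the copy.
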
#ifdef PADDLE_WITH_HIP
void GpuMemcpySync(void *dst, const void *src, size_t count,
                   enum hipMemcpyKind kind) {
  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, count, kind));
}
#else
void GpuMemcpySync(void *dst, const void *src, size_t count,
                   enum cudaMemcpyKind kind) {
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind));
}
#endif

void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count, gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(
      hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
#endif
}

void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                       int src_device, size_t count) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(
      hipMemcpyPeer(dst, dst_device, src, src_device, count));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpyPeer(dst, dst_device, src, src_device, count));
#endif
}

void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemsetAsync(dst, value, count, stream));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream));
#endif
}

void GpuStreamSync(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
}

static void RaiseNonOutOfMemoryError(gpuError_t *status) {
#ifdef PADDLE_WITH_HIP
  if (*status == hipErrorOutOfMemory) {
    *status = hipSuccess;
  }
#else
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);

#ifdef PADDLE_WITH_HIP
  *status = hipGetLastError();
  if (*status == hipErrorOutOfMemory) {
    *status = hipSuccess;
  }
#else
  *status = cudaGetLastError();
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
}

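// RecordedCudaMallocHelper wraps per-device GPU malloc/free so that the total
// allocated size can be recorded and optionally capped by
// FLAGS_gpu_memory_limit_mb (a limit of 0 disables recording; see
// NeedRecord()).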
class RecordedCudaMallocHelper {
 private:
  explicit RecordedCudaMallocHelper(int dev_id, uint64_t limit_size = 0)
      : dev_id_(dev_id), limit_size_(limit_size) {
    if (NeedRecord()) {
      mtx_.reset(new std::mutex());
    }
  }

  DISABLE_COPY_AND_ASSIGN(RecordedCudaMallocHelper);

 public:
  static RecordedCudaMallocHelper *Instance(int dev_id) {
    std::call_once(once_flag_, [] {
      int dev_cnt = GetCUDADeviceCount();
      instances_.reserve(dev_cnt);
      for (int i = 0; i < dev_cnt; ++i) {
        instances_.emplace_back(
            new RecordedCudaMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
      }
    });

    PADDLE_ENFORCE_GE(
        dev_id, 0,
        platform::errors::OutOfRange(
            "Device id must be greater than or equal to 0, but got %d.",
            dev_id));
    PADDLE_ENFORCE_LT(
        dev_id, instances_.size(),
        platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.",
538 539 540 541 542 543 544 545 546
                                     dev_id, instances_.size()));
    return instances_[dev_id].get();
  }

  /**
   * Try to allocate `size` bytes of GPU memory. Only cudaErrorMemoryAllocation
   * or cudaSuccess is returned, and the cudaGetLastError() flag is cleared
   * before returning.
   */
  gpuError_t Malloc(void **ptr, size_t size) {
    LockGuardPtr<std::mutex> lock(mtx_);
    if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
#ifdef PADDLE_WITH_HIP
      return hipErrorOutOfMemory;
#else
      return cudaErrorMemoryAllocation;
#endif
    }

    CUDADeviceGuard guard(dev_id_);
#ifdef PADDLE_WITH_HIP
    auto result = hipMalloc(ptr, size);
#else
    CUDAGraphCaptureModeGuard capture_mode_guard;
    auto result = cudaMalloc(ptr, size);
#endif
    if (result == gpuSuccess) {
      cur_size_.fetch_add(size);
      STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
      return gpuSuccess;
    } else {
      RaiseNonOutOfMemoryError(&result);
// Any non-out-of-memory error would have been raised inside
// RaiseNonOutOfMemoryError, so we can safely return
// cudaErrorMemoryAllocation here.
#ifdef PADDLE_WITH_HIP
      return hipErrorOutOfMemory;
#else
      return cudaErrorMemoryAllocation;
#endif
    }
  }

  /**
   * Free GPU memory. Freeing is normally not allowed to raise an error;
   * if it does, the process should crash.
   */
  void Free(void *ptr, size_t size) {
    // Purposefully allow cudaErrorCudartUnloading, because
    // that is returned if you ever call cudaFree after the
    // driver has already shut down. This happens only if the
    // process is terminating, in which case we don't care if
    // cudaFree succeeds.
    CUDADeviceGuard guard(dev_id_);
#ifdef PADDLE_WITH_HIP
    auto err = hipFree(ptr);
    if (err != hipErrorDeinitialized) {
#else
    auto err = cudaFree(ptr);
    if (err != cudaErrorCudartUnloading) {
#endif
      PADDLE_ENFORCE_CUDA_SUCCESS(err);
      cur_size_.fetch_sub(size);
      STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
    } else {
#ifdef PADDLE_WITH_HIP
      hipGetLastError();  // clear the error flag when hipErrorDeinitialized
#else
      cudaGetLastError();  // clear the error flag when cudaErrorCudartUnloading
#endif
    }
  }

  bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
                  size_t *actual_total) {
    {
      CUDADeviceGuard guard(dev_id_);
#ifdef PADDLE_WITH_HIP
      auto result = hipMemGetInfo(actual_avail, actual_total);
#else
      auto result = cudaMemGetInfo(actual_avail, actual_total);
#endif
      if (result != gpuSuccess) {
        *actual_avail = 0;
      }
      RaiseNonOutOfMemoryError(&result);
    }

    if (NeedRecord()) {
      std::lock_guard<std::mutex> guard(*mtx_);
      *avail = std::min(*actual_avail, limit_size_ - cur_size_.load());
      *total = std::min(*actual_total, limit_size_);
      return *total < *actual_total;
    } else {
      *avail = *actual_avail;
      *total = *actual_total;
      return false;
    }
  }

  inline bool NeedRecord() const { return limit_size_ != 0; }

  uint64_t RecordedSize() const { return cur_size_.load(); }

  uint64_t LimitSize() const { return limit_size_; }

 private:
  const int dev_id_;
  const uint64_t limit_size_;
  std::atomic<uint64_t> cur_size_{0};

  mutable std::unique_ptr<std::mutex> mtx_;

  static std::once_flag once_flag_;
  static std::vector<std::unique_ptr<RecordedCudaMallocHelper>> instances_;
};  // NOLINT

std::once_flag RecordedCudaMallocHelper::once_flag_;
std::vector<std::unique_ptr<RecordedCudaMallocHelper>>
    RecordedCudaMallocHelper::instances_;

gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size);
}

void RecordedCudaFree(void *p, size_t size, int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->Free(p, size);
}

bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
                            size_t *actual_total, int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->GetMemInfo(
      avail, total, actual_avail, actual_total);
}

uint64_t RecordedCudaMallocSize(int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->RecordedSize();
}

bool IsCudaMallocRecorded(int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->NeedRecord();
}

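// Illustrative usage of the recorded allocator (hypothetical sizes):
//   void *ptr = nullptr;
//   if (RecordedCudaMalloc(&ptr, 1 << 20, /*dev_id=*/0) == gpuSuccess) {
//     // ... use the 1 MiB buffer on device 0 ...
//     RecordedCudaFree(ptr, 1 << 20, /*dev_id=*/0);
//   }
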
void EmptyCache(void) {
  std::vector<int> devices = GetSelectedDevices();
  for (auto device : devices) {
    memory::Release(CUDAPlace(device));
  }
}

}  // namespace platform
}  // namespace paddle