/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/gpu_info.h"
#include <algorithm>
#include <atomic>
#include <cstdlib>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "gflags/gflags.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/miopen.h"
#else
#include "paddle/fluid/platform/dynload/cudnn.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"

DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb);

constexpr static float fraction_reserve_gpu_memory = 0.05f;

static std::once_flag g_device_props_size_init_flag;
static std::vector<std::unique_ptr<std::once_flag>> g_device_props_init_flags;
static std::vector<paddle::gpuDeviceProp> g_device_props;

USE_GPU_MEM_STAT;
namespace paddle {
namespace platform {

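// Returns the cuDNN (MIOpen under HIP) version as a single integer, or -1
// when the library cannot be loaded.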
int CudnnVersion() {
  if (!dynload::HasCUDNN()) return -1;

#ifdef PADDLE_WITH_HIP
  size_t version_major, version_minor, version_patch;
  PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
      &version_major, &version_minor, &version_patch));
  return version_major * 100 + version_minor * 10 + version_patch;
#else
  return dynload::cudnnGetVersion();
#endif
}
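// Counts visible GPUs via the runtime API. Returns 0 when no GPU driver is
// present, or when CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES is set to an
// empty (or whitespace-only) string.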
static int GetCUDADeviceCountImpl() {
  int driverVersion = 0;
#ifdef PADDLE_WITH_HIP
  hipError_t status = hipDriverGetVersion(&driverVersion);
#else
  cudaError_t status = cudaDriverGetVersion(&driverVersion);
#endif

  if (!(status == gpuSuccess && driverVersion != 0)) {
    // No GPU driver
    VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!";
    return 0;
  }

#ifdef PADDLE_WITH_HIP
  const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES");
#else
  const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
#endif
  if (cuda_visible_devices != nullptr) {
    std::string cuda_visible_devices_str(cuda_visible_devices);
    if (!cuda_visible_devices_str.empty()) {
      cuda_visible_devices_str.erase(
          0, cuda_visible_devices_str.find_first_not_of('\''));
      cuda_visible_devices_str.erase(
          cuda_visible_devices_str.find_last_not_of('\'') + 1);
      cuda_visible_devices_str.erase(
          0, cuda_visible_devices_str.find_first_not_of('\"'));
      cuda_visible_devices_str.erase(
          cuda_visible_devices_str.find_last_not_of('\"') + 1);
    }
    if (std::all_of(cuda_visible_devices_str.begin(),
                    cuda_visible_devices_str.end(),
                    [](char ch) { return ch == ' '; })) {
      VLOG(2) << "CUDA_VISIBLE_DEVICES or HIP_VISIBLE_DEVICES is set to be "
                 "empty. No GPU detected.";
      return 0;
    }
  }
  int count;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDeviceCount(&count));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count));
#endif
  return count;
}

int GetCUDADeviceCount() {
  // cache the count
  static auto dev_cnt = GetCUDADeviceCountImpl();
  return dev_cnt;
}

/* Here is a very simple CUDA "pro tip": cudaDeviceGetAttribute() is a much
faster way to query device properties. You can see details in
https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/
*/
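// Returns the compute capability of device `id`, encoded as
// major * 10 + minor for CUDA (e.g. 70 for Volta) and major * 100 + minor
// under HIP.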
int GetCUDAComputeCapability(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int major, minor;

#ifdef PADDLE_WITH_HIP
  auto major_error_code = hipDeviceGetAttribute(
      &major, hipDeviceAttributeComputeCapabilityMajor, id);
  auto minor_error_code = hipDeviceGetAttribute(
      &minor, hipDeviceAttributeComputeCapabilityMinor, id);
#else
  auto major_error_code =
      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
  auto minor_error_code =
      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code);
  PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code);
#ifdef PADDLE_WITH_HIP
  return major * 100 + minor;
#else
  return major * 10 + minor;
#endif
}

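// Returns the maximum grid dimensions (x, y, z) supported by device `id`.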
dim3 GetGpuMaxGridDimSize(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  dim3 ret;
  int size;
#ifdef PADDLE_WITH_HIP
  auto error_code_x =
      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id);
#else
  auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x);
  ret.x = size;

#ifdef PADDLE_WITH_HIP
  auto error_code_y =
      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id);
#else
  auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y);
  ret.y = size;

#ifdef PADDLE_WITH_HIP
  auto error_code_z =
      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id);
#else
  auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z);
  ret.z = size;
  return ret;
}

int GetCUDARuntimeVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int runtime_version = 0;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipRuntimeGetVersion(&runtime_version));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
#endif
  return runtime_version;
}

int GetCUDADriverVersion(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int driver_version = 0;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipDriverGetVersion(&driver_version));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version));
#endif
  return driver_version;
}

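// Tensor Cores require CUDA 9.0+ and a device of compute capability 7.0
// (Volta) or newer; this always reports false under HIP.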
bool TensorCoreAvailable() {
#if !defined(PADDLE_WITH_HIP) && CUDA_VERSION >= 9000
  int device = GetCurrentDeviceId();
  int compute_capability = GetCUDAComputeCapability(device);
  return compute_capability >= 70;
#else
  return false;
#endif
}

int GetCUDAMultiProcessors(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int count;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(
      hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
#endif
  return count;
}

int GetCUDAMaxThreadsPerMultiProcessor(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int count;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceGetAttribute(
      &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute(
      &count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
#endif
  return count;
}

int GetCUDAMaxThreadsPerBlock(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
  int count;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(
      hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
#endif
  return count;
}

int GetCurrentDeviceId() {
  int device_id;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDevice(&device_id));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id));
#endif
  return device_id;
}

//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
  // use user specified GPUs in single-node multi-process mode.
  std::vector<int> devices;
  if (!FLAGS_selected_gpus.empty()) {
    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
    for (auto id : devices_str) {
      devices.push_back(atoi(id.c_str()));
    }
  } else {
    int count = GetCUDADeviceCount();
    for (int i = 0; i < count; ++i) {
      devices.push_back(i);
    }
  }
  return devices;
}

const gpuDeviceProp &GetDeviceProperties(int id) {
  std::call_once(g_device_props_size_init_flag, [&] {
    int gpu_num = platform::GetCUDADeviceCount();
    g_device_props_init_flags.resize(gpu_num);
    g_device_props.resize(gpu_num);
    for (int i = 0; i < gpu_num; ++i) {
      g_device_props_init_flags[i] = std::make_unique<std::once_flag>();
    }
  });

  if (id == -1) {
    id = platform::GetCurrentDeviceId();
  }

  if (id < 0 || id >= static_cast<int>(g_device_props.size())) {
    PADDLE_THROW(platform::errors::OutOfRange(
        "The device id %d is out of range [0, %d), where %d is the number of "
        "devices on this machine. Because the device id should be greater than "
        "or equal to zero and smaller than the number of gpus. Please input "
        "appropriate device again!",
        id, static_cast<int>(g_device_props.size()),
        static_cast<int>(g_device_props.size())));
  }

  std::call_once(*(g_device_props_init_flags[id]), [&] {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_CUDA_SUCCESS(
        hipGetDeviceProperties(&g_device_props[id], id));
#else
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaGetDeviceProperties(&g_device_props[id], id));
#endif
  });

  return g_device_props[id];
}

void SetDeviceId(int id) {
  // TODO(qijun): find a better way to cache the cuda device count
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
                    platform::errors::InvalidArgument(
                        "Device id must be less than GPU count, "
                        "but received id is: %d. GPU count is: %d.",
                        id, GetCUDADeviceCount()));
#ifdef PADDLE_WITH_HIP
  PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id));
#else
  PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
#endif
}

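// Reports the available and total memory of the current device as seen
// through the recorded allocator, so a FLAGS_gpu_memory_limit_mb cap is
// reflected in the numbers. A minimal usage sketch:
//   size_t avail = 0, total = 0;
//   platform::GpuMemoryUsage(&avail, &total);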
void GpuMemoryUsage(size_t *available, size_t *total) {
  size_t actual_available, actual_total;
  RecordedCudaMemGetInfo(available, total, &actual_available, &actual_total,
                         platform::GetCurrentDeviceId());
}

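// Computes how much memory may still be allocated: the available memory
// minus the reserved fraction (fraction_reserve_gpu_memory, i.e. 5%). For
// example, with 10000 MiB available, roughly 9500 MiB remains allocatable;
// anything below the minimum chunk size counts as zero.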
size_t GpuAvailableMemToAlloc() {
  size_t total = 0;
  size_t available = 0;
  GpuMemoryUsage(&available, &total);
  size_t reserving =
      static_cast<size_t>(fraction_reserve_gpu_memory * available);
  size_t available_to_alloc = available - reserving;
  size_t min_chunk_size = GpuMinChunkSize();
  // If the available size is less than the minimum chunk size, no memory is
  // usable.
  if (available_to_alloc < min_chunk_size) {
    available_to_alloc = 0;
  }
  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
  return available_to_alloc;
}

size_t GpuMaxAllocSize() {
  return std::max(GpuInitAllocSize(), GpuReallocSize());
}

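// Chooses the (re)allocation chunk size: an explicit
// FLAGS_initial_gpu_memory_in_mb / FLAGS_reallocate_gpu_memory_in_mb value
// takes precedence; otherwise FLAGS_fraction_of_gpu_memory_to_use of the
// allocatable memory is used.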
static size_t GpuAllocSize(bool realloc) {
  size_t available_to_alloc = GpuAvailableMemToAlloc();
  PADDLE_ENFORCE_GT(
      available_to_alloc, 0,
      platform::errors::ResourceExhausted("Not enough available GPU memory."));
  // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
  // allocated by fraction
  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
                           : FLAGS_initial_gpu_memory_in_mb;
  size_t alloc_bytes =
      (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
                                           FLAGS_fraction_of_gpu_memory_to_use);
  PADDLE_ENFORCE_GE(
      available_to_alloc, alloc_bytes,
      platform::errors::ResourceExhausted("Not enough available GPU memory."));
  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
           << " MiB, is it Re-alloc: " << realloc;
  return alloc_bytes;
}

size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }

size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }

size_t GpuMinChunkSize() {
  // The minimum chunk size allowed to be allocated is 256 bytes.
  return 1 << 8;
}

size_t GpuMaxChunkSize() {
  size_t max_chunk_size = GpuMaxAllocSize();
  VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
  return max_chunk_size;
}

#ifdef PADDLE_WITH_HIP
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum hipMemcpyKind kind, hipStream_t stream) {
  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream));
}
#else
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind, cudaStream_t stream) {
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
}
#endif

#ifdef PADDLE_WITH_HIP
void GpuMemcpySync(void *dst, const void *src, size_t count,
                   enum hipMemcpyKind kind) {
  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, count, kind));
}
#else
void GpuMemcpySync(void *dst, const void *src, size_t count,
                   enum cudaMemcpyKind kind) {
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind));
}
#endif

void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count, gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(
      hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
#endif
}

void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                       int src_device, size_t count) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(
      hipMemcpyPeer(dst, dst_device, src, src_device, count));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaMemcpyPeer(dst, dst_device, src, src_device, count));
#endif
}

void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemsetAsync(dst, value, count, stream));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream));
#endif
}

void GpuStreamSync(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
}

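// Re-raises `*status` unless it is an out-of-memory error, which is
// converted to success so that the caller can handle OOM itself; the sticky
// last-error flag is checked and cleared the same way.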
static void RaiseNonOutOfMemoryError(gpuError_t *status) {
#ifdef PADDLE_WITH_HIP
  if (*status == hipErrorOutOfMemory) {
    *status = hipSuccess;
  }
#else
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);

#ifdef PADDLE_WITH_HIP
  *status = hipGetLastError();
  if (*status == hipErrorOutOfMemory) {
    *status = hipSuccess;
  }
#else
  *status = cudaGetLastError();
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
  }
#endif
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
}

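// Per-device wrapper around cudaMalloc/hipMalloc. When
// FLAGS_gpu_memory_limit_mb is non-zero, it tracks the bytes currently
// allocated on the device and rejects requests that would exceed the limit,
// so GPU memory usage can be both capped and queried.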
class RecordedCudaMallocHelper {
 private:
  explicit RecordedCudaMallocHelper(int dev_id, uint64_t limit_size = 0)
      : dev_id_(dev_id), limit_size_(limit_size) {
    if (NeedRecord()) {
      mtx_.reset(new std::mutex());
    }
  }

  DISABLE_COPY_AND_ASSIGN(RecordedCudaMallocHelper);

 public:
  static RecordedCudaMallocHelper *Instance(int dev_id) {
    std::call_once(once_flag_, [] {
      int dev_cnt = GetCUDADeviceCount();
      instances_.reserve(dev_cnt);
      for (int i = 0; i < dev_cnt; ++i) {
        instances_.emplace_back(
            new RecordedCudaMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
      }
    });

    PADDLE_ENFORCE_GE(
        dev_id, 0,
        platform::errors::OutOfRange(
            "Device id must be not less than 0, but got %d.", dev_id));
    PADDLE_ENFORCE_LT(
        dev_id, instances_.size(),
        platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.",
                                     dev_id, instances_.size()));
    return instances_[dev_id].get();
  }

  /**
   * Try to allocate `size` bytes of GPU memory. Only cudaErrorMemoryAllocation
   * or cudaSuccess is returned, and the cudaGetLastError() flag is left
   * cleared.
   */
  gpuError_t Malloc(void **ptr, size_t size) {
    LockGuardPtr<std::mutex> lock(mtx_);
    if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
#ifdef PADDLE_WITH_HIP
      return hipErrorOutOfMemory;
#else
      return cudaErrorMemoryAllocation;
#endif
    }

    CUDADeviceGuard guard(dev_id_);
#ifdef PADDLE_WITH_HIP
    auto result = hipMalloc(ptr, size);
#else
    auto result = cudaMalloc(ptr, size);
#endif
    if (result == gpuSuccess) {
      cur_size_.fetch_add(size);
      STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
      return gpuSuccess;
    } else {
      RaiseNonOutOfMemoryError(&result);
// Any non-out-of-memory error would have been raised inside
// RaiseNonOutOfMemoryError, so we can return
// cudaErrorMemoryAllocation directly here.
#ifdef PADDLE_WITH_HIP
      return hipErrorOutOfMemory;
#else
      return cudaErrorMemoryAllocation;
#endif
    }
  }

  /**
   * Free GPU memory. Freeing is normally not allowed to raise an error;
   * if it does, the process should crash.
   */
  void Free(void *ptr, size_t size) {
    // Purposefully allow cudaErrorCudartUnloading, because
    // that is returned if you ever call cudaFree after the
    // driver has already shutdown. This happens only if the
    // process is terminating, in which case we don't care if
    // cudaFree succeeds.
    CUDADeviceGuard guard(dev_id_);
#ifdef PADDLE_WITH_HIP
    auto err = hipFree(ptr);
    if (err != hipErrorDeinitialized) {
#else
    auto err = cudaFree(ptr);
    if (err != cudaErrorCudartUnloading) {
#endif
      PADDLE_ENFORCE_CUDA_SUCCESS(err);
      cur_size_.fetch_sub(size);
      STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
    } else {
#ifdef PADDLE_WITH_HIP
      hipGetLastError();  // clear the error flag when hipErrorDeinitialized
#else
      cudaGetLastError();  // clear the error flag when cudaErrorCudartUnloading
#endif
    }
  }

  bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
                  size_t *actual_total) {
    {
      CUDADeviceGuard guard(dev_id_);
#ifdef PADDLE_WITH_HIP
      auto result = hipMemGetInfo(actual_avail, actual_total);
#else
      auto result = cudaMemGetInfo(actual_avail, actual_total);
#endif
      if (result != gpuSuccess) {
        *actual_avail = 0;
      }
      RaiseNonOutOfMemoryError(&result);
    }

    if (NeedRecord()) {
      std::lock_guard<std::mutex> guard(*mtx_);
      *avail = std::min(*actual_avail, limit_size_ - cur_size_.load());
      *total = std::min(*actual_total, limit_size_);
      return *total < *actual_total;
    } else {
      *avail = *actual_avail;
      *total = *actual_total;
      return false;
    }
  }

  inline bool NeedRecord() const { return limit_size_ != 0; }

  uint64_t RecordedSize() const { return cur_size_.load(); }

  uint64_t LimitSize() const { return limit_size_; }

 private:
  const int dev_id_;
  const uint64_t limit_size_;
  std::atomic<uint64_t> cur_size_{0};

  mutable std::unique_ptr<std::mutex> mtx_;

  static std::once_flag once_flag_;
  static std::vector<std::unique_ptr<RecordedCudaMallocHelper>> instances_;
};  // NOLINT

std::once_flag RecordedCudaMallocHelper::once_flag_;
std::vector<std::unique_ptr<RecordedCudaMallocHelper>>
    RecordedCudaMallocHelper::instances_;

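// Free-function wrappers that route through the per-device
// RecordedCudaMallocHelper singletons. A minimal usage sketch:
//   void *p = nullptr;
//   if (RecordedCudaMalloc(&p, 1 << 20, /*dev_id=*/0) == gpuSuccess) {
//     RecordedCudaFree(p, 1 << 20, /*dev_id=*/0);
//   }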
gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size);
}

void RecordedCudaFree(void *p, size_t size, int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->Free(p, size);
}

bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
                            size_t *actual_total, int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->GetMemInfo(
      avail, total, actual_avail, actual_total);
}

uint64_t RecordedCudaMallocSize(int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->RecordedSize();
}

bool IsCudaMallocRecorded(int dev_id) {
  return RecordedCudaMallocHelper::Instance(dev_id)->NeedRecord();
}

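// Releases memory cached by the allocator on every selected device.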
void EmptyCache(void) {
  std::vector<int> devices = GetSelectedDevices();
  for (auto device : devices) {
    memory::Release(CUDAPlace(device));
  }
}

}  // namespace platform
}  // namespace paddle