/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#define GLOG_NO_ABBREVIATED_SEVERITIES

#include "paddle/fluid/memory/detail/system_allocator.h"

#include "paddle/fluid/memory/stats.h"

#ifdef _WIN32
#include <malloc.h>
#ifndef NOMINMAX
#define NOMINMAX  // msvc max/min macro conflict with std::min/max
#endif
#include <windows.h>  // VirtualLock/VirtualUnlock
#else
#include <sys/mman.h>  // for mlock and munlock
#endif
#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"

DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);

namespace paddle {
namespace memory {
namespace detail {

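// Note (summarizing the behavior below): each allocator returns a raw pointer
// together with an `index` tag that callers must hand back to Free(). By the
// convention in this file, index 0 marks plain (device or unlocked host)
// memory and index 1 marks pinned/locked memory.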
void* AlignedMalloc(size_t size) {
  void* p = nullptr;
  size_t alignment = 32ul;
#ifdef PADDLE_WITH_MKLDNN
  // refer to https://github.com/01org/mkl-dnn/blob/master/include/dnnl.hpp
  // memory alignment
  alignment = 4096ul;
#endif
#ifdef _WIN32
  p = _aligned_malloc(size, alignment);
#else
  int error = posix_memalign(&p, alignment, size);
  PADDLE_ENFORCE_EQ(
      error,
      0,
      platform::errors::ResourceExhausted(
          "Fail to alloc memory of %ld size, error code is %d.", size, error));
#endif
  PADDLE_ENFORCE_NOT_NULL(p,
                          platform::errors::ResourceExhausted(
                              "Fail to alloc memory of %ld size.", size));
  return p;
}

void* CPUAllocator::Alloc(size_t* index, size_t size) {
  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
  // malloc might not return nullptr if size is zero, but the returned
  // pointer shall not be dereferenced -- so we make it nullptr.
  if (size <= 0) return nullptr;

  *index = 0;  // unlock memory

  void* p = AlignedMalloc(size);

  if (p != nullptr) {
    if (FLAGS_use_pinned_memory) {
      *index = 1;
#ifdef _WIN32
      VirtualLock(p, size);
#else
      mlock(p, size);  // lock memory
#endif
    }
  }

  HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
  platform::RecordMemEvent(
      p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
  return p;
}

void CPUAllocator::Free(void* p, size_t size, size_t index) {
  if (p != nullptr && index == 1) {
#ifdef _WIN32
    VirtualUnlock(p, size);
#else
    munlock(p, size);
#endif
  }
#ifdef _WIN32
  _aligned_free(p);
#else
  free(p);
#endif

  HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
  platform::RecordMemEvent(
      p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
}

bool CPUAllocator::UseGpu() const { return false; }
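
// A minimal usage sketch of the Alloc/Free contract above (illustrative
// only, not part of this file):
//
//   CPUAllocator a;
//   size_t index = 0;
//   void* buf = a.Alloc(&index, 1 << 20);  // 1 MiB; index is set to 0 or 1
//   // ... use buf ...
//   a.Free(buf, 1 << 20, index);  // pass back the same size and index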

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

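// GPU memory goes through platform::RecordedGpuMalloc so that per-device
// usage is tracked and an optional FLAGS_gpu_memory_limit_mb cap can be
// enforced; on failure, RecordedGpuMemGetInfo reports both the capped and
// the actual device memory for the diagnostic below.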
void* GPUAllocator::Alloc(size_t* index, size_t size) {
  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
  // if size is 0.  We just make sure it does.
  if (size <= 0) return nullptr;

  void* p;
  auto result = platform::RecordedGpuMalloc(&p, size, gpu_id_);

  if (result == gpuSuccess) {
    *index = 0;
    gpu_alloc_size_ += size;
    return p;
  } else {
    size_t avail, total, actual_avail, actual_total;
    bool is_limited = platform::RecordedGpuMemGetInfo(
        &avail, &total, &actual_avail, &actual_total, gpu_id_);
    size_t allocated = total - avail;

    std::string err_msg;
    if (is_limited) {
      auto limit_size = (total >> 20);
      err_msg = string::Sprintf(
          "\n   3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
          "maximum GPU memory usage is limited to %d MB.\n"
          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
          limit_size,
          limit_size);
    }

    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
        "\n\nOut of memory error on GPU %d. "
        "Cannot allocate %s memory on GPU %d, %s memory has been allocated and "
        "available memory is only %s.\n\n"
        "Please check whether there is any other process using GPU %d.\n"
        "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
        "2. If no, please try one of the following suggestions:\n"
        "   1) Decrease the batch size of your model.\n"
        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
        "please set it to a higher value but less than 1.0.\n"
        "      The command is "
        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
        gpu_id_,
        string::HumanReadableSize(size),
        gpu_id_,
        string::HumanReadableSize(allocated),
        string::HumanReadableSize(avail),
        gpu_id_,
        FLAGS_fraction_of_gpu_memory_to_use,
        err_msg));
  }
}

void GPUAllocator::Free(void* p, size_t size, size_t index) {
  PADDLE_ENFORCE_EQ(index,
                    0,
                    platform::errors::InvalidArgument(
                        "The index should be 0, index is %d", index));
  PADDLE_ENFORCE_GE(gpu_alloc_size_,
                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated gpu memory (%d)",
                        size,
                        gpu_alloc_size_));
  gpu_alloc_size_ -= size;

  platform::RecordedGpuFree(p, size, gpu_id_);
}

bool GPUAllocator::UseGpu() const { return true; }

// PINNED memory allows direct DMA transfers by the GPU to and from system
// memory. It's locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;

  // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
  // of host pinned allocation. Allocating too much would reduce
  // the amount of memory available to the underlying system for paging.
  size_t usable =
      paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;

  if (size > usable) {
    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
                 << " MB pinned memory, available " << usable / 1024.0 / 1024.0
                 << " MB";
    return nullptr;
  }

  void* p;
// PINNED memory is visible to all CUDA contexts.
#ifdef PADDLE_WITH_HIP
  hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable);
#else
  cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable);
#endif

  if (result == gpuSuccess) {
    *index = 1;  // PINNED memory
    cuda_pinnd_alloc_size_ += size;
    HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
    platform::RecordMemEvent(
        p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
    return p;
  } else {
    LOG(WARNING) << "cudaHostAlloc failed.";
    return nullptr;
  }
}

void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
  gpuError_t err;
  PADDLE_ENFORCE_EQ(index,
                    1,
                    platform::errors::InvalidArgument(
                        "The index should be 1, but got %d", index));

  PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_,
                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated cuda pinned memory (%d)",
                        size,
                        cuda_pinnd_alloc_size_));
  cuda_pinnd_alloc_size_ -= size;
#ifdef PADDLE_WITH_HIP
  err = hipHostFree(p);
  if (err != hipErrorDeinitialized) {
    PADDLE_ENFORCE_EQ(
        err,
        hipSuccess,
        platform::errors::Fatal(
            "hipHostFree failed in GPUPinnedAllocator, error code is %d",
            err));
  }
#else
  err = cudaFreeHost(p);

  // Purposefully allow cudaErrorCudartUnloading, because
  // that is returned if you ever call cudaFreeHost after the
  // driver has already shut down. This happens only if the
  // process is terminating, in which case we don't care if
  // cudaFreeHost succeeds.
  if (err != cudaErrorCudartUnloading) {
    PADDLE_ENFORCE_EQ(
        err,
        0,
        platform::errors::Fatal(
            "cudaFreeHost failed in GPUPinnedAllocator, error code is %d",
            err));
  }
#endif
  HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
  platform::RecordMemEvent(
      p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
}

bool CUDAPinnedAllocator::UseGpu() const { return false; }
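
// Illustrative sketch (not part of this file): page-locked buffers are what
// let cudaMemcpyAsync overlap copies with compute, which is the point of
// CUDAPinnedAllocator:
//
//   size_t index = 0;
//   void* host = pinned_alloc.Alloc(&index, bytes);  // page-locked host mem
//   cudaMemcpyAsync(dev, host, bytes, cudaMemcpyHostToDevice, stream);
//   pinned_alloc.Free(host, bytes, index);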

#endif

#ifdef PADDLE_WITH_ASCEND_CL
void* NPUAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;

  void* p;
  auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);

  if (result == ACL_ERROR_NONE) {
    *index = 0;
    npu_alloc_size_ += size;
    return p;
  } else {
    size_t avail, total, actual_avail, actual_total;
    bool is_limited = platform::RecordedNPUMemGetInfo(
        &avail, &total, &actual_avail, &actual_total, npu_id_);

    std::string err_msg;
    if (is_limited) {
      auto limit_size = (total >> 20);
      err_msg = string::Sprintf(
          "\n   3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
          "maximum GPU memory usage is limited to %d MB.\n"
          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
          limit_size,
          limit_size);
    }

    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
        "\n\nOut of memory error on NPU %d. "
        "Cannot allocate %s memory on NPU %d, "
        "available memory is only %s.\n\n"
        "Please check whether there is any other process using NPU %d.\n"
        "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
        "2. If no, please try one of the following suggestions:\n"
        "   1) Decrease the batch size of your model.\n"
        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
        "please set it to a higher value but less than 1.0.\n"
        "      The command is "
        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
        npu_id_,
        string::HumanReadableSize(size),
        npu_id_,
        string::HumanReadableSize(avail),
        npu_id_,
        FLAGS_fraction_of_gpu_memory_to_use,
        err_msg));
  }
}

void NPUAllocator::Free(void* p, size_t size, size_t index) {
  VLOG(4) << "Free " << p << " size " << size;
  PADDLE_ENFORCE_EQ(index,
                    0,
                    platform::errors::InvalidArgument(
                        "The index should be 0, index is %d", index));
  PADDLE_ENFORCE_GE(npu_alloc_size_,
                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated npu memory (%d)",
                        size,
                        npu_alloc_size_));
  npu_alloc_size_ -= size;

  platform::RecordedNPUFree(p, size, npu_id_);
}

bool NPUAllocator::UseGpu() const { return true; }

void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;

  size_t usable =
      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;

  if (size > usable) {
    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
                 << " MB pinned memory, available " << usable / 1024.0 / 1024.0
                 << " MB";
    return nullptr;
  }

  void* p;
  // PINNED memory is visible to all NPU contexts.
  auto result = platform::NPUHostMalloc(&p, size);

  if (result == ACL_ERROR_NONE) {
    *index = 1;  // PINNED memory
    npu_pinnd_alloc_size_ += size;
    return p;
  } else {
    LOG(WARNING) << "NPUHostMalloc failed.";
    return nullptr;
  }
}

void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
  aclError err;
  PADDLE_ENFORCE_EQ(index,
                    1,
                    platform::errors::InvalidArgument(
                        "The index should be 1, but got %d", index));

  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated npu pinned memory (%d)",
                        size,
                        npu_pinnd_alloc_size_));
  npu_pinnd_alloc_size_ -= size;
  err = platform::NPUHostFree(p);

  if (err != ACL_ERROR_NONE) {
    PADDLE_ENFORCE_EQ(
        err,
        0,
        platform::errors::Fatal(
            "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
  }
}

bool NPUPinnedAllocator::UseGpu() const { return false; }

#endif

#ifdef PADDLE_WITH_MLU
void* MLUAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;

  void* p;
  auto result = platform::RecordedMLUMalloc(&p, size, mlu_id_);

  if (result == cnrtSuccess) {
    *index = 0;
    mlu_alloc_size_ += size;
    return p;
  } else {
    size_t avail, total, actual_avail, actual_total;
    bool is_limited = platform::RecordedMLUMemGetInfo(
        &avail, &total, &actual_avail, &actual_total, mlu_id_);
    size_t allocated = total - avail;

    std::string err_msg;
    if (is_limited) {
      auto limit_size = (total >> 20);
      err_msg = string::Sprintf(
          "\n   3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
          "maximum MLU memory usage is limited to %d MB.\n"
          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
          limit_size,
          limit_size);
    }

    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
        "\n\nOut of memory error on MLU %d. "
        "Cannot allocate %s memory on MLU %d, %s memory has been allocated and "
        "available memory is only %s.\n\n"
        "Please check whether there is any other process using MLU %d.\n"
        "1. If yes, please stop them, or start PaddlePaddle on another MLU.\n"
        "2. If no, please try one of the following suggestions:\n"
        "   1) Decrease the batch size of your model.\n"
        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
        "please set it to a higher value but less than 1.0.\n"
        "      The command is "
        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
        mlu_id_,
        string::HumanReadableSize(size),
        mlu_id_,
        string::HumanReadableSize(allocated),
        string::HumanReadableSize(avail),
        mlu_id_,
        FLAGS_fraction_of_gpu_memory_to_use,
        err_msg));
  }
}

void MLUAllocator::Free(void* p, size_t size, size_t index) {
  PADDLE_ENFORCE_EQ(index,
                    0,
                    platform::errors::InvalidArgument(
                        "The index should be 0, index is %d", index));
  PADDLE_ENFORCE_GE(mlu_alloc_size_,
                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated mlu memory (%d)",
                        size,
                        mlu_alloc_size_));
  mlu_alloc_size_ -= size;

  platform::RecordedMLUFree(p, size, mlu_id_);
}

bool MLUAllocator::UseGpu() const { return true; }
#endif

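// CustomAllocator forwards to the pluggable device runtime registered with
// phi::DeviceManager, so out-of-tree devices reuse the same SystemAllocator
// interface as the built-in backends.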
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void* CustomAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;

  void* p;
  auto place = platform::CustomPlace(dev_type_, dev_id_);
  auto device = phi::DeviceManager::GetDeviceWithPlace(place);
  p = device->MemoryAllocate(size);
  if (LIKELY(p)) {
    VLOG(4) << "CustomAllocator::Alloc " << p << " size " << size;
    *index = 0;
    plug_alloc_size += size;
  } else {
    size_t avail, total;

    phi::DeviceManager::MemoryStats(place, &total, &avail);
    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
        "\n\nOut of memory error on %s %d. "
        "total memory is %s, used memory is %s, "
        "available memory is only %s.\n\n",
        dev_type_,
        dev_id_,
        string::HumanReadableSize(total),
        string::HumanReadableSize(total - avail),
        string::HumanReadableSize(avail)));
  }
  return p;
}

void CustomAllocator::Free(void* p, size_t size, size_t index) {
  VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
  PADDLE_ENFORCE_EQ(index,
                    0,
                    platform::errors::InvalidArgument(
                        "The index should be 0, index is %d", index));
  PADDLE_ENFORCE_GE(plug_alloc_size,
                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated custom device memory (%d)",
                        size,
                        plug_alloc_size));
  plug_alloc_size -= size;
  auto place = platform::CustomPlace(dev_type_, dev_id_);
  auto device = phi::DeviceManager::GetDeviceWithPlace(place);
  device->MemoryDeallocate(p, size);
}

bool CustomAllocator::UseGpu() const { return true; }
#endif

}  // namespace detail
}  // namespace memory
}  // namespace paddle