naive_best_fit_allocator.cc 31.0 KB
Newer Older
Y
Yu Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

15 16
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"

17
#include <mutex>
M
minqiyang 已提交
18

19
#include "gflags/gflags.h"
Y
Yu Yang 已提交
20 21
#include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
22
#include "paddle/fluid/memory/detail/system_allocator.h"
23
#include "paddle/fluid/platform/device/device_wrapper.h"
24
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
25
#include "paddle/fluid/platform/enforce.h"
C
chengduo 已提交
26
#include "paddle/fluid/platform/profiler.h"
27

Y
Yu Yang 已提交
28
#include "paddle/fluid/string/printf.h"
29
#include "paddle/fluid/string/split.h"
30
#include "paddle/phi/common/place.h"
31
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
32 33
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
Y
Yu Yang 已提交
34

Z
Zeng Jinle 已提交
35 36 37 38 39 40 41
// Debug aid: when true, freshly allocated buffers are filled with 0xEF so
// that ops wrongly assuming zero-initialized memory fail fast in tests.
PADDLE_DEFINE_EXPORTED_bool(
    init_allocated_mem, false,
    "It is a mistake that the values of the memory allocated by "
    "BuddyAllocator are always zeroed in some op's implementation. "
    "To find this error in time, we use init_allocated_mem to indicate "
    "that initializing the allocated memory with a small value "
    "during unit testing.");
Y
Yu Yang 已提交
42
DECLARE_double(fraction_of_gpu_memory_to_use);
43 44
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
D
dzhwinter 已提交
45
DECLARE_bool(benchmark);
Y
Yu Yang 已提交
46 47 48 49 50 51 52 53

namespace paddle {
namespace memory {
namespace legacy {
// Forward declarations of the per-place allocation primitives. Each Place
// type receives explicit specializations further down in this file.
template <typename Place>
void *Alloc(const Place &place, size_t size);

template <typename Place>
void Free(const Place &place, void *p, size_t size);

template <typename Place>
uint64_t Release(const Place &place);

template <typename Place>
size_t Used(const Place &place);

// Visitor reporting bytes currently in use by the allocator backing each
// concrete Place kind; dispatched via boost::apply_visitor.
struct Usage : public boost::static_visitor<size_t> {
  size_t operator()(const platform::CPUPlace &cpu) const;
  size_t operator()(const platform::CUDAPlace &gpu) const;
  size_t operator()(const platform::CUDAPinnedPlace &cuda_pinned) const;
};

// Total memory currently used at place `p` (dispatches through Usage).
size_t memory_usage(const platform::Place &p);

// Buddy-system allocator type shared by all the per-place singletons below.
using BuddyAllocator = detail::BuddyAllocator;

// Lazily constructs the single process-wide CPU buddy allocator.
// Initialization is serialized with std::call_once; a thread_local variant
// was tried for the inference::RNN1 model but did not help multi-threaded
// tests, so one shared instance is kept.
BuddyAllocator *GetCPUBuddyAllocator() {
  static std::once_flag cpu_allocator_flag;
  static detail::BuddyAllocator *cpu_allocator = nullptr;

  std::call_once(cpu_allocator_flag, []() {
    cpu_allocator = new detail::BuddyAllocator(
        std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
  });

  return cpu_allocator;
}

// Allocates `size` bytes from the shared CPU buddy allocator. With
// FLAGS_init_allocated_mem set, the block is poisoned with 0xEF so reads of
// uninitialized memory surface quickly during tests.
template <>
void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  void *ptr = GetCPUBuddyAllocator()->Alloc(size);
  if (FLAGS_init_allocated_mem) {
    memset(ptr, 0xEF, size);
  }
  VLOG(10) << "  pointer=" << ptr;
  return ptr;
}

template <>
L
liuwei1031 已提交
99 100
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
                              size_t size) {
G
gongweibao 已提交
101
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
Y
Yu Yang 已提交
102 103 104
  GetCPUBuddyAllocator()->Free(p);
}

105
template <>
W
Wilber 已提交
106 107
uint64_t Release<platform::CPUPlace>(const platform::CPUPlace &place) {
  return GetCPUBuddyAllocator()->Release();
108 109
}

Y
Yu Yang 已提交
110 111 112 113 114
// Bytes currently handed out by the CPU buddy allocator.
template <>
size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
  return GetCPUBuddyAllocator()->Used();
}

J
jianghaicheng 已提交
115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
// For Graphcore IPU
// IPU buffers are staged through host memory, so the IPU primitives simply
// delegate to the CPU buddy allocator.
template <>
void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  VLOG(10) << "IPUPlace, Allocate on cpu.";

  void *p = GetCPUBuddyAllocator()->Alloc(size);
  if (FLAGS_init_allocated_mem) {
    memset(p, 0xEF, size);
  }
  VLOG(10) << "  pointer=" << p;
  return p;
}
// Frees IPU-staged host memory via the CPU buddy allocator.
template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
                              size_t size) {
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  GetCPUBuddyAllocator()->Free(p);
}
// Releases the CPU buddy allocator's cached chunks (IPU uses host memory).
template <>
uint64_t Release<platform::IPUPlace>(const platform::IPUPlace &place) {
  return GetCPUBuddyAllocator()->Release();
}
// Bytes in use for IPU, i.e. bytes in use by the CPU buddy allocator.
template <>
size_t Used<platform::IPUPlace>(const platform::IPUPlace &place) {
  return GetCPUBuddyAllocator()->Used();
}

143
// For kunlun XPU
144 145 146 147 148
// Allocates `size` bytes with xpu_malloc directly (XPU has no buddy
// allocator). On the first failure it waits for in-flight work to drain
// with xpu_wait() and retries once before enforcing success.
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  void *ptr = nullptr;

  platform::XPUDeviceGuard guard(place.device);
  int ret = xpu_malloc(reinterpret_cast<void **>(&ptr), size);
  if (ret != XPU_SUCCESS) {
    VLOG(10) << "xpu memory malloc(" << size << ") failed, try again";
    xpu_wait();
    ret = xpu_malloc(reinterpret_cast<void **>(&ptr), size);
  }
  PADDLE_ENFORCE_EQ(
      ret, XPU_SUCCESS,
      platform::errors::External(
          "XPU API return wrong value[%d], no enough memory", ret));
  if (FLAGS_init_allocated_mem) {
    // Memory poisoning is not available on XPU.
    PADDLE_THROW(platform::errors::Unimplemented(
        "xpu memory FLAGS_init_allocated_mem is not implemented."));
  }
  VLOG(10) << "  pointer=" << ptr;
  return ptr;
#else
  PADDLE_THROW(
      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
  return nullptr;
#endif
}

// Frees an XPU block with xpu_free under a device guard.
template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
                              size_t size) {
#ifdef PADDLE_WITH_XPU
  // BUG FIX: this function previously also logged
  // "Allocate <size> bytes on <place>" — a line copy-pasted from Alloc that
  // made debug traces claim an allocation during every free. Only the Free
  // log remains. The misspelled local `gurad` is also corrected.
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);

  platform::XPUDeviceGuard guard(place.device);
  xpu_free(p);
#else
  PADDLE_THROW(
      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}

189
template <>
W
Wilber 已提交
190
uint64_t Release<platform::XPUPlace>(const platform::XPUPlace &place) {
191
#ifdef PADDLE_WITH_XPU
192
  LOG(WARNING) << "Release XPU pool is not supported now, no action here.";
193 194 195 196
#else
  PADDLE_THROW(
      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
197
  return -1;
198 199
}

200 201 202 203 204 205 206 207 208 209 210
// XPU has no usage-tracking allocator, so Used always reports 0.
template <>
size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
  // BUG FIX: this used to be an unconditional printf to stdout, spamming
  // production output on every query; demoted to a debug-level VLOG to
  // match the logging style of every other Used specialization here.
  VLOG(10) << "Used func return 0 for XPUPlace";
  return 0;
#else
  PADDLE_THROW(
      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}

211 212
// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
213
constexpr int EXTRA_PADDING_SIZE = 32;
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
// Registry of one buddy allocator per visible NPU device. Construction of
// each allocator is deferred until first use and guarded by a per-device
// once_flag so concurrent first calls are safe.
class NPUBuddyAllocatorList {
 private:
  NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
    auto npu_num = devices_.size();
    allocators_.resize(npu_num);
    init_flags_.reserve(npu_num);
    for (size_t i = 0; i < npu_num; ++i) {
      init_flags_.emplace_back(new std::once_flag());
    }
  }

  static NPUBuddyAllocatorList *CreateNewInstance() {
    return new NPUBuddyAllocatorList();
  }

 public:
  static NPUBuddyAllocatorList *Instance() {
    static auto *instance = CreateNewInstance();
    return instance;
  }

  // Returns the allocator for `npu_id`, creating it on first access.
  BuddyAllocator *Get(int npu_id) {
    auto pos = std::distance(
        devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
    PADDLE_ENFORCE_LT(pos, devices_.size(),
                      platform::errors::OutOfRange(
                          "The index exceeds the size of devices, the size of "
                          "devices is %d, the index is %d",
                          devices_.size(), pos));

    std::call_once(*init_flags_[pos], [this, pos] {
      platform::SetNPUDeviceId(devices_[pos]);
      allocators_[pos].reset(
          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::NPUAllocator(devices_[pos])),
                             platform::NPUMinChunkSize(),
                             platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE));
      VLOG(10) << "\n\nNOTE:\n"
               << "You can set GFlags environment variable "
               << "'FLAGS_fraction_of_gpu_memory_to_use' "
               << "or 'FLAGS_initial_gpu_memory_in_mb' "
               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
               << "to change the memory size for GPU usage.\n"
               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
               << FLAGS_fraction_of_gpu_memory_to_use
               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
               << FLAGS_initial_gpu_memory_in_mb
               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
    });

    return allocators_[pos].get();
  }

 private:
  std::vector<int> devices_;                                 // visible NPU ids
  std::vector<std::unique_ptr<std::once_flag>> init_flags_;  // one per device
  std::vector<std::unique_ptr<BuddyAllocator>> allocators_;  // lazy-built
};

// Convenience accessor for the buddy allocator bound to `npu_id`.
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
  return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291

// Lazily builds the single buddy allocator backed by NPU-pinned host
// memory; thread-safe via std::call_once.
BuddyAllocator *GetNPUPinnedBuddyAllocator() {
  static std::once_flag init_flag;
  static BuddyAllocator *ba = nullptr;

  std::call_once(init_flag, []() {
    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                new detail::NPUPinnedAllocator),
                            platform::NPUPinnedMinChunkSize(),
                            platform::NPUPinnedMaxChunkSize());
  });

  return ba;
}

292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313
#endif

// Bytes currently in use by the buddy allocator of `place`'s NPU device.
template <>
size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUBuddyAllocator(place.device)->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPlace' is not supported in CPU only device."));
#endif
}

// Allocates `size` bytes from the NPU buddy allocator; throws
// ResourceExhausted with a usage report when the pool cannot satisfy it.
template <>
void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
  auto *ptr = buddy_allocator->Alloc(size);
  if (ptr == nullptr) {
    // BUG FIX: `platform::NPUDeviceGuard(place.device);` constructed an
    // unnamed temporary that was destroyed at the end of that statement, so
    // the device switch never covered the NPUMemoryUsage query below. The
    // guard is now a named local that lives through the query.
    platform::NPUDeviceGuard guard(place.device);
    size_t avail, total;
    platform::NPUMemoryUsage(&avail, &total);
    PADDLE_THROW(platform::errors::ResourceExhausted(
        "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
        "%s, NpuMaxChunkSize %s, NPU memory used: %s.",
        string::HumanReadableSize(size), place.device,
        string::HumanReadableSize(avail), string::HumanReadableSize(total),
        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
        string::HumanReadableSize(Used<platform::NPUPlace>(place))));
  } else {
    if (FLAGS_init_allocated_mem) {
      // Poison the block so uninitialized reads are detectable in tests.
      platform::NPUMemsetSync(ptr, 0xEF, size, size);
    }
  }
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPlace' is not supported in CPU only device."));
#endif
}

// Returns a block to the buddy allocator of `place`'s NPU device.
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
                              size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  GetNPUBuddyAllocator(place.device)->Free(p);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPlace' is not supported in CPU only device."));
#endif
}

// Trims cached free chunks of the device's pool; returns bytes released.
template <>
uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUBuddyAllocator(place.device)->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPlace' is not supported in CPU only device."));
#endif
}

356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373
// Bytes in use by the NPU-pinned host-memory pool.
template <>
size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUPinnedBuddyAllocator()->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

// Allocates `size` bytes of NPU-pinned host memory. Unlike the device
// Alloc, failure only logs a warning and returns nullptr.
template <>
void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
                                      size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
  void *ptr = buddy_allocator->Alloc(size);

  if (ptr == nullptr) {
    LOG(WARNING) << "Cannot allocate " << size << " bytes in NPUPinnedPlace";
  } else if (FLAGS_init_allocated_mem) {
    // BUG FIX: the poison memset used to run unconditionally after the
    // warning, so a failed allocation (ptr == nullptr) with
    // FLAGS_init_allocated_mem set hit memset(nullptr, ...) — undefined
    // behavior. Poison only successful allocations.
    memset(ptr, 0xEF, size);
  }
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

// Returns a block to the NPU-pinned host-memory pool.
template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
                                    void *p, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  GetNPUPinnedBuddyAllocator()->Free(p);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

// Trims the NPU-pinned pool's cached chunks; returns bytes released.
template <>
uint64_t Release<platform::NPUPinnedPlace>(
    const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUPinnedBuddyAllocator()->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

408
// For CUDA
409
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429
class GPUBuddyAllocatorList {
 private:
  GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) {
    auto gpu_num = devices_.size();
    allocators_.resize(gpu_num);
    init_flags_.reserve(gpu_num);
    for (size_t i = 0; i < gpu_num; ++i) {
      init_flags_.emplace_back(new std::once_flag());
    }
  }

  static GPUBuddyAllocatorList *CreateNewInstance() {
    return new GPUBuddyAllocatorList();
  }

 public:
  static GPUBuddyAllocatorList *Instance() {
    static auto *instance = CreateNewInstance();
    return instance;
  }
Y
Yu Yang 已提交
430

431 432 433
  BuddyAllocator *Get(int gpu_id) {
    auto pos = std::distance(
        devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id));
434 435 436 437 438
    PADDLE_ENFORCE_LT(pos, devices_.size(),
                      platform::errors::OutOfRange(
                          "The index exceeds the size of devices, the size of "
                          "devices is %d, the index is %d",
                          devices_.size(), pos));
439 440 441 442 443 444 445

    std::call_once(*init_flags_[pos], [this, pos] {
      platform::SetDeviceId(devices_[pos]);
      allocators_[pos].reset(new BuddyAllocator(
          std::unique_ptr<detail::SystemAllocator>(
              new detail::GPUAllocator(devices_[pos])),
          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
S
sneaxiy 已提交
446 447 448 449 450 451 452 453 454 455 456 457
      VLOG(10) << "\n\nNOTE:\n"
               << "You can set GFlags environment variable "
               << "'FLAGS_fraction_of_gpu_memory_to_use' "
               << "or 'FLAGS_initial_gpu_memory_in_mb' "
               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
               << "to change the memory size for GPU usage.\n"
               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
               << FLAGS_fraction_of_gpu_memory_to_use
               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
               << FLAGS_initial_gpu_memory_in_mb
               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
458 459 460 461
    });

    return allocators_[pos].get();
  }
Y
Yu Yang 已提交
462

463 464 465 466 467 468 469 470
 private:
  std::vector<int> devices_;
  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
  std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};

// Convenience accessor for the buddy allocator bound to `gpu_id`.
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
  return GPUBuddyAllocatorList::Instance()->Get(gpu_id);
}
#endif

// Bytes currently in use by the buddy allocator of `place`'s GPU device.
template <>
size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  return GetGPUBuddyAllocator(place.device)->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPlace' is not supported in CPU only device."));
#endif
}

// Allocates `size` bytes from the GPU buddy allocator; throws
// ResourceExhausted with a detailed usage report when the pool is empty.
template <>
void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
                                 size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
  auto *ptr = buddy_allocator->Alloc(size);
  if (ptr == nullptr) {
    // BUG FIX: `platform::CUDADeviceGuard(place.device);` constructed an
    // unnamed temporary that was destroyed at the end of that statement, so
    // the device switch never covered the GpuMemoryUsage query below. The
    // guard is now a named local that lives through the query.
    platform::CUDADeviceGuard guard(place.device);
    size_t avail, total;
    platform::GpuMemoryUsage(&avail, &total);
    PADDLE_THROW(platform::errors::ResourceExhausted(
        "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
        "%s, GpuMaxChunkSize %s, GPU memory used: %s.",
        string::HumanReadableSize(size), place.device,
        string::HumanReadableSize(avail), string::HumanReadableSize(total),
        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
        string::HumanReadableSize(Used<platform::CUDAPlace>(place))));
  } else {
    if (FLAGS_init_allocated_mem) {
      // Poison the block so uninitialized reads are detectable in tests.
#ifdef PADDLE_WITH_HIP
      hipMemset(ptr, 0xEF, size);
#else
      cudaMemset(ptr, 0xEF, size);
#endif
    }
  }
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPlace' is not supported in CPU only device."));
#endif
}

template <>
L
liuwei1031 已提交
519 520
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                               size_t size) {
521
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Y
Yu Yang 已提交
522 523
  GetGPUBuddyAllocator(place.device)->Free(p);
#else
524 525
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPlace' is not supported in CPU only device."));
Y
Yu Yang 已提交
526 527 528
#endif
}

529
template <>
W
Wilber 已提交
530
uint64_t Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
531
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
W
Wilber 已提交
532
  return GetGPUBuddyAllocator(place.device)->Release();
533 534 535 536 537 538
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPlace' is not supported in CPU only device."));
#endif
}

539
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Y
Yu Yang 已提交
540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556
// Lazily builds the single buddy allocator backed by CUDA-pinned host
// memory; thread-safe via std::call_once.
BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
  static std::once_flag init_flag;
  static BuddyAllocator *ba = nullptr;

  std::call_once(init_flag, []() {
    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                new detail::CUDAPinnedAllocator),
                            platform::CUDAPinnedMinChunkSize(),
                            platform::CUDAPinnedMaxChunkSize());
  });

  return ba;
}
#endif

// Bytes in use by the CUDA-pinned host-memory pool.
template <>
size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  return GetCUDAPinnedBuddyAllocator()->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPinnedPlace' is not supported in CPU only device."));
#endif
}

// Allocates `size` bytes of CUDA-pinned host memory. Failure only logs a
// warning and returns nullptr (callers handle the null).
template <>
void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
                                       size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  auto *buddy_allocator = GetCUDAPinnedBuddyAllocator();
  void *ptr = buddy_allocator->Alloc(size);

  if (ptr == nullptr) {
    LOG(WARNING) << "cudaHostAlloc Cannot allocate " << size
                 << " bytes in CUDAPinnedPlace";
  } else if (FLAGS_init_allocated_mem) {
    // BUG FIX: the poison memset used to run unconditionally after the
    // warning, so a failed allocation with FLAGS_init_allocated_mem set hit
    // memset(nullptr, ...) — undefined behavior. Poison only successful
    // allocations.
    memset(ptr, 0xEF, size);
  }
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPinnedPlace' is not supported in CPU only device."));
#endif
}

// Returns a block to the CUDA-pinned host-memory pool.
template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
                                     void *p, size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  GetCUDAPinnedBuddyAllocator()->Free(p);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPinnedPlace' is not supported in CPU only device."));
#endif
}

597
template <>
W
Wilber 已提交
598
uint64_t Release<platform::CUDAPinnedPlace>(
599
    const platform::CUDAPinnedPlace &place) {
600
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
W
Wilber 已提交
601
  return GetCUDAPinnedBuddyAllocator()->Release();
602 603 604 605 606 607
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPinnedPlace' is not supported in CPU only device."));
#endif
}

F
fwenguang 已提交
608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735
// For MLU
#ifdef PADDLE_WITH_MLU
// Registry of one buddy allocator per visible MLU device; each allocator is
// built lazily on first Get(), guarded by a per-device once_flag.
class MLUBuddyAllocatorList {
 private:
  MLUBuddyAllocatorList() : devices_(platform::GetMLUSelectedDevices()) {
    auto mlu_num = devices_.size();
    allocators_.resize(mlu_num);
    init_flags_.reserve(mlu_num);
    for (size_t i = 0; i < mlu_num; ++i) {
      init_flags_.emplace_back(new std::once_flag());
    }
  }

  static MLUBuddyAllocatorList *CreateNewInstance() {
    return new MLUBuddyAllocatorList();
  }

 public:
  static MLUBuddyAllocatorList *Instance() {
    static auto *instance = CreateNewInstance();
    return instance;
  }

  // Returns the allocator for `mlu_id`, creating it on first access.
  BuddyAllocator *Get(int mlu_id) {
    auto pos = std::distance(
        devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
    PADDLE_ENFORCE_LT(pos, devices_.size(),
                      platform::errors::OutOfRange(
                          "The index exceeds the size of devices, the size of "
                          "devices is %d, the index is %d",
                          devices_.size(), pos));

    std::call_once(*init_flags_[pos], [this, pos] {
      platform::SetMLUDeviceId(devices_[pos]);
      allocators_[pos].reset(new BuddyAllocator(
          std::unique_ptr<detail::SystemAllocator>(
              new detail::MLUAllocator(devices_[pos])),
          platform::MLUMinChunkSize(), platform::MLUMaxChunkSize()));
      VLOG(10) << "\n\nNOTE:\n"
               << "You can set GFlags environment variable "
               << "(mlu reuse gpu GFlags) "
               << "'FLAGS_fraction_of_gpu_memory_to_use' "
               << "or 'FLAGS_initial_gpu_memory_in_mb' "
               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
               << "to change the memory size for MLU usage.\n"
               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
               << FLAGS_fraction_of_gpu_memory_to_use
               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
               << FLAGS_initial_gpu_memory_in_mb
               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
    });

    return allocators_[pos].get();
  }

 private:
  std::vector<int> devices_;
  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
  std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};

// Convenience accessor for the buddy allocator bound to `mlu_id`.
BuddyAllocator *GetMLUBuddyAllocator(int mlu_id) {
  return MLUBuddyAllocatorList::Instance()->Get(mlu_id);
}
#endif

// Bytes currently in use by the buddy allocator of `place`'s MLU device.
template <>
size_t Used<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
  return GetMLUBuddyAllocator(place.device)->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'MLUPlace' is not supported in CPU only device."));
#endif
}

// Allocates `size` bytes from the MLU buddy allocator; throws
// ResourceExhausted with a usage report when the pool cannot satisfy it.
template <>
void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
#ifdef PADDLE_WITH_MLU
  auto *buddy_allocator = GetMLUBuddyAllocator(place.device);
  auto *ptr = buddy_allocator->Alloc(size);
  if (ptr == nullptr) {
    // BUG FIX: `platform::MLUDeviceGuard(place.device);` constructed an
    // unnamed temporary destroyed immediately, so the device switch never
    // covered the MLUMemoryUsage query below; the guard is now named.
    platform::MLUDeviceGuard guard(place.device);
    size_t avail = 0, total = 0;
    platform::MLUMemoryUsage(&avail, &total);
    // BUG FIX: the message used to say "MLUMinChunkSize" twice; the second
    // value printed is GetMaxChunkSize(), so it is labeled accordingly now.
    PADDLE_THROW(platform::errors::ResourceExhausted(
        "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
        "%s, MLUMaxChunkSize %s, MLU memory used: %s.",
        string::HumanReadableSize(size), place.device,
        string::HumanReadableSize(avail), string::HumanReadableSize(total),
        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
        string::HumanReadableSize(Used<platform::MLUPlace>(place))));
  } else {
    if (FLAGS_init_allocated_mem) {
      // Poison the block so uninitialized reads are detectable in tests.
      cnrtMemset(ptr, 0xEF, size);
    }
  }
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'MLUPlace' is not supported in CPU only device."));
#endif
}

// Returns a block to the buddy allocator of `place`'s MLU device.
template <>
void Free<platform::MLUPlace>(const platform::MLUPlace &place, void *p,
                              size_t size) {
#ifdef PADDLE_WITH_MLU
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  GetMLUBuddyAllocator(place.device)->Free(p);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'MLUPlace' is not supported in CPU only device."));
#endif
}

// Trims cached free chunks of the device's pool; returns bytes released.
template <>
uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
  return GetMLUBuddyAllocator(place.device)->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'MLUPlace' is not supported in CPU only device."));
#endif
}

736 737 738 739 740 741
// For CustomDevice
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class BuddyAllocatorList {
 private:
  explicit BuddyAllocatorList(const std::string &device_type)
      : device_type_(device_type) {
742
    auto devices = phi::DeviceManager::GetDeviceList(device_type);
743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768
    for (auto dev_id : devices) {
      init_flags_[dev_id].reset(new std::once_flag());
    }
  }

  static BuddyAllocatorList *CreateNewInstance(const std::string &device_type) {
    return new BuddyAllocatorList(device_type);
  }

 public:
  static BuddyAllocatorList *Instance(const std::string &device_type) {
    // DeviceType -> AllocatorList
    static std::unordered_map<std::string, BuddyAllocatorList *> pool;
    if (pool.find(device_type) == pool.end()) {
      pool[device_type] = CreateNewInstance(device_type);
    }
    return pool[device_type];
  }

  BuddyAllocator *Get(int dev_id) {
    PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(),
                      platform::errors::OutOfRange(
                          "Cannot find %s %d, please check visible devices.",
                          device_type_, dev_id));

    std::call_once(*init_flags_[dev_id], [this, dev_id] {
769
      phi::DeviceManager::SetDevice(device_type_, dev_id);
770 771 772 773 774
      platform::CustomPlace place(device_type_, dev_id);

      allocators_[dev_id].reset(new BuddyAllocator(
          std::unique_ptr<detail::SystemAllocator>(
              new detail::CustomAllocator(device_type_, dev_id)),
775 776 777
          phi::DeviceManager::GetMinChunkSize(place),
          phi::DeviceManager::GetMaxChunkSize(place),
          phi::DeviceManager::GetExtraPaddingSize(place), device_type_));
778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810
    });

    return allocators_[dev_id].get();
  }

 private:
  std::string device_type_;
  std::unordered_map<size_t, std::unique_ptr<std::once_flag>> init_flags_;
  std::unordered_map<size_t, std::unique_ptr<BuddyAllocator>> allocators_;
};

// Resolves the buddy allocator for a CustomPlace; throws InvalidArgument
// for any other place kind.
BuddyAllocator *GetBuddyAllocator(const platform::Place &place) {
  VLOG(10) << "GetBuddyAllocator place = " << place;
  if (platform::is_custom_place(place)) {
    return BuddyAllocatorList::Instance(
               platform::PlaceHelper::GetDeviceType(place))
        ->Get(platform::PlaceHelper::GetDeviceId(place));
  } else {
    PADDLE_THROW(
        platform::errors::InvalidArgument("place must be CustomPlace"));
  }
}
#endif

// Allocates `size` bytes from the custom device's buddy allocator; throws
// ResourceExhausted with a usage report when the pool cannot satisfy it.
template <>
void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
                                   size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  auto *buddy_allocator = GetBuddyAllocator(place);
  auto *ptr = buddy_allocator->Alloc(size);

  if (ptr == nullptr) {
    // Switch to the failing device before querying its memory stats.
    phi::DeviceGuard guard(place);
    size_t avail, total;
    phi::DeviceManager::MemoryStats(place, &total, &avail);
    PADDLE_THROW(platform::errors::ResourceExhausted(
        "Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
        "%s. ",
        string::HumanReadableSize(size), place.GetDeviceType(), place.device,
        string::HumanReadableSize(avail), string::HumanReadableSize(total),
        string::HumanReadableSize(total - avail)));
  } else {
    if (FLAGS_init_allocated_mem) {
      // Poison the block so uninitialized reads are detectable in tests.
      phi::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, size);
    }
  }
  VLOG(10) << "  pointer=" << ptr;
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CustomPlace' is not supported in CPU only device."));
#endif
}

// Returns a block previously obtained from Alloc<CustomPlace> to the
// device's buddy allocator. `size` is unused here; the allocator tracks
// block sizes internally.
template <>
void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p,
                                 size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  auto *allocator = GetBuddyAllocator(place);
  allocator->Free(p);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CustomPlace' is not supported in CPU only device."));
#endif
}

// Hands idle chunks held by the device's buddy allocator back to the system
// and returns the number of bytes released.
template <>
uint64_t Release<platform::CustomPlace>(const platform::CustomPlace &place) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  auto *allocator = GetBuddyAllocator(place);
  return allocator->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CustomPlace' is not supported in CPU only device."));
#endif
}

// Reports the number of bytes currently handed out by the device's buddy
// allocator.
template <>
size_t Used<platform::CustomPlace>(const platform::CustomPlace &place) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  auto *allocator = GetBuddyAllocator(place);
  return allocator->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CustomPlace' is not supported in CPU only device."));
#endif
}

Y
Yu Yang 已提交
865 866 867 868 869 870 871 872 873 874 875 876 877
struct AllocVisitor : public boost::static_visitor<void *> {
  inline explicit AllocVisitor(size_t size) : size_(size) {}

  template <typename Place>
  inline void *operator()(const Place &place) const {
    return Alloc<Place>(place, size_);
  }

 private:
  size_t size_;
};

// Visitor that forwards Free<Place>() to the allocator matching the concrete
// place type stored in a place variant.
struct FreeVisitor : public boost::static_visitor<void> {
  inline explicit FreeVisitor(void *ptr, size_t size)
      : ptr_(ptr), size_(size) {}

  template <typename Place>
  inline void operator()(const Place &place) const {
    Free<Place>(place, ptr_, size_);
  }

 private:
  void *ptr_;    // pointer being released
  size_t size_;  // size recorded for the allocation
};

W
Wilber 已提交
891
struct ReleaseVisitor : public boost::static_visitor<uint64_t> {
892
  template <typename Place>
W
Wilber 已提交
893 894
  inline uint64_t operator()(const Place &place) const {
    return Release<Place>(place);
895 896 897
  }
};

Y
Yu Yang 已提交
898 899 900 901 902
size_t Usage::operator()(const platform::CPUPlace &cpu) const {
  return Used(cpu);
}

// Bytes currently in use on the given GPU; only available when Paddle is
// built with CUDA or HIP support.
size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  return Used(gpu);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPlace' is not supported in CPU only device."));
#endif
}

// Bytes currently in use in CUDA pinned host memory; only available when
// Paddle is built with CUDA or HIP support.
size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  return Used(cuda_pinned);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPinnedPlace' is not supported in CPU only device."));
#endif
}
}  // namespace legacy

namespace allocation {
922

923
phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
924
  void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
C
chengduo 已提交
925 926 927 928
  auto *tmp_alloc = new Allocation(ptr, size, place_);
  platform::MemEvenRecorder::Instance().PushMemRecord(
      static_cast<void *>(tmp_alloc), place_, size);
  return tmp_alloc;
Y
Yu Yang 已提交
929 930
}

931
void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
932 933 934
  paddle::platform::VisitPlace(
      allocation->place(),
      legacy::FreeVisitor(allocation->ptr(), allocation->size()));
C
chengduo 已提交
935 936
  platform::MemEvenRecorder::Instance().PopMemRecord(
      static_cast<void *>(allocation), place_);
Y
Yu Yang 已提交
937 938
  delete allocation;
}
939

W
Wilber 已提交
940
uint64_t NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) {
941
  return paddle::platform::VisitPlace(place, legacy::ReleaseVisitor());
942 943
}

Y
Yu Yang 已提交
944 945 946
}  // namespace allocation
}  // namespace memory
}  // namespace paddle