// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator_facade.h"

#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/stat_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/macros.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <shared_mutex>

#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#endif

#if CUDA_VERSION >= 10020
#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif
#endif

#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif

#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#endif
#include "paddle/fluid/platform/flags.h"

PADDLE_DEFINE_EXPORTED_int64(
    gpu_allocator_retry_time,
    10000,
    "The retry time (milliseconds) when the allocator fails "
    "to allocate memory. No retry if this value is not greater than 0");

PADDLE_DEFINE_EXPORTED_bool(
    use_system_allocator,
    false,
    "Whether to use the system allocator to allocate CPU and GPU memory. "
    "Only used for unittests.");

PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth,
                            false,
                            "Use VirtualMemoryAutoGrowthBestFitAllocator.");

// NOTE(Ruibiao): This flag is only kept for compatibility with the old
// single-stream CUDA allocator. It will be removed after
// StreamSafeCUDAAllocator has been fully tested.
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator,
                            true,
                            "Enable StreamSafeCUDAAllocator");

PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory,
                            false,
                            "Whether to use CUDAManagedAllocator to allocate "
                            "managed memory, only available for auto_growth "
                            "strategy");

PHI_DECLARE_string(allocator_strategy);
PHI_DECLARE_uint64(auto_growth_chunk_size_in_mb);

namespace paddle {
namespace memory {
namespace allocation {

#ifdef PADDLE_WITH_CUDA
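// Decorates a memory-pool allocator while a CUDA Graph is being captured.
// Each PrivateAllocation it returns holds the underlying allocation together
// with a shared reference to this allocator, so the wrapped pool stays alive
// as long as any allocation made during capture is still alive.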
class CUDAGraphAllocator
    : public Allocator,
      public std::enable_shared_from_this<CUDAGraphAllocator> {
 private:
  class PrivateAllocation : public Allocation {
   public:
    PrivateAllocation(CUDAGraphAllocator* allocator,
                      DecoratedAllocationPtr underlying_allocation)
        : Allocation(underlying_allocation->ptr(),
                     underlying_allocation->base_ptr(),
                     underlying_allocation->size(),
                     underlying_allocation->place()),
          allocator_(allocator->shared_from_this()),
          underlying_allocation_(std::move(underlying_allocation)) {}

   private:
    std::shared_ptr<Allocator> allocator_;
    DecoratedAllocationPtr underlying_allocation_;
  };

  explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
      : underlying_allocator_(allocator) {}

 public:
  ~CUDAGraphAllocator() { VLOG(10) << "CUDAGraphAllocator destructed"; }

  static std::shared_ptr<Allocator> Create(
      const std::shared_ptr<Allocator>& allocator) {
    return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
  }

 protected:
  phi::Allocation* AllocateImpl(size_t size) {
    VLOG(10) << "Allocate " << size << " for CUDA Graph";
    return new PrivateAllocation(this,
                                 static_unique_ptr_cast<Allocation>(
                                     underlying_allocator_->Allocate(size)));
  }

  void FreeImpl(phi::Allocation* allocation) {
    VLOG(10) << "delete for CUDA Graph";
    delete allocation;
  }

 private:
  std::shared_ptr<Allocator> underlying_allocator_;
};
#endif

static bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
  return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing());
#else
  return false;
#endif
}

class AllocatorFacadePrivate {
 public:
  using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  using CUDAAllocatorMap =
      std::map<platform::CUDAPlace,
               std::map<gpuStream_t, std::shared_ptr<Allocator>>>;
#endif

  explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
    strategy_ = GetAllocatorStrategy();
    is_stream_safe_cuda_allocator_used_ = false;

    switch (strategy_) {
      case AllocatorStrategy::kNaiveBestFit: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
        for (const auto& dev_type : device_types) {
          for (size_t dev_id = 0;
               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
               ++dev_id) {
            InitNaiveBestFitCustomDeviceAllocator(
                platform::CustomPlace(dev_type, dev_id));
          }
        }
#endif
        break;
      }

      case AllocatorStrategy::kAutoGrowth: {
        InitNaiveBestFitCPUAllocator();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        allow_free_idle_chunk_ = allow_free_idle_chunk;
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
                                      allow_free_idle_chunk_);
        }

        // NOTE(Ruibiao): For the GPU multi-stream case without CUDA graph
        // capturing, the 'allocators_' map (place -> Allocator) holds the
        // StreamSafeCUDAAllocator related to the default stream (i.e., the
        // stream obtained directly from the DeviceContext), while the
        // 'cuda_allocators_' map (place -> map(stream -> Allocator)) holds the
        // StreamSafeCUDAAllocators related to non-default streams (i.e., the
        // streams users pass in). The default-stream allocator is built when
        // AllocatorFacadePrivate is constructed, while non-default-stream
        // allocators are built lazily in GetAllocator with
        // 'create_if_not_found = true'. The default stream is treated
        // specially for performance: since most Alloc calls in an application
        // target the default stream, handling it separately avoids the
        // overhead of querying the default stream and taking the read-write
        // lock.
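        // Illustrative only (nothing is executed here): for a CUDAPlace `p`
        // and a user-created stream `s`, the two lookup paths described above
        // are
        //   GetAllocator(p, /*size=*/1);                       // allocators_
        //   GetAllocator(p, s, /*create_if_not_found=*/true);  // cuda_allocators_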
        if (FLAGS_use_stream_safe_cuda_allocator) {
          if (LIKELY(!IsCUDAGraphCapturing())) {
            WrapStreamSafeCUDAAllocatorForDefault();
          }
          is_stream_safe_cuda_allocator_used_ = true;
        }

        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
        for (const auto& dev_type : device_types) {
          for (size_t dev_id = 0;
               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
               ++dev_id) {
            InitAutoGrowthCustomDeviceAllocator(
                platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk);
          }
        }
#endif
        break;
      }

      case AllocatorStrategy::kThreadLocal: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
        break;
      }

      default: {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
      }
    }
    InitZeroSizeAllocators();
    InitSystemAllocators();

    if (FLAGS_gpu_allocator_retry_time > 0) {
      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
    }

    WrapStatAllocator();

    CheckAllocThreadSafe();

#ifdef PADDLE_WITH_CUDA
    // No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator
    if (!is_stream_safe_cuda_allocator_used_ &&
        UNLIKELY(IsCUDAGraphCapturing())) {
      WrapCUDAGraphAllocator();
    }
#endif
  }

  inline const std::shared_ptr<Allocator>& GetAllocator(
      const platform::Place& place, size_t size) {
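    // A zero size selects zero_size_allocators_; when
    // FLAGS_use_system_allocator is enabled (unittests only),
    // system_allocators_ is consulted instead of the regular per-place map.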
    VLOG(6) << "GetAllocator"
            << " " << place << " " << size;
    const auto& allocators =
        (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                          : GetAllocatorMap())
                  : zero_size_allocators_);
    auto iter = allocators.find(place);
    PADDLE_ENFORCE_NE(iter,
                      allocators.end(),
                      platform::errors::NotFound(
                          "No allocator found for the place, %s", place));
    return iter->second;
  }

  void* GetBasePtr(const std::shared_ptr<phi::Allocation>& allocation) {
    return static_cast<Allocation*>(allocation.get())->base_ptr();
  }

  bool IsStreamSafeCUDAAllocatorUsed() {
    return is_stream_safe_cuda_allocator_used_ &&
           LIKELY(FLAGS_use_system_allocator == false);
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) {
    auto it = cuda_allocators_.find(place);
    if (it == cuda_allocators_.end()) {
      return false;
    }
    const std::map<gpuStream_t, std::shared_ptr<Allocator>>& allocator_map =
        it->second;
    return allocator_map.find(stream) != allocator_map.end();
  }

  const std::shared_ptr<Allocator>& GetAllocator(
      const platform::CUDAPlace& place,
      gpuStream_t stream,
      bool create_if_not_found = false) {
    if (LIKELY(!IsCUDAGraphCapturing())) {
      if (stream == GetDefaultStream(place)) {
        VLOG(7) << "Get Allocator by passing in a default stream";
        return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
      }
    }

    /* shared_lock_guard */ {
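      // Fast path: take a shared (reader) lock and return the allocator if it
      // already exists for this (place, stream) pair.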
      std::shared_lock<std::shared_timed_mutex> lock_guard(
          cuda_allocator_mutex_);
      if (LIKELY(HasCUDAAllocator(place, stream))) {
        return cuda_allocators_[place][stream];
      } else {
        PADDLE_ENFORCE_NE(create_if_not_found,
                          false,
                          platform::errors::NotFound(
                              "No allocator found for stream %s in place %s "
                              "with create_if_not_found = false",
                              stream,
                              place));
      }
    }

    /* unique_lock_guard */ {
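      // Slow path: take an exclusive lock and lazily create the allocator for
      // this stream via InitStreamSafeCUDAAllocator.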
      std::unique_lock<std::shared_timed_mutex> lock_guard(
          cuda_allocator_mutex_);
      InitStreamSafeCUDAAllocator(place, stream);
      return cuda_allocators_[place][stream];
    }
  }

  const std::shared_ptr<StreamSafeCUDAAllocator>
  GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
    const auto iter = default_stream_safe_cuda_allocators_.find(place);
    PADDLE_ENFORCE_NE(
        iter,
        default_stream_safe_cuda_allocators_.end(),
        platform::errors::NotFound(
            "No StreamSafeCUDAAllocator found for the place, %s", place));
    return iter->second;
  }

  gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) const {
    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
        GetDefaultStreamSafeCUDAAllocator(place);
    return allocator->GetDefaultStream();
  }

  void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream) {
    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
        GetDefaultStreamSafeCUDAAllocator(place);

    PADDLE_ENFORCE_EQ(
        allocator->GetDefaultStream(),
        nullptr,
        platform::errors::Unavailable(
            "The default stream for StreamSafeCUDAAllocator(%p) in %s has "
            "already been set to %p; it is not allowed to change it to %p.",
            allocator.get(),
            place,
            allocator->GetDefaultStream(),
            stream));

    allocator->SetDefaultStream(stream);
    VLOG(8) << "Set default stream to " << stream
            << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
            << place;
  }

  void RecordStream(std::shared_ptr<phi::Allocation> allocation,
                    gpuStream_t stream) {
    std::shared_ptr<StreamSafeCUDAAllocation> stream_safe_cuda_allocation =
        std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
    if (stream_safe_cuda_allocation != nullptr) {
      stream_safe_cuda_allocation->RecordStream(stream);
    } else {
      VLOG(6) << "RecordStream for a non-StreamSafeCUDAAllocation";
    }
  }

  gpuStream_t GetStream(
      const std::shared_ptr<phi::Allocation>& allocation) const {
    const std::shared_ptr<StreamSafeCUDAAllocation>
        stream_safe_cuda_allocation =
            std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
    if (stream_safe_cuda_allocation != nullptr) {
      return stream_safe_cuda_allocation->GetOwningStream();
    }

    VLOG(6) << "GetStream for a non-StreamSafeCUDAAllocation";
    return static_cast<phi::GPUContext*>(
               platform::DeviceContextPool::Instance().Get(allocation->place()))
        ->stream();
  }
#endif

 private:
  class ZeroSizeAllocator : public Allocator {
   public:
    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
    bool IsAllocThreadSafe() const override { return true; }

   protected:
    phi::Allocation* AllocateImpl(size_t size) override {
      return new Allocation(nullptr, 0, place_);
    }
    void FreeImpl(phi::Allocation* allocation) override { delete allocation; }

   private:
    platform::Place place_;
  };

  const AllocatorMap& GetAllocatorMap() { return allocators_; }

  void InitNaiveBestFitCPUAllocator() {
#if defined(__APPLE__) && defined(__arm64__)
    // NOTE(wuweilong): It is more efficient to use CPUAllocator directly,
    // but doing so causes problems on macOS with Apple Silicon (M1) chips,
    // so we use NaiveBestFitAllocator instead.
    allocators_[platform::CPUPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
#else
    allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#endif
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void InitNaiveBestFitCUDAPinnedAllocator() {
    allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
  }

  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  // Create a new CUDAAllocator or CUDAManagedAllocator for the given device
  std::shared_ptr<Allocator> CreateCUDAAllocator(platform::CUDAPlace p) {
    if (FLAGS_use_cuda_managed_memory) {
      PADDLE_ENFORCE_EQ(
          strategy_,
          AllocatorStrategy::kAutoGrowth,
          platform::errors::InvalidArgument(
              "CUDA managed memory is only implemented for the auto_growth "
              "strategy; the %s strategy is not supported.\n"
              "Please use the auto_growth strategy with the command `export "
              "FLAGS_allocator_strategy=\"auto_growth\"`, or disable managed "
              "memory with the command `export "
              "FLAGS_use_cuda_managed_memory=false`",
              FLAGS_allocator_strategy));

      if (!platform::IsGPUManagedMemorySupported(p.device)) {
        PADDLE_THROW(platform::errors::Unavailable(
            "Failed to create CUDAManagedAllocator on GPU %d.\n\n"
            "You have enabled CUDA managed memory, but the GPU device does not "
            "support allocating managed memory.\n"
            "If you don't actually need to use managed memory, please disable "
            "it with the command `export FLAGS_use_cuda_managed_memory=false`.\n"
            "Otherwise, you must use a GPU device that supports managed "
            "memory.",
            p.device));
      }
      return std::make_shared<CUDAManagedAllocator>(p);
    }
    return std::make_shared<CUDAAllocator>(p);
  }

  void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    PADDLE_ENFORCE_EQ(
        strategy_,
        AllocatorStrategy::kAutoGrowth,
        platform::errors::Unimplemented(
            "StreamSafeCUDAAllocator only supports the auto_growth strategy; "
            "allocator strategy %d is unsupported for multi-stream",
            static_cast<int>(strategy_)));
    if (LIKELY(!HasCUDAAllocator(p, stream))) {
      VLOG(8) << "Init CUDA allocator for stream " << stream << " in place "
              << p;
      InitAutoGrowthCUDAAllocator(p, stream);
      WrapStreamSafeCUDAAllocator(p, stream);
      WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
      WrapStatAllocator(p, stream);
    }
  }

  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
    VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
            << FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
    auto cuda_allocator = CreateCUDAAllocator(p);
    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator,
        platform::GpuMinChunkSize(),
        chunk_size,
        allow_free_idle_chunk_);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
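    // Query whether the device supports CUDA virtual address management; if it
    // does and FLAGS_use_virtual_memory_auto_growth is set, build a
    // VirtualMemoryAutoGrowthBestFitAllocator, otherwise fall back to the
    // regular AutoGrowthBestFitAllocator below.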
    CUdevice device;
    int val;
    try {
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGetAttribute(
              &val,
              CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
              device));
    } catch (...) {
      val = 0;
    }

    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
      cuda_allocators_[p][stream] =
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
      auto cuda_allocator = CreateCUDAAllocator(p);
      cuda_allocators_[p][stream] =
          std::make_shared<AutoGrowthBestFitAllocator>(
              cuda_allocator,
              platform::GpuMinChunkSize(),
              /*chunk_size=*/chunk_size,
              allow_free_idle_chunk_);
    }
#else
    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: since the CUDA runtime cannot be forked, calling any CUDA API in a
    // forked child process may return CUDA error 3, i.e.,
    // cudaErrorInitializationError, even though the CUDAAllocator is only
    // initialized and not really used. The try-catch block handles the case
    // where GetDeviceProperties() may fail in multi-process scenarios (for
    // example, in a DataLoader with num_workers > 0).
    try {
      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
      need_addr_align = prop.textureAlignment < alignment;
      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
              << prop.textureAlignment
              << ", set need_addr_align=" << need_addr_align;
    } catch (...) {
      need_addr_align = true;
      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
    }
    // The address returned is aligned already,
    // ref:
    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
    std::shared_ptr<Allocator> underlying_allocator{nullptr};
    if (need_addr_align) {
      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
      underlying_allocator =
          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
    } else {
      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
      underlying_allocator = cuda_allocator;
    }

    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        underlying_allocator, alignment, chunk_size, allow_free_idle_chunk_);
#endif
#endif
  }

  // NOTE(Ruibiao): Old single-stream version, will be removed later
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                   bool allow_free_idle_chunk) {
    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
    VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
            << FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
    auto cuda_allocator = CreateCUDAAllocator(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator,
        platform::GpuMinChunkSize(),
        /*chunk_size=*/chunk_size,
        allow_free_idle_chunk);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
    CUdevice device;
    int val;
    try {
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGetAttribute(
              &val,
              CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
              device));
    } catch (...) {
      val = 0;
    }

    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
      allocators_[p] =
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
      auto cuda_allocator = CreateCUDAAllocator(p);
      allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
          cuda_allocator,
          platform::GpuMinChunkSize(),
          /*chunk_size=*/chunk_size,
          allow_free_idle_chunk);
    }

#else
    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: since the CUDA runtime cannot be forked, calling any CUDA API in a
    // forked child process may return CUDA error 3, i.e.,
    // cudaErrorInitializationError, even though the CUDAAllocator is only
    // initialized and not really used. The try-catch block handles the case
    // where GetDeviceProperties() may fail in multi-process scenarios (for
    // example, in a DataLoader with num_workers > 0).
    try {
      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
      need_addr_align = prop.textureAlignment < alignment;
      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
              << prop.textureAlignment
              << ", set need_addr_align=" << need_addr_align;
    } catch (...) {
      need_addr_align = true;
      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
    }
    // The address returned is aligned already,
    // ref:
    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
    std::shared_ptr<Allocator> underlying_allocator{nullptr};
    if (need_addr_align) {
      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
      underlying_allocator =
          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
    } else {
      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
      underlying_allocator = cuda_allocator;
    }
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        underlying_allocator, alignment, chunk_size, allow_free_idle_chunk);
#endif
#endif
  }

  void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
  }

  void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
    allocator = std::make_shared<StreamSafeCUDAAllocator>(
        allocator,
        p,
        stream,
        /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
  }

  void WrapStreamSafeCUDAAllocatorForDefault() {
    for (auto& pair : allocators_) {
      auto& place = pair.first;
      if (platform::is_gpu_place(place)) {
        std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
            std::make_shared<StreamSafeCUDAAllocator>(
                pair.second,
                place,
                /* default_stream = */ nullptr,
                /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
        pair.second = allocator;

        // NOTE(Ruibiao): A tricky implementation that gives
        // StreamSafeCUDAAllocator the ability to interact with the outside
        // world, i.e., to have its default stream changed from outside.
        default_stream_safe_cuda_allocators_[place] = allocator;
        VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
                << ", allocator address = " << pair.second.get();
      }
    }
  }

  void WrapCUDARetryAllocator(platform::CUDAPlace p,
                              gpuStream_t stream,
                              size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time,
        0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
    allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
  }

  void WrapStatAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
    allocator = std::make_shared<StatAllocator>(allocator);
  }

#ifdef PADDLE_WITH_CUDA
  void WrapCUDAGraphAllocator() {
    for (auto& item : allocators_) {
      auto& allocator = item.second;
      allocator = CUDAGraphAllocator::Create(allocator);
    }
  }
#endif

  static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
    for (auto& place_pair : allocators) {
      for (auto& stream_pair : place_pair.second) {
        PADDLE_ENFORCE_EQ(stream_pair.second->IsAllocThreadSafe(),
                          true,
                          platform::errors::InvalidArgument(
                              "Public allocators must be thread safe"));
      }
    }
  }
#endif

#ifdef PADDLE_WITH_XPU
  void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_IPU
  void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
  void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
                                           bool allow_free_idle_chunk) {
    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
    auto custom_allocator =
        std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        custom_allocator,
        phi::DeviceManager::GetMinChunkSize(p),
        /*chunk_size=*/chunk_size,
        allow_free_idle_chunk);
  }
#endif

  void InitSystemAllocators() {
    if (!system_allocators_.empty()) return;
    system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::XPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#ifdef PADDLE_WITH_IPU
    int device_count = platform::GetIPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::IPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    system_allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<CPUPinnedAllocator>();
    int device_count = platform::GetGPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::CUDAPlace p(i);
      system_allocators_[p] = CreateCUDAAllocator(p);
    }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
    for (const auto& dev_type : device_types) {
      for (size_t dev_id = 0;
           dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
           dev_id++) {
        platform::CustomPlace p(dev_type, dev_id);
        system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
      }
    }
#endif
  }

  void InitZeroSizeAllocators() {
    if (!zero_size_allocators_.empty()) return;
    std::vector<platform::Place> places;
    places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    int device_count = platform::GetGPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::CUDAPlace(dev_id));
    }
    places.emplace_back(platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::XPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_IPU
    int device_count = platform::GetIPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::IPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
    for (const auto& dev_type : device_types) {
      for (size_t dev_id = 0;
           dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
           dev_id++) {
        places.emplace_back(platform::CustomPlace(dev_type, dev_id));
      }
    }
#endif

    for (auto& p : places) {
      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
    }
  }

  static void CheckAllocThreadSafe(const AllocatorMap& allocators) {
    for (auto& pair : allocators) {
      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(),
                        true,
                        platform::errors::InvalidArgument(
                            "Public allocators must be thread safe"));
    }
  }

  void CheckAllocThreadSafe() const {
    CheckAllocThreadSafe(allocators_);
    CheckAllocThreadSafe(zero_size_allocators_);
    CheckAllocThreadSafe(system_allocators_);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (is_stream_safe_cuda_allocator_used_) {
      CheckCUDAAllocThreadSafe(cuda_allocators_);
    }
#endif
  }

  void WrapCUDARetryAllocator(size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time,
        0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    for (auto& pair : allocators_) {
      if (platform::is_gpu_place(pair.first)) {
        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
      }
    }
  }

  void WrapStatAllocator() {
    for (auto& pair : allocators_) {
      // Memory stats are currently only supported for CPU and GPU places.
      const platform::Place& place = pair.first;
      if (platform::is_cpu_place(place) ||
          platform::is_cuda_pinned_place(place) ||
          platform::is_gpu_place(place)) {
        pair.second = std::make_shared<StatAllocator>(pair.second);
      }
    }
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // A standalone CUDA allocator map used to support multi-stream GC in the
  // new executor.
  std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
      default_stream_safe_cuda_allocators_;
  CUDAAllocatorMap cuda_allocators_;
  std::shared_timed_mutex cuda_allocator_mutex_;
#endif
  AllocatorStrategy strategy_;
  AllocatorMap allocators_;
  static AllocatorMap zero_size_allocators_;
  static AllocatorMap system_allocators_;
  bool allow_free_idle_chunk_;
  bool is_stream_safe_cuda_allocator_used_;
};
AllocatorFacadePrivate::AllocatorMap
    AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;

// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// Deleting m_ may cause a core dump when the Python destructor conflicts with
// the C++ one.
AllocatorFacade::~AllocatorFacade() {}

AllocatorFacade& AllocatorFacade::Instance() {
  static AllocatorFacade* instance = new AllocatorFacade;
  return *instance;
}

AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
#ifdef PADDLE_WITH_CUDA
  if (UNLIKELY(IsCUDAGraphCapturing())) {
    auto id = phi::backends::gpu::CUDAGraph::CapturingPoolID();
    auto iter = cuda_graph_map_.find(id);
    PADDLE_ENFORCE_NE(
        iter,
        cuda_graph_map_.end(),
        platform::errors::PermissionDenied(
            "No memory pool is prepared for CUDA Graph capturing."));
    VLOG(10) << "Choose CUDA Graph memory pool";
    return iter->second.get();
  }
#endif
  return m_;
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return GetPrivate()->GetAllocator(
      place, /* A non-zero num to choose allocator_ */ 1);
}

void* AllocatorFacade::GetBasePtr(
    const std::shared_ptr<phi::Allocation>& allocation) {
  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(),
                    AllocatorStrategy::kAutoGrowth,
                    paddle::platform::errors::Unimplemented(
                        "GetBasePtr() is only implemented for the auto_growth "
                        "strategy, not for allocator strategy: %d",
                        static_cast<int>(GetAllocatorStrategy())));
  PADDLE_ENFORCE_EQ(platform::is_gpu_place(allocation->place()),
                    true,
                    paddle::platform::errors::Unimplemented(
                        "GetBasePtr() is only implemented for CUDAPlace(), not "
                        "supported for place: %s",
                        allocation->place()));
  return GetPrivate()->GetBasePtr(allocation);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
    const platform::Place& place) {
  return GetPrivate()->GetAllocator(place, /* zero size */ 0);
}

std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size) {
  return std::shared_ptr<phi::Allocation>(Alloc(place, size));
}

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                     size_t size) {
  return GetPrivate()->GetAllocator(place, size)->Allocate(size);
}

uint64_t AllocatorFacade::Release(const platform::Place& place) {
  return GetPrivate()
      ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
      ->Release(place);
}

std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size, const phi::Stream& stream) {
  return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
}

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                     size_t size,
                                     const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  AllocatorFacadePrivate* m = GetPrivate();
  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
    return Alloc(place, size);
  }

  platform::CUDAPlace p(place.GetDeviceId());
  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
    gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
    return m->GetAllocator(p, s, /* create_if_not_found = */ true)
        ->Allocate(size);
  } else {
    return m->GetAllocator(p, size)->Allocate(size);
  }
#elif defined(PADDLE_WITH_XPU)
  return GetAllocator(place)->Allocate(size);
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet(
      "Not compiled with GPU or XPU or NPU."));
#endif
}
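
// Illustrative usage sketch (comment only, not compiled): allocating through
// the facade on a user-created stream and later releasing that stream's cached
// memory. `place` (a CUDAPlace), `stream` (a gpuStream_t), and `size` are
// assumed to be set up by the caller.
//
//   auto allocation = AllocatorFacade::Instance().AllocShared(
//       place, size, phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
//   ...
//   AllocatorFacade::Instance().Release(place, stream);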

1047 1048 1049
bool AllocatorFacade::InSameStream(
    const std::shared_ptr<phi::Allocation>& allocation,
    const phi::Stream& stream) {
1050
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
1051 1052 1053 1054
  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
  return s == GetStream(allocation);
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
1055
#endif
1056 1057
}

1058 1059 1060 1061
bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() {
  return GetPrivate()->IsStreamSafeCUDAAllocatorUsed();
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                  gpuStream_t stream) {
  AllocatorFacadePrivate* m = GetPrivate();
  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
    return Release(place);
  }

  return m->GetAllocator(place, stream)->Release(place);
}

void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
                                   gpuStream_t stream) {
  GetPrivate()->RecordStream(allocation, stream);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place, gpuStream_t stream) {
  AllocatorFacadePrivate* m = GetPrivate();

  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
    return GetAllocator(place);
  }

  if (platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) {
    return m->GetAllocator(place,
                           stream,
                           /*create_if_not_found=*/true);
  }
  return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

gpuStream_t AllocatorFacade::GetStream(
    const std::shared_ptr<phi::Allocation>& allocation) const {
  return GetPrivate()->GetStream(allocation);
}

void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
                                       gpuStream_t stream) {
  if (m_->IsStreamSafeCUDAAllocatorUsed()) {
    m_->SetDefaultStream(place, stream);
  }
}

#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(int64_t id) {
  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(),
                    AllocatorStrategy::kAutoGrowth,
                    platform::errors::InvalidArgument(
                        "CUDA Graph is only supported when the "
                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
                        "FLAGS_allocator_strategy=\"%s\"",
                        FLAGS_allocator_strategy));
  auto& allocator = cuda_graph_map_[id];
  auto& ref_cnt = cuda_graph_ref_cnt_[id];
  if (allocator.get() == nullptr) {
    allocator.reset(
        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
    VLOG(10) << "Create memory pool for CUDA Graph with memory ID " << id;
  } else {
    VLOG(10) << "Use created memory pool for CUDA Graph with memory ID " << id;
  }
  ++ref_cnt;
}

void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
  auto ref_cnt_iter = cuda_graph_ref_cnt_.find(id);
  PADDLE_ENFORCE_NE(ref_cnt_iter,
                    cuda_graph_ref_cnt_.end(),
                    platform::errors::InvalidArgument(
                        "Cannot find CUDA Graph with memory ID = %d", id));
  auto& ref_cnt = ref_cnt_iter->second;
  --ref_cnt;
  if (ref_cnt == 0) {
    cuda_graph_map_.erase(id);
    cuda_graph_ref_cnt_.erase(ref_cnt_iter);
    VLOG(10) << "Remove memory pool of CUDA Graph with memory ID " << id;
  } else {
    VLOG(10) << "Decrease memory pool ID " << id << " reference count to be "
             << ref_cnt;
  }
}
#endif
#endif

UNUSED static std::shared_ptr<NaiveBestFitAllocator> unused_obj =
    std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());

}  // namespace allocation
}  // namespace memory
}  // namespace paddle