allocator_facade.cc
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator_facade.h"

#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/stat_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/macros.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <shared_mutex>

#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#endif

#if CUDA_VERSION >= 10020
#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif
#endif

#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif

#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif

#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#endif

PADDLE_DEFINE_EXPORTED_int64(
    gpu_allocator_retry_time,
    10000,
    "The retry time (milliseconds) when allocator fails "
    "to allocate memory. No retry if this value is not greater than 0");

PADDLE_DEFINE_EXPORTED_bool(
    use_system_allocator,
    false,
    "Whether to use system allocator to allocate CPU and GPU memory. "
    "Only used for unittests.");

PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth,
                            false,
                            "Use VirtualMemoryAutoGrowthBestFitAllocator.");

// NOTE(Ruibiao): This flag exists only for compatibility with
// the old single-stream CUDA allocator. It will be removed
// after StreamSafeCUDAAllocator has been fully tested.
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator,
                            true,
                            "Enable StreamSafeCUDAAllocator");

PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory,
                            false,
                            "Whether to use CUDAManagedAllocator to allocate "
                            "managed memory, only available for auto_growth "
                            "strategy");

DECLARE_string(allocator_strategy);
DECLARE_uint64(auto_growth_chunk_size_in_mb);
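// Note: these flags are typically set through environment variables of the
// same name, e.g. `export FLAGS_allocator_strategy="auto_growth"` or
// `export FLAGS_use_cuda_managed_memory=false`, as referenced by the error
// messages later in this file.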

namespace paddle {
namespace memory {
namespace allocation {

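// CUDAGraphAllocator (CUDA only): a thin decorator used while capturing a
// CUDA Graph. Each allocation it hands out keeps a shared reference to this
// allocator (and thus to the underlying allocator), so the memory pool backing
// the captured graph stays alive for as long as its allocations do.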
#ifdef PADDLE_WITH_CUDA
class CUDAGraphAllocator
    : public Allocator,
      public std::enable_shared_from_this<CUDAGraphAllocator> {
 private:
  class PrivateAllocation : public Allocation {
   public:
    PrivateAllocation(CUDAGraphAllocator* allocator,
                      DecoratedAllocationPtr underlying_allocation)
        : Allocation(underlying_allocation->ptr(),
                     underlying_allocation->base_ptr(),
                     underlying_allocation->size(),
                     underlying_allocation->place()),
          allocator_(allocator->shared_from_this()),
          underlying_allocation_(std::move(underlying_allocation)) {}

   private:
    std::shared_ptr<Allocator> allocator_;
    DecoratedAllocationPtr underlying_allocation_;
  };

  explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
      : underlying_allocator_(allocator) {}

 public:
  ~CUDAGraphAllocator() { VLOG(10) << "CUDAGraphAllocator destructed"; }

  static std::shared_ptr<Allocator> Create(
      const std::shared_ptr<Allocator>& allocator) {
    return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
  }

 protected:
  phi::Allocation* AllocateImpl(size_t size) {
    VLOG(10) << "Allocate " << size << " for CUDA Graph";
    return new PrivateAllocation(this,
                                 static_unique_ptr_cast<Allocation>(
                                     underlying_allocator_->Allocate(size)));
  }

  void FreeImpl(phi::Allocation* allocation) {
    VLOG(10) << "delete for CUDA Graph";
    delete allocation;
  }

 private:
  std::shared_ptr<Allocator> underlying_allocator_;
};
#endif

static bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
  return UNLIKELY(phi::backends::gpu::CUDAGraph::IsThisThreadCapturing());
#else
  return false;
#endif
}

class AllocatorFacadePrivate {
 public:
  using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  using CUDAAllocatorMap =
      std::map<platform::CUDAPlace,
               std::map<gpuStream_t, std::shared_ptr<Allocator>>>;
#endif

  explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
    strategy_ = GetAllocatorStrategy();
    is_stream_safe_cuda_allocator_used_ = false;

    switch (strategy_) {
      case AllocatorStrategy::kNaiveBestFit: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_MLU
        for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
          InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
        for (const auto& dev_type : device_types) {
          for (size_t dev_id = 0;
               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
               ++dev_id) {
            InitNaiveBestFitCustomDeviceAllocator(
                platform::CustomPlace(dev_type, dev_id));
          }
        }
#endif
        break;
      }

      case AllocatorStrategy::kAutoGrowth: {
        InitNaiveBestFitCPUAllocator();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        allow_free_idle_chunk_ = allow_free_idle_chunk;
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
                                      allow_free_idle_chunk_);
        }

        // Note(Ruibiao): For the GPU multi-stream case without CUDA graph
        // capturing, the 'allocators_' map (place -> Allocator) holds the
        // StreamSafeCUDAAllocator related to the default stream (i.e., the
        // stream obtained directly from the DeviceContext), while the
        // 'cuda_allocators_' map (place -> map(stream -> Allocator)) holds the
        // StreamSafeCUDAAllocators related to non-default streams (i.e., the
        // streams users pass in). The default-stream Allocator is built inside
        // AllocatorFacadePrivate, while non-default-stream allocators are
        // built lazily in the GetAllocator function with
        // 'create_if_not_found = true'. The default stream is treated
        // specially for performance reasons: since most Alloc calls in an
        // application target the default stream, handling it separately
        // avoids the overhead of acquiring the default stream and taking the
        // read-write lock.
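        // Roughly, this means that a later call such as
        //   AllocatorFacade::Alloc(place, size)          -> allocators_
        //   AllocatorFacade::Alloc(place, size, stream)  -> cuda_allocators_
        // resolves against the default-stream allocator or the per-stream map,
        // respectively (see the Alloc overloads near the end of this file).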
        if (FLAGS_use_stream_safe_cuda_allocator) {
          if (LIKELY(!IsCUDAGraphCapturing())) {
            WrapStreamSafeCUDAAllocatorForDefault();
          }
          is_stream_safe_cuda_allocator_used_ = true;
        }

        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_MLU
        for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
          InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
        for (const auto& dev_type : device_types) {
          for (size_t dev_id = 0;
               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
               ++dev_id) {
            InitAutoGrowthCustomDeviceAllocator(
                platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk);
          }
        }
#endif
        break;
      }

      case AllocatorStrategy::kThreadLocal: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_MLU
        for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
          InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
        }
#endif
        break;
      }

      default: {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
      }
    }
    InitZeroSizeAllocators();
    InitSystemAllocators();

    if (FLAGS_gpu_allocator_retry_time > 0) {
      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
    }

    WrapStatAllocator();

    CheckAllocThreadSafe();

#ifdef PADDLE_WITH_CUDA
    // No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator
    if (!is_stream_safe_cuda_allocator_used_ &&
        UNLIKELY(IsCUDAGraphCapturing())) {
      WrapCUDAGraphAllocator();
    }
#endif
  }

  inline const std::shared_ptr<Allocator>& GetAllocator(
      const platform::Place& place, size_t size) {
    VLOG(6) << "GetAllocator"
            << " " << place << " " << size;
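    // Allocator selection: zero-sized requests resolve to
    // zero_size_allocators_, requests made with FLAGS_use_system_allocator set
    // resolve to system_allocators_, and everything else goes to the regular
    // per-place allocators_ map.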
    const auto& allocators =
        (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                          : GetAllocatorMap())
                  : zero_size_allocators_);
    auto iter = allocators.find(place);
    PADDLE_ENFORCE_NE(iter,
                      allocators.end(),
                      platform::errors::NotFound(
                          "No allocator found for the place, %s", place));
    return iter->second;
  }

  void* GetBasePtr(const std::shared_ptr<phi::Allocation>& allocation) {
    return static_cast<Allocation*>(allocation.get())->base_ptr();
  }

  bool IsStreamSafeCUDAAllocatorUsed() {
    return is_stream_safe_cuda_allocator_used_ &&
           LIKELY(FLAGS_use_system_allocator == false);
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) {
    auto it = cuda_allocators_.find(place);
    if (it == cuda_allocators_.end()) {
      return false;
    }
    const std::map<gpuStream_t, std::shared_ptr<Allocator>>& allocator_map =
        it->second;
    return allocator_map.find(stream) != allocator_map.end();
  }

  const std::shared_ptr<Allocator>& GetAllocator(
      const platform::CUDAPlace& place,
      gpuStream_t stream,
      bool create_if_not_found = false) {
    if (LIKELY(!IsCUDAGraphCapturing())) {
      if (stream == GetDefaultStream(place)) {
        VLOG(7) << "Get Allocator by passing in a default stream";
        return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
      }
    }

    /* shared_lock_guard */ {
      std::shared_lock<std::shared_timed_mutex> lock_guard(
          cuda_allocator_mutex_);
      if (LIKELY(HasCUDAAllocator(place, stream))) {
        return cuda_allocators_[place][stream];
      } else {
        PADDLE_ENFORCE_NE(create_if_not_found,
                          false,
                          platform::errors::NotFound(
                              "No allocator found for stream %s in place %s "
                              "with create_if_not_found = false",
                              stream,
                              place));
      }
    }

    /* unique_lock_guard */ {
      std::unique_lock<std::shared_timed_mutex> lock_guard(
          cuda_allocator_mutex_);
      InitStreamSafeCUDAAllocator(place, stream);
      return cuda_allocators_[place][stream];
    }
  }

  const std::shared_ptr<StreamSafeCUDAAllocator>
  GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
    const auto iter = default_stream_safe_cuda_allocators_.find(place);
    PADDLE_ENFORCE_NE(
        iter,
        default_stream_safe_cuda_allocators_.end(),
        platform::errors::NotFound(
            "No StreamSafeCUDAAllocator found for the place, %s", place));
    return iter->second;
  }

  gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) const {
    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
        GetDefaultStreamSafeCUDAAllocator(place);
    return allocator->GetDefaultStream();
  }

  void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream) {
    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
        GetDefaultStreamSafeCUDAAllocator(place);

    PADDLE_ENFORCE_EQ(
        allocator->GetDefaultStream(),
        nullptr,
        platform::errors::Unavailable(
            "The default stream for StreamSafeCUDAAllocator(%p) in %s has "
            "already been set to %p; it is not allowed to change it to %p.",
            allocator.get(),
            place,
            allocator->GetDefaultStream(),
            stream));

    allocator->SetDefaultStream(stream);
    VLOG(8) << "Set default stream to " << stream
            << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
            << place;
  }

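  // Marks `allocation` as also being used on `stream`, so that a
  // StreamSafeCUDAAllocation is not actually freed until the work queued on
  // every recorded stream has finished; for other allocation types this is
  // essentially a no-op apart from a log message.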
  void RecordStream(std::shared_ptr<phi::Allocation> allocation,
                    gpuStream_t stream) {
    std::shared_ptr<StreamSafeCUDAAllocation> stream_safe_cuda_allocation =
        std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
    if (stream_safe_cuda_allocation != nullptr) {
      stream_safe_cuda_allocation->RecordStream(stream);
    } else {
      VLOG(6) << "RecordStream for a non-StreamSafeCUDAAllocation";
    }
  }

  gpuStream_t GetStream(
      const std::shared_ptr<phi::Allocation>& allocation) const {
    const std::shared_ptr<StreamSafeCUDAAllocation>
        stream_safe_cuda_allocation =
            std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
    if (stream_safe_cuda_allocation != nullptr) {
      return stream_safe_cuda_allocation->GetOwningStream();
    }

    VLOG(6) << "GetStream for a non-StreamSafeCUDAAllocation";
    return static_cast<phi::GPUContext*>(
               platform::DeviceContextPool::Instance().Get(allocation->place()))
        ->stream();
  }
#endif

 private:
  class ZeroSizeAllocator : public Allocator {
   public:
    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
    bool IsAllocThreadSafe() const override { return true; }

   protected:
    phi::Allocation* AllocateImpl(size_t size) override {
      return new Allocation(nullptr, 0, place_);
    }
    void FreeImpl(phi::Allocation* allocation) override { delete allocation; }

   private:
    platform::Place place_;
  };

  const AllocatorMap& GetAllocatorMap() { return allocators_; }

  void InitNaiveBestFitCPUAllocator() {
#if defined(__APPLE__) && defined(__arm64__)
    // NOTE(wuweilong): It is more efficient to use CPUAllocator directly,
    // but it causes some problems on macOS with the M1 chip, so we use
    // NaiveBestFitAllocator instead.
    allocators_[platform::CPUPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
#else
    allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#endif
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void InitNaiveBestFitCUDAPinnedAllocator() {
    allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
  }

  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  // Create a new CUDAAllocator or CUDAManagedAllocator for the given device
  std::shared_ptr<Allocator> CreateCUDAAllocator(platform::CUDAPlace p) {
    if (FLAGS_use_cuda_managed_memory) {
      PADDLE_ENFORCE_EQ(
          strategy_,
          AllocatorStrategy::kAutoGrowth,
          platform::errors::InvalidArgument(
              "CUDA managed memory is only implemented for auto_growth "
              "strategy, not support %s strategy.\n"
              "Please use auto_growth strategy by command `export "
              "FLAGS_allocator_strategy=\"auto_growth\"`, or disable managed "
              "memory by command `export FLAGS_use_cuda_managed_memory=false`",
              FLAGS_allocator_strategy));

      if (!platform::IsGPUManagedMemorySupported(p.device)) {
        PADDLE_THROW(platform::errors::Unavailable(
            "Failed to create CUDAManagedAllocator on GPU %d.\n\n"
            "You have enabled CUDA managed memory, but the gpu device does not "
            "support allocating managed memory.\n"
            "If you don't actually need to use managed memory, please disable "
            "it with command `export FLAGS_use_cuda_managed_memory=false`.\n"
            "Or you must use the gpu device that supports managed memory.",
            p.device));
      }
      return std::make_shared<CUDAManagedAllocator>(p);
    }
    return std::make_shared<CUDAAllocator>(p);
  }

  void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    PADDLE_ENFORCE_EQ(
        strategy_,
        AllocatorStrategy::kAutoGrowth,
        platform::errors::Unimplemented(
            "Only support auto-growth strategy for StreamSafeCUDAAllocator, "
            "the allocator strategy %d is unsupported for multi-stream",
            static_cast<int>(strategy_)));
    if (LIKELY(!HasCUDAAllocator(p, stream))) {
      VLOG(8) << "Init CUDA allocator for stream " << stream << " in place "
              << p;
      InitAutoGrowthCUDAAllocator(p, stream);
      WrapStreamSafeCUDAAllocator(p, stream);
      WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
      WrapStatAllocator(p, stream);
    }
  }

  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
    VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
            << FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
    auto cuda_allocator = CreateCUDAAllocator(p);
    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator,
        platform::GpuMinChunkSize(),
        chunk_size,
        allow_free_idle_chunk_);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
    CUdevice device;
    int val;
    try {
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGetAttribute(
              &val,
              CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
              device));
    } catch (...) {
      val = 0;
    }

    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
      cuda_allocators_[p][stream] =
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
      auto cuda_allocator = CreateCUDAAllocator(p);
      cuda_allocators_[p][stream] =
          std::make_shared<AutoGrowthBestFitAllocator>(
              cuda_allocator,
              platform::GpuMinChunkSize(),
              /*chunk_size=*/chunk_size,
              allow_free_idle_chunk_);
    }
#else
    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: Since the CUDA runtime cannot be forked, calling any CUDA API
    // after a fork may return cuda error(3), i.e.,
    // cudaErrorInitializationError, even though the CUDAAllocator is only
    // initialized here and not really used yet.
    // The try-catch block handles the case where GetDeviceProperties() may
    // fail in a multi-process setting (for example, in a dataloader with
    // num_workers > 0).
    try {
      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
      need_addr_align = prop.textureAlignment < alignment;
      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
              << prop.textureAlignment
              << ", set need_addr_align=" << need_addr_align;
    } catch (...) {
      need_addr_align = true;
      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
    }
    // The address returned is aligned already,
    // ref:
    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
    std::shared_ptr<Allocator> underlying_allocator{nullptr};
    if (need_addr_align) {
      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
      underlying_allocator =
          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
    } else {
      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
      underlying_allocator = cuda_allocator;
    }

    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        underlying_allocator, alignment, chunk_size, allow_free_idle_chunk_);
#endif
#endif
  }

  // NOTE(Ruibiao): Old single-stream version, will be removed later
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                   bool allow_free_idle_chunk) {
    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
    VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
            << FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
    auto cuda_allocator = CreateCUDAAllocator(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator,
        platform::GpuMinChunkSize(),
        /*chunk_size=*/chunk_size,
        allow_free_idle_chunk);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
    CUdevice device;
    int val;
    try {
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGetAttribute(
              &val,
              CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
              device));
    } catch (...) {
      val = 0;
    }

    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
      allocators_[p] =
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
      auto cuda_allocator = CreateCUDAAllocator(p);
      allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
          cuda_allocator,
          platform::GpuMinChunkSize(),
          /*chunk_size=*/chunk_size,
          allow_free_idle_chunk);
    }

#else
    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: Since the CUDA runtime cannot be forked, calling any CUDA API
    // after a fork may return cuda error(3), i.e.,
    // cudaErrorInitializationError, even though the CUDAAllocator is only
    // initialized here and not really used yet.
    // The try-catch block handles the case where GetDeviceProperties() may
    // fail in a multi-process setting (for example, in a dataloader with
    // num_workers > 0).
    try {
      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
      need_addr_align = prop.textureAlignment < alignment;
      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
              << prop.textureAlignment
              << ", set need_addr_align=" << need_addr_align;
    } catch (...) {
      need_addr_align = true;
      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
    }
    // The address returned is aligned already,
    // ref:
    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
    std::shared_ptr<Allocator> underlying_allocator{nullptr};
    if (need_addr_align) {
      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
      underlying_allocator =
          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
    } else {
      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
      underlying_allocator = cuda_allocator;
    }
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        underlying_allocator, alignment, chunk_size, allow_free_idle_chunk);
#endif
#endif
  }

  void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
  }

  void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
    allocator = std::make_shared<StreamSafeCUDAAllocator>(
        allocator,
        p,
        stream,
        /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
  }

  void WrapStreamSafeCUDAAllocatorForDefault() {
    for (auto& pair : allocators_) {
      auto& place = pair.first;
      if (platform::is_gpu_place(place)) {
        std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
            std::make_shared<StreamSafeCUDAAllocator>(
                pair.second,
                place,
                /* default_stream = */ nullptr,
                /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
        pair.second = allocator;

        // NOTE(Ruibiao): A tricky implementation that gives
        // StreamSafeCUDAAllocator the ability to interact with the outside
        // world, i.e., to have its default stream changed from outside.
        default_stream_safe_cuda_allocators_[place] = allocator;
        VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
                << ", allocator address = " << pair.second.get();
      }
    }
  }

  void WrapCUDARetryAllocator(platform::CUDAPlace p,
                              gpuStream_t stream,
                              size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time,
        0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
    allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
  }

  void WrapStatAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
    allocator = std::make_shared<StatAllocator>(allocator);
  }

#ifdef PADDLE_WITH_CUDA
  void WrapCUDAGraphAllocator() {
    for (auto& item : allocators_) {
      auto& allocator = item.second;
      allocator = CUDAGraphAllocator::Create(allocator);
    }
  }
#endif

  static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
    for (auto& place_pair : allocators) {
      for (auto& stream_pair : place_pair.second) {
        PADDLE_ENFORCE_EQ(stream_pair.second->IsAllocThreadSafe(),
                          true,
                          platform::errors::InvalidArgument(
                              "Public allocators must be thread safe"));
      }
    }
  }
#endif

#ifdef PADDLE_WITH_XPU
  void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_IPU
  void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_MLU
  void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
  void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
                                           bool allow_free_idle_chunk) {
    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
    auto custom_allocator =
        std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        custom_allocator,
        phi::DeviceManager::GetMinChunkSize(p),
        /*chunk_size=*/chunk_size,
        allow_free_idle_chunk);
  }
#endif

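  // System allocators essentially bypass the strategy-specific wrappers; they
  // are selected in GetAllocator() when FLAGS_use_system_allocator is set
  // (intended for unit tests only).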
  void InitSystemAllocators() {
    if (!system_allocators_.empty()) return;
    system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::XPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#ifdef PADDLE_WITH_IPU
    int device_count = platform::GetIPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::IPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    system_allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<CPUPinnedAllocator>();
    int device_count = platform::GetGPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::CUDAPlace p(i);
      system_allocators_[p] = CreateCUDAAllocator(p);
    }
#endif
#ifdef PADDLE_WITH_MLU
    int device_count = platform::GetMLUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::MLUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
    for (const auto& dev_type : device_types) {
      for (size_t dev_id = 0;
           dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
           dev_id++) {
        platform::CustomPlace p(dev_type, dev_id);
        system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
      }
    }
#endif
  }

  void InitZeroSizeAllocators() {
    if (!zero_size_allocators_.empty()) return;
    std::vector<platform::Place> places;
    places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    int device_count = platform::GetGPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::CUDAPlace(dev_id));
    }
    places.emplace_back(platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::XPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_IPU
    int device_count = platform::GetIPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::IPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_MLU
    int device_count = platform::GetMLUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::MLUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
    for (const auto& dev_type : device_types) {
      for (size_t dev_id = 0;
           dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
           dev_id++) {
        places.emplace_back(platform::CustomPlace(dev_type, dev_id));
      }
    }
#endif

    for (auto& p : places) {
      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
    }
  }

  static void CheckAllocThreadSafe(const AllocatorMap& allocators) {
    for (auto& pair : allocators) {
      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(),
                        true,
                        platform::errors::InvalidArgument(
                            "Public allocators must be thread safe"));
    }
  }

  void CheckAllocThreadSafe() const {
    CheckAllocThreadSafe(allocators_);
    CheckAllocThreadSafe(zero_size_allocators_);
    CheckAllocThreadSafe(system_allocators_);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (is_stream_safe_cuda_allocator_used_) {
      CheckCUDAAllocThreadSafe(cuda_allocators_);
    }
#endif
  }

  void WrapCUDARetryAllocator(size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time,
        0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    for (auto& pair : allocators_) {
      if (platform::is_gpu_place(pair.first)) {
        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
      }
    }
  }

  void WrapStatAllocator() {
    for (auto& pair : allocators_) {
      // Now memory stats is only supported for CPU and GPU
      const platform::Place& place = pair.first;
      if (platform::is_cpu_place(place) ||
          platform::is_cuda_pinned_place(place) ||
          platform::is_gpu_place(place)) {
        pair.second = std::make_shared<StatAllocator>(pair.second);
      }
    }
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // a standalone CUDA allocator to support multi-stream GC in new executor
  std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
      default_stream_safe_cuda_allocators_;
  CUDAAllocatorMap cuda_allocators_;
  std::shared_timed_mutex cuda_allocator_mutex_;
#endif
  AllocatorStrategy strategy_;
  AllocatorMap allocators_;
  static AllocatorMap zero_size_allocators_;
  static AllocatorMap system_allocators_;
  bool allow_free_idle_chunk_;
  bool is_stream_safe_cuda_allocator_used_;
};
AllocatorFacadePrivate::AllocatorMap
    AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;

// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// Deleting m_ may cause a core dump when the Python destructor conflicts with
// the C++ one.
AllocatorFacade::~AllocatorFacade() {}

AllocatorFacade& AllocatorFacade::Instance() {
  static AllocatorFacade* instance = new AllocatorFacade;
  return *instance;
}

AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
#ifdef PADDLE_WITH_CUDA
  if (UNLIKELY(IsCUDAGraphCapturing())) {
    auto id = phi::backends::gpu::CUDAGraph::CapturingPoolID();
    auto iter = cuda_graph_map_.find(id);
    PADDLE_ENFORCE_NE(
        iter,
        cuda_graph_map_.end(),
        platform::errors::PermissionDenied(
            "No memory pool is prepared for CUDA Graph capturing."));
    VLOG(10) << "Choose CUDA Graph memory pool";
    return iter->second.get();
  }
#endif
  return m_;
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return GetPrivate()->GetAllocator(
      place, /* A non-zero num to choose allocator_ */ 1);
}

void* AllocatorFacade::GetBasePtr(
    const std::shared_ptr<phi::Allocation>& allocation) {
  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(),
                    AllocatorStrategy::kAutoGrowth,
                    paddle::platform::errors::Unimplemented(
                        "GetBasePtr() is only implemented for the auto_growth "
                        "strategy, not for allocator strategy: %d",
                        static_cast<int>(GetAllocatorStrategy())));
  PADDLE_ENFORCE_EQ(platform::is_gpu_place(allocation->place()),
                    true,
                    paddle::platform::errors::Unimplemented(
                        "GetBasePtr() is only implemented for CUDAPlace(), "
                        "not for place: %s",
                        allocation->place()));
  return GetPrivate()->GetBasePtr(allocation);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
    const platform::Place& place) {
  return GetPrivate()->GetAllocator(place, /* zero size */ 0);
}

std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size) {
  return std::shared_ptr<phi::Allocation>(Alloc(place, size));
}
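// A minimal usage sketch (the CUDAPlace and size below are arbitrary
// examples): allocations are obtained through the singleton facade, e.g.
//   auto buf = AllocatorFacade::Instance().AllocShared(
//       platform::CUDAPlace(0), /*size=*/1024);
// and are returned to the underlying allocator once `buf` goes out of scope.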

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                     size_t size) {
  return GetPrivate()->GetAllocator(place, size)->Allocate(size);
}

uint64_t AllocatorFacade::Release(const platform::Place& place) {
  return GetPrivate()
      ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
      ->Release(place);
}

std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size, const phi::Stream& stream) {
  return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
}

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                     size_t size,
                                     const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  AllocatorFacadePrivate* m = GetPrivate();
  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
    return Alloc(place, size);
  }

  platform::CUDAPlace p(place.GetDeviceId());
  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
    gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
    return m->GetAllocator(p, s, /* create_if_not_found = */ true)
        ->Allocate(size);
  } else {
    return m->GetAllocator(p, size)->Allocate(size);
  }
#elif defined(PADDLE_WITH_XPU)
  return GetAllocator(place)->Allocate(size);
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet(
      "Not compiled with GPU or XPU or NPU."));
#endif
}

bool AllocatorFacade::InSameStream(
    const std::shared_ptr<phi::Allocation>& allocation,
    const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
  return s == GetStream(allocation);
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
}

bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() {
  return GetPrivate()->IsStreamSafeCUDAAllocatorUsed();
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                  gpuStream_t stream) {
  AllocatorFacadePrivate* m = GetPrivate();
  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
    return Release(place);
  }

  return m->GetAllocator(place, stream)->Release(place);
}

void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
                                   gpuStream_t stream) {
  GetPrivate()->RecordStream(allocation, stream);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place, gpuStream_t stream) {
  AllocatorFacadePrivate* m = GetPrivate();

  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
    return GetAllocator(place);
  }

  if (platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) {
    return m->GetAllocator(place,
                           stream,
                           /*create_if_not_found=*/true);
  }
  return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

gpuStream_t AllocatorFacade::GetStream(
    const std::shared_ptr<phi::Allocation>& allocation) const {
  return GetPrivate()->GetStream(allocation);
}

void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
                                       gpuStream_t stream) {
  if (m_->IsStreamSafeCUDAAllocatorUsed()) {
    m_->SetDefaultStream(place, stream);
  }
}

#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(int64_t id) {
  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(),
                    AllocatorStrategy::kAutoGrowth,
                    platform::errors::InvalidArgument(
                        "CUDA Graph is only supported when the "
                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
                        "FLAGS_allocator_strategy=\"%s\"",
                        FLAGS_allocator_strategy));
  auto& allocator = cuda_graph_map_[id];
  auto& ref_cnt = cuda_graph_ref_cnt_[id];
  if (allocator.get() == nullptr) {
    allocator.reset(
        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
    VLOG(10) << "Create memory pool for CUDA Graph with memory ID " << id;
  } else {
    VLOG(10) << "Use created memory pool for CUDA Graph with memory ID " << id;
  }
  ++ref_cnt;
}

void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
  auto ref_cnt_iter = cuda_graph_ref_cnt_.find(id);
  PADDLE_ENFORCE_NE(ref_cnt_iter,
                    cuda_graph_ref_cnt_.end(),
                    platform::errors::InvalidArgument(
                        "Cannot find CUDA Graph with memory ID = %d", id));
  auto& ref_cnt = ref_cnt_iter->second;
  --ref_cnt;
  if (ref_cnt == 0) {
    cuda_graph_map_.erase(id);
    cuda_graph_ref_cnt_.erase(ref_cnt_iter);
    VLOG(10) << "Remove memory pool of CUDA Graph with memory ID " << id;
  } else {
    VLOG(10) << "Decrease memory pool ID " << id << " reference count to be "
             << ref_cnt;
  }
}
#endif
#endif

UNUSED static std::shared_ptr<NaiveBestFitAllocator> unused_obj =
    std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());

}  // namespace allocation
}  // namespace memory
}  // namespace paddle