allocator_facade.cc 43.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

15 16
#include "paddle/fluid/memory/allocation/allocator_facade.h"

17
#include "gflags/gflags.h"
18
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
19
#include "paddle/fluid/memory/allocation/allocator.h"
Y
Yu Yang 已提交
20
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
21
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
22
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
23
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
S
sneaxiy 已提交
24
#include "paddle/fluid/memory/allocation/retry_allocator.h"
25
#include "paddle/fluid/memory/allocation/stat_allocator.h"
S
sneaxiy 已提交
26
#include "paddle/fluid/platform/enforce.h"
27
#include "paddle/fluid/platform/place.h"
28

29
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
30
#include <shared_mutex>
31
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
32
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"
S
sneaxiy 已提交
33
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
34
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
35
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
36
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
37
#include "paddle/fluid/platform/device_context.h"
38
#include "paddle/phi/backends/gpu/gpu_context.h"
39 40

#ifdef PADDLE_WITH_CUDA
41
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
42
#endif
43

44 45 46 47 48
#if CUDA_VERSION >= 10020
#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif
49
#endif
50

51
#ifdef PADDLE_WITH_XPU
52
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
53
#endif
54 55 56 57

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
58

J
jianghaicheng 已提交
59 60 61 62
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif

F
fwenguang 已提交
63 64 65 66
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif

67 68 69 70 71
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#endif

Z
Zeng Jinle 已提交
72
PADDLE_DEFINE_EXPORTED_int64(
73
    gpu_allocator_retry_time, 10000,
S
sneaxiy 已提交
74 75 76
    "The retry time (milliseconds) when allocator fails "
    "to allocate memory. No retry if this value is not greater than 0");

Z
Zeng Jinle 已提交
77 78 79 80
PADDLE_DEFINE_EXPORTED_bool(
    use_system_allocator, false,
    "Whether to use system allocator to allocate CPU and GPU memory. "
    "Only used for unittests.");
81

82 83 84
PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
                            "Use VirtualMemoryAutoGrowthBestFitAllocator.");

85 86 87
// NOTE(Ruibiao): This flag exists only for compatibility with
// the old single-stream CUDA allocator. It will be removed
// after StreamSafeCUDAAllocator has been fully tested.
88
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true,
89 90
                            "Enable StreamSafeCUDAAllocator");

91 92 93 94 95
PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory, false,
                            "Whether to use CUDAManagedAllocator to allocate "
                            "managed memory, only available for auto_growth "
                            "strategy");

96 97
DECLARE_string(allocator_strategy);

98 99 100 101
namespace paddle {
namespace memory {
namespace allocation {

102 103 104 105 106 107 108 109
#ifdef PADDLE_WITH_CUDA
class CUDAGraphAllocator
    : public Allocator,
      public std::enable_shared_from_this<CUDAGraphAllocator> {
 private:
  class PrivateAllocation : public Allocation {
   public:
    PrivateAllocation(CUDAGraphAllocator* allocator,
110
                      DecoratedAllocationPtr underlying_allocation)
F
From00 已提交
111 112 113
        : Allocation(
              underlying_allocation->ptr(), underlying_allocation->base_ptr(),
              underlying_allocation->size(), underlying_allocation->place()),
114 115 116 117 118
          allocator_(allocator->shared_from_this()),
          underlying_allocation_(std::move(underlying_allocation)) {}

   private:
    std::shared_ptr<Allocator> allocator_;
119
    DecoratedAllocationPtr underlying_allocation_;
120 121 122 123 124 125
  };

  explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
      : underlying_allocator_(allocator) {}

 public:
126 127
  ~CUDAGraphAllocator() { VLOG(10) << "CUDAGraphAllocator destructed"; }

128 129 130 131 132 133
  static std::shared_ptr<Allocator> Create(
      const std::shared_ptr<Allocator>& allocator) {
    return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
  }

 protected:
134
  phi::Allocation* AllocateImpl(size_t size) {
135
    VLOG(10) << "Allocate " << size << " for CUDA Graph";
136 137 138
    return new PrivateAllocation(this,
                                 static_unique_ptr_cast<Allocation>(
                                     underlying_allocator_->Allocate(size)));
139 140
  }

141
  void FreeImpl(phi::Allocation* allocation) {
142 143 144 145 146 147 148 149 150
    VLOG(10) << "delete for CUDA Graph";
    delete allocation;
  }

 private:
  std::shared_ptr<Allocator> underlying_allocator_;
};
#endif

151 152 153 154 155 156 157 158
// Returns whether the calling thread is currently capturing a CUDA Graph.
// Builds without PADDLE_WITH_CUDA always report false.
static bool IsCUDAGraphCapturing() {
#ifndef PADDLE_WITH_CUDA
  return false;
#else
  return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing());
#endif
}

Y
Yu Yang 已提交
159 160
class AllocatorFacadePrivate {
 public:
161 162
  using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

163 164 165 166 167 168
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  using CUDAAllocatorMap =
      std::map<platform::CUDAPlace,
               std::map<gpuStream_t, std::shared_ptr<Allocator>>>;
#endif

169 170
  explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
    strategy_ = GetAllocatorStrategy();
171 172
    is_stream_safe_cuda_allocator_used_ = false;

173
    switch (strategy_) {
174 175
      case AllocatorStrategy::kNaiveBestFit: {
        InitNaiveBestFitCPUAllocator();
J
jianghaicheng 已提交
176 177 178 179 180
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
181
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
182
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
183 184 185
          InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
186
#endif
187 188 189 190 191
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
192 193 194 195
#ifdef PADDLE_WITH_ASCEND_CL
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
196
        InitNaiveBestFitNPUPinnedAllocator();
F
fwenguang 已提交
197 198 199 200 201
#endif
#ifdef PADDLE_WITH_MLU
        for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
          InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
        }
202 203
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
204
        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
205 206
        for (const auto& dev_type : device_types) {
          for (size_t dev_id = 0;
207
               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
208 209 210 211 212
               ++dev_id) {
            InitNaiveBestFitCustomDeviceAllocator(
                platform::CustomPlace(dev_type, dev_id));
          }
        }
213
#endif
Z
Zeng Jinle 已提交
214 215
        break;
      }
216 217 218

      case AllocatorStrategy::kAutoGrowth: {
        InitNaiveBestFitCPUAllocator();
219 220
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        allow_free_idle_chunk_ = allow_free_idle_chunk;
221 222 223 224 225
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
                                      allow_free_idle_chunk_);
        }

226 227 228 229 230 231 232 233 234 235 236 237 238
        // Note(Ruibiao): For GPU multi-stream case without CUDA graph
        // capturing, the 'allocators_' map(place -> Allocator) hold the
        // StreamSafeCUDAAllocator releate to defaultstream (i.e., the stream
        // directly got from DeviceContex), while the 'cuda_allocators_' map
        // (place -> map(stream -> Allocator)) hold the StreamSafeCUDAAllocator
        // releate to non-default stream (i.e., the stream users pass in). The
        // default stream Allocator is built in the structure of
        // AllocatorFacadePrivate, while the non-default stream is build in a
        // manner in GetAllocator function with 'create_if_not_found = ture'.
        // We make special treatment for the default stream for performance
        // reasons. Since most Alloc calls are for default stream in
        // application, treating it separately can avoid lots of overhead of
        // acquiring default stream and applying read-write lock.
239
        if (FLAGS_use_stream_safe_cuda_allocator) {
240 241 242 243
          if (LIKELY(!IsCUDAGraphCapturing())) {
            WrapStreamSafeCUDAAllocatorForDefault();
          }
          is_stream_safe_cuda_allocator_used_ = true;
244
        }
245

246 247
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
248 249 250 251 252 253
#ifdef PADDLE_WITH_ASCEND_CL
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
        InitNaiveBestFitNPUPinnedAllocator();
#endif
254 255 256 257
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
J
jianghaicheng 已提交
258 259 260 261 262
#endif
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
F
fwenguang 已提交
263 264 265 266 267
#endif
#ifdef PADDLE_WITH_MLU
        for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
          InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
        }
268 269
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
270
        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
271 272
        for (const auto& dev_type : device_types) {
          for (size_t dev_id = 0;
273
               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
274 275 276 277 278
               ++dev_id) {
            InitAutoGrowthCustomDeviceAllocator(
                platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk);
          }
        }
279
#endif
Z
Zeng Jinle 已提交
280 281
        break;
      }
282

283 284
      case AllocatorStrategy::kThreadLocal: {
        InitNaiveBestFitCPUAllocator();
285 286 287 288 289
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
J
jianghaicheng 已提交
290 291 292 293 294
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
295
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
296
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
297 298 299
          InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
F
fwenguang 已提交
300 301 302 303 304
#endif
#ifdef PADDLE_WITH_MLU
        for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
          InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
        }
305 306 307 308
#endif
        break;
      }

Z
Zeng Jinle 已提交
309
      default: {
310
        PADDLE_THROW(platform::errors::InvalidArgument(
311
            "Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
Z
Zeng Jinle 已提交
312
      }
Y
Yu Yang 已提交
313
    }
Z
Zeng Jinle 已提交
314
    InitZeroSizeAllocators();
315
    InitSystemAllocators();
316 317 318 319 320

    if (FLAGS_gpu_allocator_retry_time > 0) {
      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
    }

321 322
    WrapStatAllocator();

323
    CheckAllocThreadSafe();
324 325

#ifdef PADDLE_WITH_CUDA
326 327 328
    // No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator
    if (!is_stream_safe_cuda_allocator_used_ &&
        UNLIKELY(IsCUDAGraphCapturing())) {
329 330 331
      WrapCUDAGraphAllocator();
    }
#endif
Z
Zeng Jinle 已提交
332 333 334 335
  }

  // Returns the allocator registered for `place`.
  //
  // The table that is searched depends on the request: zero-size requests use
  // zero_size_allocators_; otherwise system_allocators_ is used when
  // FLAGS_use_system_allocator is set, and the regular allocator map is used
  // in all other cases. Raises NotFound when `place` has no entry.
  inline const std::shared_ptr<Allocator>& GetAllocator(
      const platform::Place& place, size_t size) {
    VLOG(6) << "GetAllocator"
            << " " << place << " " << size;

    const AllocatorMap* table = &zero_size_allocators_;
    if (size > 0) {
      if (UNLIKELY(FLAGS_use_system_allocator)) {
        table = &system_allocators_;
      } else {
        table = &GetAllocatorMap();
      }
    }

    auto found = table->find(place);
    PADDLE_ENFORCE_NE(found, table->end(),
                      platform::errors::NotFound(
                          "No allocator found for the place, %s", place));
    return found->second;
  }

349
  // Returns base_ptr() of `allocation`, which is treated as an Allocation
  // (static downcast, no runtime type check).
  void* GetBasePtr(const std::shared_ptr<phi::Allocation>& allocation) {
    auto* decorated = static_cast<Allocation*>(allocation.get());
    return decorated->base_ptr();
  }

353 354 355 356 357
  // True only when the stream-safe CUDA allocator was selected at construction
  // and the system allocator flag is off.
  bool IsStreamSafeCUDAAllocatorUsed() {
    if (!is_stream_safe_cuda_allocator_used_) {
      return false;
    }
    return LIKELY(FLAGS_use_system_allocator == false);
  }

358
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
359
  // Reports whether cuda_allocators_ holds an entry for (place, stream).
  // Performs lookups only; it takes no lock itself.
  bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) {
    const auto place_entry = cuda_allocators_.find(place);
    if (place_entry == cuda_allocators_.end()) {
      return false;
    }
    const auto& per_stream = place_entry->second;
    return per_stream.find(stream) != per_stream.end();
  }

369
  const std::shared_ptr<Allocator>& GetAllocator(
370
      const platform::CUDAPlace& place, gpuStream_t stream,
371
      bool create_if_not_found = false) {
372 373 374 375 376
    if (LIKELY(!IsCUDAGraphCapturing())) {
      if (stream == GetDefaultStream(place)) {
        VLOG(7) << "Get Allocator by passing in a default stream";
        return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
      }
377 378 379
    }

    /* shared_lock_guard */ {
380 381 382
      std::shared_lock<std::shared_timed_mutex> lock_guard(
          cuda_allocator_mutex_);
      if (LIKELY(HasCUDAAllocator(place, stream))) {
383 384
        return cuda_allocators_[place][stream];
      } else {
385 386 387 388 389
        PADDLE_ENFORCE_NE(create_if_not_found, false,
                          platform::errors::NotFound(
                              "No allocator found for stream %s in place %s "
                              "with create_if_not_found = false",
                              stream, place));
390 391 392
      }
    }

393
    /* unique_lock_guard */ {
394 395 396 397
      std::unique_lock<std::shared_timed_mutex> lock_guard(
          cuda_allocator_mutex_);
      InitStreamSafeCUDAAllocator(place, stream);
      return cuda_allocators_[place][stream];
398
    }
399 400
  }

401 402 403 404 405 406 407 408 409 410
  // Returns the default-stream StreamSafeCUDAAllocator registered for `place`
  // and raises NotFound when none is registered.
  const std::shared_ptr<StreamSafeCUDAAllocator>
  GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
    const auto entry = default_stream_safe_cuda_allocators_.find(place);
    PADDLE_ENFORCE_NE(
        entry, default_stream_safe_cuda_allocators_.end(),
        platform::errors::NotFound(
            "No StreamSafeCUDAAllocator found for the place, %s", place));
    return entry->second;
  }

411
  // Returns the default stream recorded by the default-stream allocator of
  // `place`. Raises NotFound if `place` has no such allocator.
  gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) const {
    return GetDefaultStreamSafeCUDAAllocator(place)->GetDefaultStream();
  }

417
  // Sets the default stream of the default-stream allocator of `place`.
  // The stream can be set only once: if a non-null default stream is already
  // recorded, the call logs and returns without changing it.
  void SetDefaultStream(const platform::CUDAPlace& place, gpuStream_t stream) {
    const std::shared_ptr<StreamSafeCUDAAllocator> target =
        GetDefaultStreamSafeCUDAAllocator(place);

    // NOTE(Ruibiao): The default stream is set when the CUDADeviceContext is
    // created. Normally the DeviceContextPool is a global singleton and one
    // Place corresponds to exactly one DeviceContext. However, to support
    // multi-stream scheduling, the standalone executor creates two extra
    // DeviceContextPools for the H2D and D2H streams in StreamAnalyzer, which
    // makes one Place correspond to several DeviceContexts and would
    // unexpectedly reset the default stream at runtime. To avoid that, the
    // default stream may not be changed after it has been set once.
    const gpuStream_t current = target->GetDefaultStream();
    if (current != nullptr) {
      VLOG(5) << "The default stream for StreamSafeCUDAAllocator("
              << target.get() << ") in " << place << " has been set to "
              << current << " before, not allow to change now.";
      return;
    }

    target->SetDefaultStream(stream);
    VLOG(8) << "Set default stream to " << stream
            << " for StreamSafeCUDAAllocator(" << target.get() << ") in "
            << place;
  }

443
  void RecordStream(std::shared_ptr<phi::Allocation> allocation,
444
                    gpuStream_t stream) {
445 446 447 448 449 450
    std::shared_ptr<StreamSafeCUDAAllocation> stream_safe_cuda_allocation =
        std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
    if (stream_safe_cuda_allocation != nullptr) {
      stream_safe_cuda_allocation->RecordStream(stream);
    } else {
      VLOG(6) << "RecordStream for a non-StreamSafeCUDAAllocation";
451
    }
452 453
  }

454
  // Returns the owning stream of a StreamSafeCUDAAllocation. For any other
  // allocation type, falls back to the stream of the GPUContext that the
  // global DeviceContextPool holds for the allocation's place.
  gpuStream_t GetStream(
      const std::shared_ptr<phi::Allocation>& allocation) const {
    const auto stream_safe =
        std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
    if (stream_safe == nullptr) {
      VLOG(6) << "GetStream for a non-StreamSafeCUDAAllocation";
      auto* gpu_context = static_cast<phi::GPUContext*>(
          platform::DeviceContextPool::Instance().Get(allocation->place()));
      return gpu_context->stream();
    }
    return stream_safe->GetOwningStream();
  }
#endif

 private:
  class ZeroSizeAllocator : public Allocator {
   public:
    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
    bool IsAllocThreadSafe() const override { return true; }

   protected:
477
    phi::Allocation* AllocateImpl(size_t size) override {
478 479
      return new Allocation(nullptr, 0, place_);
    }
480
    void FreeImpl(phi::Allocation* allocation) override { delete allocation; }
481 482 483 484 485

   private:
    platform::Place place_;
  };

486
  // Read-only access to the regular place -> allocator table.
  const AllocatorMap& GetAllocatorMap() {
    return allocators_;
  }
487

488 489 490
  // Registers a NaiveBestFitAllocator for the CPU place.
  void InitNaiveBestFitCPUAllocator() {
    const auto cpu_place = platform::CPUPlace();
    allocators_[cpu_place] = std::make_shared<NaiveBestFitAllocator>(cpu_place);
  }

493
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
494 495 496
  // Registers a NaiveBestFitAllocator for the CUDA pinned place.
  void InitNaiveBestFitCUDAPinnedAllocator() {
    const auto pinned_place = platform::CUDAPinnedPlace();
    allocators_[pinned_place] =
        std::make_shared<NaiveBestFitAllocator>(pinned_place);
  }

499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
  // Registers a NaiveBestFitAllocator for the GPU place `p`.
  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
    std::shared_ptr<Allocator> naive_best_fit =
        std::make_shared<NaiveBestFitAllocator>(p);
    allocators_[p] = std::move(naive_best_fit);
  }

  // Creates the raw device allocator for GPU place `p`: a CUDAAllocator by
  // default, or a CUDAManagedAllocator when FLAGS_use_cuda_managed_memory is
  // set. Managed memory requires the auto_growth strategy (InvalidArgument
  // otherwise) and a device that supports it (Unavailable otherwise).
  std::shared_ptr<Allocator> CreateCUDAAllocator(platform::CUDAPlace p) {
    if (!FLAGS_use_cuda_managed_memory) {
      return std::make_shared<CUDAAllocator>(p);
    }

    PADDLE_ENFORCE_EQ(
        strategy_, AllocatorStrategy::kAutoGrowth,
        platform::errors::InvalidArgument(
            "CUDA managed memory is only implemented for auto_growth "
            "strategy, not support %s strategy.\n"
            "Please use auto_growth strategy by command `export "
            "FLAGS_allocator_strategy=\"auto_growth\"`, or disable managed "
            "memory by command `export FLAGS_use_cuda_managed_memory=false`",
            FLAGS_allocator_strategy));

    const bool managed_memory_supported =
        platform::IsGPUManagedMemorySupported(p.device);
    if (!managed_memory_supported) {
      PADDLE_THROW(platform::errors::Unavailable(
          "Failed to create CUDAManagedAllocator on GPU %d.\n\n"
          "You have enabled CUDA managed memory, but the gpu device does not "
          "support allocating managed memory.\n"
          "If you don't actually need to use managed memory, please disable "
          "it with command `export FLAGS_use_cuda_managed_memory=false`.\n"
          "Or you must use the gpu device that supports managed memory.",
          p.device));
    }
    return std::make_shared<CUDAManagedAllocator>(p);
  }

531 532 533 534 535 536 537
  // Creates the allocator chain for (p, stream) in cuda_allocators_ if it does
  // not exist yet: auto-growth allocator, wrapped by StreamSafeCUDAAllocator,
  // optionally by the retry allocator, and finally by the stat allocator.
  // Only the auto-growth strategy is supported; any other strategy raises
  // Unimplemented. This function takes no lock itself.
  void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    PADDLE_ENFORCE_EQ(
        strategy_, AllocatorStrategy::kAutoGrowth,
        platform::errors::Unimplemented(
            "Only support auto-growth strategey for StreamSafeCUDAAllocator, "
            "the allocator strategy %d is unsupported for multi-stream",
            static_cast<int>(strategy_)));
    if (LIKELY(!HasCUDAAllocator(p, stream))) {
      VLOG(8) << "Init CUDA allocator for stream " << stream << " in place "
              << p;
      InitAutoGrowthCUDAAllocator(p, stream);
      WrapStreamSafeCUDAAllocator(p, stream);
      // A non-positive retry time means "no retry" (see the flag's help text).
      // Skip the wrapper in that case, as the constructor does for the
      // per-place allocators; WrapCUDARetryAllocator rejects a zero retry time
      // and a negative value would wrap around when converted to size_t.
      if (FLAGS_gpu_allocator_retry_time > 0) {
        WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
      }
      WrapStatAllocator(p, stream);
    }
  }

  // Creates the auto-growth allocator for (p, stream) and stores it in
  // cuda_allocators_[p][stream].
  //
  // HIP builds: AutoGrowthBestFitAllocator on top of CreateCUDAAllocator(p).
  // CUDA >= 10.2: VirtualMemoryAutoGrowthBestFitAllocator when the device
  //   reports virtual address management support and
  //   FLAGS_use_virtual_memory_auto_growth is set; otherwise the regular
  //   AutoGrowthBestFitAllocator.
  // Older CUDA: AutoGrowthBestFitAllocator, with an AlignedAllocator inserted
  //   when the device's texture alignment is smaller than the minimum chunk
  //   size (or when the device properties cannot be queried).
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
#if defined(PADDLE_WITH_HIP)
    auto cuda_allocator = CreateCUDAAllocator(p);
    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
    CUdevice device;
    int val = 0;
    // A failing driver query is treated as "virtual memory not supported".
    try {
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGetAttribute(
              &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
              device));
    } catch (...) {
      val = 0;
    }

    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
      cuda_allocators_[p][stream] =
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
      auto cuda_allocator = CreateCUDAAllocator(p);
      // Pass the chunk-size argument (0) explicitly, as the other branches of
      // this function do, so that allow_free_idle_chunk_ reaches the boolean
      // parameter instead of being consumed by the argument before it.
      cuda_allocators_[p][stream] =
          std::make_shared<AutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), 0,
              allow_free_idle_chunk_);
    }
#else
    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda
    // API in that case may got cuda error(3), i.e.,
    // cudaErrorInitializationError. And, the CUDAAllocator is only initialized
    // but not really used.
    // Here, the try-catch block is added to handle the case that
    // GetDeviceProperties() may failed in the multiple process(for example, in
    // dataloader with num_worker > 0)
    try {
      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
      need_addr_align = prop.textureAlignment < alignment;
      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
              << prop.textureAlignment
              << ", set need_addr_align=" << need_addr_align;
    } catch (...) {
      need_addr_align = true;
      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
    }
    // The address returned is aligned already,
    // ref:
    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
    std::shared_ptr<Allocator> underlying_allocator{nullptr};
    if (need_addr_align) {
      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
      // Wrap the real device allocator; underlying_allocator is still null at
      // this point and must not be the wrapped object.
      underlying_allocator =
          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
    } else {
      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
      underlying_allocator = cuda_allocator;
    }

    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        underlying_allocator, alignment, 0, allow_free_idle_chunk_);
#endif
#endif
  }

623
  // NOTE(Ruibiao): Old single-stream version, will be removed later
624 625
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                   bool allow_free_idle_chunk) {
626
#if defined(PADDLE_WITH_HIP)
627
    auto cuda_allocator = CreateCUDAAllocator(p);
628 629 630 631 632 633 634 635 636
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
    CUdevice device;
    int val;
    try {
637
      PADDLE_ENFORCE_GPU_SUCCESS(
638 639
          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

640
      PADDLE_ENFORCE_GPU_SUCCESS(
641 642 643 644 645 646 647 648 649 650 651 652 653
          paddle::platform::dynload::cuDeviceGetAttribute(
              &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
              device));
    } catch (...) {
      val = 0;
    }

    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
      allocators_[p] =
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
654
      auto cuda_allocator = CreateCUDAAllocator(p);
655 656 657 658 659
      allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
          cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
    }

#else
660
    auto cuda_allocator = CreateCUDAAllocator(p);
L
Leo Chen 已提交
661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda
    // API in that case may got cuda error(3), i.e.,
    // cudaErrorInitializationError. And, the CUDAAllocator is only initialized
    // but not really used.
    // Here, the try-catch block is added to handle the case that
    // GetDeviceProperties() may failed in the multiple process(for example, in
    // dataloader with num_worker > 0)
    try {
      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
      need_addr_align = prop.textureAlignment < alignment;
      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
              << prop.textureAlignment
              << ", set need_addr_align=" << need_addr_align;
    } catch (...) {
      need_addr_align = true;
      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
    }
    // The address returned is aligned already,
    // ref:
    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
    std::shared_ptr<Allocator> underlying_allocator{nullptr};
    if (need_addr_align) {
      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
      underlying_allocator =
          std::make_shared<AlignedAllocator>(underlying_allocator, alignment);
    } else {
      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
      underlying_allocator = cuda_allocator;
    }
692
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
L
Leo Chen 已提交
693
        underlying_allocator, alignment, 0, allow_free_idle_chunk);
694 695
#endif
#endif
S
sneaxiy 已提交
696
  }
697 698 699 700 701 702

  // Registers a ThreadLocalCUDAAllocator for the GPU place `p`.
  void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
    std::shared_ptr<Allocator> thread_local_allocator =
        std::make_shared<ThreadLocalCUDAAllocator>(p);
    allocators_[p] = std::move(thread_local_allocator);
  }

  // Replaces the allocator stored for (p, stream) with a
  // StreamSafeCUDAAllocator that wraps it. The wrapper is told it runs in
  // CUDA graph capturing mode exactly when idle chunks may not be freed.
  void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    std::shared_ptr<Allocator>& slot = cuda_allocators_[p][stream];
    const bool in_cuda_graph_capturing = !allow_free_idle_chunk_;
    slot = std::make_shared<StreamSafeCUDAAllocator>(slot, p, stream,
                                                     in_cuda_graph_capturing);
  }

709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728
  // Wraps every GPU-place allocator in allocators_ with a
  // StreamSafeCUDAAllocator whose default stream is initially null, and
  // records each wrapper in default_stream_safe_cuda_allocators_.
  void WrapStreamSafeCUDAAllocatorForDefault() {
    for (auto& entry : allocators_) {
      auto& place = entry.first;
      if (!platform::is_gpu_place(place)) {
        continue;
      }

      std::shared_ptr<StreamSafeCUDAAllocator> wrapper =
          std::make_shared<StreamSafeCUDAAllocator>(
              entry.second, place, /* default_stream = */ nullptr,
              /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
      entry.second = wrapper;

      // NOTE(Ruibiao): A tricky way to let StreamSafeCUDAAllocator interact
      // with the outside world, i.e., to allow its default stream to be
      // changed from outside.
      default_stream_safe_cuda_allocators_[place] = wrapper;
      VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
              << ", allocator address = " << entry.second.get();
    }
  }

729 730 731 732 733 734
  // Replaces the allocator stored for (p, stream) with a RetryAllocator that
  // wraps it. `retry_time` must be greater than 0, otherwise InvalidArgument
  // is raised.
  void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
                              size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time, 0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    std::shared_ptr<Allocator>& slot = cuda_allocators_[p][stream];
    std::shared_ptr<Allocator> with_retry =
        std::make_shared<RetryAllocator>(slot, retry_time);
    slot = std::move(with_retry);
  }

739 740 741 742 743
  // Replaces the allocator stored for (p, stream) with a StatAllocator that
  // wraps it.
  void WrapStatAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    std::shared_ptr<Allocator>& slot = cuda_allocators_[p][stream];
    std::shared_ptr<Allocator> with_stat =
        std::make_shared<StatAllocator>(slot);
    slot = std::move(with_stat);
  }

744 745 746 747 748 749 750 751 752
#ifdef PADDLE_WITH_CUDA
  // Replaces every entry of allocators_ with the result of
  // CUDAGraphAllocator::Create applied to the current allocator.
  void WrapCUDAGraphAllocator() {
    for (auto& entry : allocators_) {
      entry.second = CUDAGraphAllocator::Create(entry.second);
    }
  }
#endif

753 754 755 756 757 758 759 760 761
  // Verifies that every per-place, per-stream allocator in `allocators`
  // reports IsAllocThreadSafe() == true; raises InvalidArgument otherwise.
  static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
    for (const auto& place_entry : allocators) {
      const auto& stream_allocators = place_entry.second;
      for (const auto& stream_entry : stream_allocators) {
        PADDLE_ENFORCE_EQ(stream_entry.second->IsAllocThreadSafe(), true,
                          platform::errors::InvalidArgument(
                              "Public allocators must be thread safe"));
      }
    }
  }
762
#endif
S
sneaxiy 已提交
763

764 765 766 767 768 769
#ifdef PADDLE_WITH_XPU
  // Registers a NaiveBestFitAllocator for XPU place `p` in allocators_.
  void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
    std::shared_ptr<NaiveBestFitAllocator> naive_allocator =
        std::make_shared<NaiveBestFitAllocator>(p);
    allocators_[p] = naive_allocator;
  }
#endif

J
jianghaicheng 已提交
770 771 772 773 774 775
#ifdef PADDLE_WITH_IPU
  // Registers a NaiveBestFitAllocator for IPU place `p` in allocators_.
  void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) {
    std::shared_ptr<NaiveBestFitAllocator> naive_allocator =
        std::make_shared<NaiveBestFitAllocator>(p);
    allocators_[p] = naive_allocator;
  }
#endif

F
fwenguang 已提交
776 777 778 779 780 781
#ifdef PADDLE_WITH_MLU
  // Registers a NaiveBestFitAllocator for MLU place `p` in allocators_.
  void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) {
    std::shared_ptr<NaiveBestFitAllocator> naive_allocator =
        std::make_shared<NaiveBestFitAllocator>(p);
    allocators_[p] = naive_allocator;
  }
#endif

782 783 784 785
#ifdef PADDLE_WITH_ASCEND_CL
  // Registers a NaiveBestFitAllocator for NPU place `p` in allocators_.
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    std::shared_ptr<NaiveBestFitAllocator> naive_allocator =
        std::make_shared<NaiveBestFitAllocator>(p);
    allocators_[p] = naive_allocator;
  }
786 787 788 789 790

  void InitNaiveBestFitNPUPinnedAllocator() {
    allocators_[platform::NPUPinnedPlace()] =
        std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
  }
791 792
#endif

793 794 795 796 797 798 799 800 801 802
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  // Registers a NaiveBestFitAllocator for custom-device place `p` in
  // allocators_.
  void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
    std::shared_ptr<NaiveBestFitAllocator> naive_allocator =
        std::make_shared<NaiveBestFitAllocator>(p);
    allocators_[p] = naive_allocator;
  }

  // Registers an AutoGrowthBestFitAllocator for custom-device place `p` in
  // allocators_, layered on top of a fresh CustomAllocator for that place.
  //
  // p:                     custom-device place to register.
  // allow_free_idle_chunk: forwarded to AutoGrowthBestFitAllocator.
  void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
                                           bool allow_free_idle_chunk) {
    auto custom_allocator =
        std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
    // The second argument is the device's minimum chunk size as reported by
    // phi::DeviceManager.
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        custom_allocator, phi::DeviceManager::GetMinChunkSize(p),
        allow_free_idle_chunk);
  }
#endif

808 809 810 811 812 813 814 815
  // Fills the static system_allocators_ map with one allocator per place for
  // every backend compiled in. Does nothing if the map is already populated.
  //
  // Each backend section uses its own device-count variable so that enabling
  // more than one PADDLE_WITH_* macro does not redeclare a local in the same
  // scope.
  void InitSystemAllocators() {
    if (!system_allocators_.empty()) return;
    system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
    const int xpu_device_count = platform::GetXPUDeviceCount();
    for (int i = 0; i < xpu_device_count; ++i) {
      platform::XPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#ifdef PADDLE_WITH_IPU
    const int ipu_device_count = platform::GetIPUDeviceCount();
    for (int i = 0; i < ipu_device_count; ++i) {
      platform::IPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    system_allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<CPUPinnedAllocator>();
    const int gpu_device_count = platform::GetGPUDeviceCount();
    for (int i = 0; i < gpu_device_count; ++i) {
      platform::CUDAPlace p(i);
      system_allocators_[p] = CreateCUDAAllocator(p);
    }
#endif
#ifdef PADDLE_WITH_MLU
    const int mlu_device_count = platform::GetMLUDeviceCount();
    for (int i = 0; i < mlu_device_count; ++i) {
      platform::MLUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
    for (const auto& dev_type : device_types) {
      for (size_t dev_id = 0;
           dev_id < phi::DeviceManager::GetDeviceCount(dev_type); dev_id++) {
        platform::CustomPlace p(dev_type, dev_id);
        system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
      }
    }
#endif
  }
Z
Zeng Jinle 已提交
852 853

  void InitZeroSizeAllocators() {
854
    if (!zero_size_allocators_.empty()) return;
Z
Zeng Jinle 已提交
855 856
    std::vector<platform::Place> places;
    places.emplace_back(platform::CPUPlace());
857
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
858
    int device_count = platform::GetGPUDeviceCount();
Z
Zeng Jinle 已提交
859 860 861 862 863
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::CUDAPlace(dev_id));
    }
    places.emplace_back(platform::CUDAPinnedPlace());
#endif
864 865 866 867 868 869
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::XPUPlace(dev_id));
    }
#endif
870 871 872 873 874 875
#ifdef PADDLE_WITH_ASCEND_CL
    int device_count = platform::GetNPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::NPUPlace(dev_id));
    }
#endif
J
jianghaicheng 已提交
876 877 878 879 880 881
#ifdef PADDLE_WITH_IPU
    int device_count = platform::GetIPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::IPUPlace(dev_id));
    }
#endif
F
fwenguang 已提交
882 883 884 885 886 887
#ifdef PADDLE_WITH_MLU
    int device_count = platform::GetMLUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::MLUPlace(dev_id));
    }
#endif
888
#ifdef PADDLE_WITH_CUSTOM_DEVICE
889
    auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
890 891
    for (const auto& dev_type : device_types) {
      for (size_t dev_id = 0;
892
           dev_id < phi::DeviceManager::GetDeviceCount(dev_type); dev_id++) {
893 894 895 896
        places.emplace_back(platform::CustomPlace(dev_type, dev_id));
      }
    }
#endif
Z
Zeng Jinle 已提交
897 898 899

    for (auto& p : places) {
      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
Y
Yu Yang 已提交
900 901
    }
  }
Z
Zeng Jinle 已提交
902

903 904 905 906 907
  // Verifies that every allocator in `allocators` reports
  // IsAllocThreadSafe() == true; raises InvalidArgument otherwise.
  static void CheckAllocThreadSafe(const AllocatorMap& allocators) {
    for (auto& pair : allocators) {
      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true,
                        platform::errors::InvalidArgument(
                            "Public allocators must be thread safe"));
    }
  }
910

911 912 913 914
  // Runs the thread-safety check over allocators_, zero_size_allocators_ and
  // system_allocators_. On CUDA/HIP builds the per-stream cuda_allocators_
  // are also checked, but only when the stream-safe CUDA allocator is in use.
  void CheckAllocThreadSafe() const {
    CheckAllocThreadSafe(allocators_);
    CheckAllocThreadSafe(zero_size_allocators_);
    CheckAllocThreadSafe(system_allocators_);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (is_stream_safe_cuda_allocator_used_) {
      CheckCUDAAllocThreadSafe(cuda_allocators_);
    }
#endif
  }

  void WrapCUDARetryAllocator(size_t retry_time) {
923 924 925 926
    PADDLE_ENFORCE_GT(
        retry_time, 0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
927 928 929 930 931 932 933
    for (auto& pair : allocators_) {
      if (platform::is_gpu_place(pair.first)) {
        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
      }
    }
  }

934 935
  void WrapStatAllocator() {
    for (auto& pair : allocators_) {
936 937 938 939 940 941 942
      // Now memory stats is only supported for CPU and GPU
      const platform::Place& place = pair.first;
      if (platform::is_cpu_place(place) ||
          platform::is_cuda_pinned_place(place) ||
          platform::is_gpu_place(place)) {
        pair.second = std::make_shared<StatAllocator>(pair.second);
      }
943 944 945
    }
  }

946 947
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // a standalone CUDA allocator to support multi-stream GC in new executor
  // (filled per GPU place by WrapStreamSafeCUDAAllocatorForDefault)
  std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
      default_stream_safe_cuda_allocators_;
  // Per-place, per-stream allocators, indexed as cuda_allocators_[place][stream].
  CUDAAllocatorMap cuda_allocators_;
  // NOTE(review): presumably guards cuda_allocators_; its use is not visible
  // in this part of the file - confirm.
  std::shared_timed_mutex cuda_allocator_mutex_;
#endif
  AllocatorStrategy strategy_;
  // Per-place allocators owned by this instance.
  AllocatorMap allocators_;
  // Shared across all instances; filled once by InitZeroSizeAllocators() and
  // InitSystemAllocators() respectively.
  static AllocatorMap zero_size_allocators_;
  static AllocatorMap system_allocators_;
  // Its negation is passed to StreamSafeCUDAAllocator as
  // in_cuda_graph_capturing.
  bool allow_free_idle_chunk_;
  // When true, CheckAllocThreadSafe() also validates cuda_allocators_.
  bool is_stream_safe_cuda_allocator_used_;
959
};
960 961 962 963
// Out-of-class definitions of the static allocator maps declared in
// AllocatorFacadePrivate; both start empty and are shared by every instance.
AllocatorFacadePrivate::AllocatorMap
    AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;

Y
Refine  
Yu Yang 已提交
964
// Pimpl. Make interface clean.
965
// Creates the private implementation object and stores it in m_. The object
// is heap-allocated with a raw `new`; the destructor does not delete it.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
966 967 968
// Intentionally empty: deleting m_ may cause a core dump when the Python
// destructor sequence conflicts with the C++ one, so m_ is never freed here.
AllocatorFacade::~AllocatorFacade() {}
969 970

// Returns the process-wide AllocatorFacade. The instance is created on first
// use with `new` and is never deleted.
AllocatorFacade& AllocatorFacade::Instance() {
  static AllocatorFacade* instance = new AllocatorFacade;
  return *instance;
}

// Selects the private implementation to use for the current call.
//
// On CUDA builds, while a CUDA Graph is being capturing, the memory pool
// registered in cuda_graph_map_ for the capturing pool ID is returned; a
// PermissionDenied error is raised if no pool was prepared for that ID.
// In every other case the default implementation m_ is returned.
AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
#ifdef PADDLE_WITH_CUDA
  if (UNLIKELY(IsCUDAGraphCapturing())) {
    auto id = platform::CUDAGraph::CapturingPoolID();
    auto iter = cuda_graph_map_.find(id);
    PADDLE_ENFORCE_NE(
        iter, cuda_graph_map_.end(),
        platform::errors::PermissionDenied(
            "No memory pool is prepared for CUDA Graph capturing."));
    VLOG(10) << "Choose CUDA Graph memory pool";
    return iter->second.get();
  }
#endif
  return m_;
}

991 992
// Returns the allocator registered for `place`. A non-zero size is passed to
// the private lookup so that it picks the regular allocator rather than the
// zero-size one.
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return GetPrivate()->GetAllocator(
      place, /* A non-zero num to choose allocator_ */ 1);
}

997
void* AllocatorFacade::GetBasePtr(
998
    const std::shared_ptr<phi::Allocation>& allocation) {
999 1000 1001 1002 1003 1004 1005 1006 1007 1008
  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
                    paddle::platform::errors::Unimplemented(
                        "GetBasePtr() is only implemented for auto_growth "
                        "strategy, not support allocator strategy: %d",
                        static_cast<int>(GetAllocatorStrategy())));
  PADDLE_ENFORCE_EQ(platform::is_gpu_place(allocation->place()), true,
                    paddle::platform::errors::Unimplemented(
                        "GetBasePtr() is only implemented for CUDAPlace(), not "
                        "suppot place: %s",
                        allocation->place()));
1009
  return GetPrivate()->GetBasePtr(allocation);
1010 1011
}

1012 1013
// Returns the allocator used for zero-size requests on `place`; a size of 0
// is passed to the private lookup to select it.
const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
    const platform::Place& place) {
  return GetPrivate()->GetAllocator(place, /* zero size */ 0);
}

1017
std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
1018
    const platform::Place& place, size_t size) {
1019
  return std::shared_ptr<phi::Allocation>(Alloc(place, size));
1020 1021
}

1022 1023
// Allocates `size` bytes on `place` using the allocator that the private
// implementation selects for this (place, size) pair.
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                     size_t size) {
  return GetPrivate()->GetAllocator(place, size)->Allocate(size);
}

W
Wilber 已提交
1027
// Calls Release(place) on the regular (non-zero-size) allocator of `place`
// and returns its result.
uint64_t AllocatorFacade::Release(const platform::Place& place) {
  return GetPrivate()
      ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
      ->Release(place);
}

1033 1034
// Stream-aware variant: allocates `size` bytes on `place` for `stream` via
// Alloc() and hands the result over to a std::shared_ptr.
std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size, const phi::Stream& stream) {
  return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
}

1038 1039
// Stream-aware allocation of `size` bytes on the GPU identified by `place`.
//
// - If the stream-safe CUDA allocator is not in use, falls back to the
//   stream-agnostic Alloc(place, size).
// - For a non-empty request with FLAGS_use_system_allocator disabled, uses
//   the allocator bound to `stream`, creating it if it does not exist yet.
// - Otherwise uses the allocator selected by (place, size).
//
// Raises PreconditionNotMet when built without CUDA/HIP.
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
                                     const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  AllocatorFacadePrivate* m = GetPrivate();
  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
    return Alloc(place, size);
  }

  platform::CUDAPlace p(place.GetDeviceId());
  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
    gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
    return m->GetAllocator(p, s, /* create_if_not_found = */ true)
        ->Allocate(size);
  } else {
    return m->GetAllocator(p, size)->Allocate(size);
  }
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
}

1060 1061 1062
// Returns true when the stream recorded for `allocation` (see GetStream) is
// the same as the GPU stream identified by `stream.id()`.
//
// Raises PreconditionNotMet when built without CUDA/HIP.
bool AllocatorFacade::InSameStream(
    const std::shared_ptr<phi::Allocation>& allocation,
    const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
  return s == GetStream(allocation);
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
}

1071 1072 1073 1074
bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() {
  return GetPrivate()->IsStreamSafeCUDAAllocatorUsed();
}

1075
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
1076
// Stream-aware Release: calls Release(place) on the allocator bound to
// (place, stream) and returns its result. Falls back to the stream-agnostic
// Release(place) when the stream-safe CUDA allocator is not in use.
uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                  gpuStream_t stream) {
  AllocatorFacadePrivate* m = GetPrivate();
  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
    return Release(place);
  }

  return m->GetAllocator(place, stream)->Release(place);
}

1087
void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
1088
                                   gpuStream_t stream) {
1089
  GetPrivate()->RecordStream(allocation, stream);
1090 1091
}

1092
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
1093
    const platform::Place& place, gpuStream_t stream) {
1094 1095 1096 1097 1098
  AllocatorFacadePrivate* m = GetPrivate();

  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
    return GetAllocator(place);
1099
  }
1100 1101 1102 1103 1104 1105

  if (platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) {
    return m->GetAllocator(place, stream,
                           /*create_if_not_found=*/true);
  }
  return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
1106 1107
}

1108
// Returns the GPU stream that the private implementation reports for
// `allocation`.
gpuStream_t AllocatorFacade::GetStream(
    const std::shared_ptr<phi::Allocation>& allocation) const {
  return GetPrivate()->GetStream(allocation);
}

1113
// Sets the default stream for `place` on the primary implementation m_.
// Does nothing when m_ does not use the stream-safe CUDA allocator.
//
// Note that this works on m_ directly rather than on GetPrivate(), so a CUDA
// Graph memory pool is never the target of this call.
void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
                                       gpuStream_t stream) {
  if (m_->IsStreamSafeCUDAAllocatorUsed()) {
    m_->SetDefaultStream(place, stream);
  }
}

1120
#ifdef PADDLE_WITH_CUDA
1121
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(int64_t id) {
1122 1123 1124 1125 1126 1127 1128
  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
                    platform::errors::InvalidArgument(
                        "CUDA Graph is only supported when the "
                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
                        "FLAGS_allocator_strategy=\"%s\"",
                        FLAGS_allocator_strategy));
  auto& allocator = cuda_graph_map_[id];
1129 1130 1131 1132 1133 1134 1135 1136 1137
  auto& ref_cnt = cuda_graph_ref_cnt_[id];
  if (allocator.get() == nullptr) {
    allocator.reset(
        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
    VLOG(10) << "Create memory pool for CUDA Graph with memory ID " << id;
  } else {
    VLOG(10) << "Use created memory pool for CUDA Graph with memory ID " << id;
  }
  ++ref_cnt;
1138 1139
}

1140 1141 1142
void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
  auto ref_cnt_iter = cuda_graph_ref_cnt_.find(id);
  PADDLE_ENFORCE_NE(ref_cnt_iter, cuda_graph_ref_cnt_.end(),
1143
                    platform::errors::InvalidArgument(
1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154
                        "Cannot find CUDA Graph with memory ID = %d", id));
  auto& ref_cnt = ref_cnt_iter->second;
  --ref_cnt;
  if (ref_cnt == 0) {
    cuda_graph_map_.erase(id);
    cuda_graph_ref_cnt_.erase(ref_cnt_iter);
    VLOG(10) << "Remove memory pool of CUDA Graph with memory ID " << id;
  } else {
    VLOG(10) << "Decrease memory pool ID " << id << " reference count to be "
             << ref_cnt;
  }
1155 1156
}
#endif
1157
#endif
1158 1159 1160
}  // namespace allocation
}  // namespace memory
}  // namespace paddle