// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator_facade.h"

#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <shared_mutex>
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_managed_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#endif

#if CUDA_VERSION >= 10020
#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif
#endif

#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif

#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif

#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#endif

PADDLE_DEFINE_EXPORTED_int64(
    gpu_allocator_retry_time, 10000,
    "The retry time (milliseconds) when allocator fails "
    "to allocate memory. No retry if this value is not greater than 0");

PADDLE_DEFINE_EXPORTED_bool(
    use_system_allocator, false,
    "Whether to use system allocator to allocate CPU and GPU memory. "
    "Only used for unittests.");

PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
                            "Use VirtualMemoryAutoGrowthBestFitAllocator.");

// NOTE(Ruibiao): This flag exists only for compatibility with
// the old single-stream CUDA allocator. It will be removed
// after StreamSafeCudaAllocator has been fully tested.
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false,
                            "Enable StreamSafeCUDAAllocator");

PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory, false,
                            "Whether to use CUDAManagedAllocator to allocate "
                            "managed memory, only available for auto_growth "
                            "strategy");

DECLARE_string(allocator_strategy);

namespace paddle {
namespace memory {
namespace allocation {

#ifdef PADDLE_WITH_CUDA
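// CUDAGraphAllocator decorates another allocator for use while capturing a
// CUDA Graph: each PrivateAllocation it hands out keeps a shared_ptr back to
// this allocator and owns the decorated allocation, so the underlying memory
// stays alive as long as the graph's allocations do.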
class CUDAGraphAllocator
    : public Allocator,
      public std::enable_shared_from_this<CUDAGraphAllocator> {
 private:
  class PrivateAllocation : public Allocation {
   public:
    PrivateAllocation(CUDAGraphAllocator* allocator,
                      DecoratedAllocationPtr underlying_allocation)
        : Allocation(
              underlying_allocation->ptr(), underlying_allocation->base_ptr(),
              underlying_allocation->size(), underlying_allocation->place()),
          allocator_(allocator->shared_from_this()),
          underlying_allocation_(std::move(underlying_allocation)) {}

   private:
    std::shared_ptr<Allocator> allocator_;
    DecoratedAllocationPtr underlying_allocation_;
  };

  explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
      : underlying_allocator_(allocator) {}

 public:
  static std::shared_ptr<Allocator> Create(
      const std::shared_ptr<Allocator>& allocator) {
    return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
  }

 protected:
  phi::Allocation* AllocateImpl(size_t size) {
    VLOG(10) << "Allocate " << size << " for CUDA Graph";
    return new PrivateAllocation(this,
                                 static_unique_ptr_cast<Allocation>(
                                     underlying_allocator_->Allocate(size)));
  }

  void FreeImpl(phi::Allocation* allocation) {
    VLOG(10) << "delete for CUDA Graph";
    delete allocation;
  }

 private:
  std::shared_ptr<Allocator> underlying_allocator_;
};
#endif

class AllocatorFacadePrivate {
 public:
  using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  using CUDAAllocatorMap =
      std::map<platform::CUDAPlace,
               std::map<gpuStream_t, std::shared_ptr<Allocator>>>;
#endif

  explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
    strategy_ = GetAllocatorStrategy();
    switch (strategy_) {
      case AllocatorStrategy::kNaiveBestFit: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        PADDLE_ENFORCE_EQ(
            FLAGS_use_stream_safe_cuda_allocator, false,
            paddle::platform::errors::Unimplemented(
                "StreamSafeCUDAAllocator is only implemented for the "
                "auto_growth strategy; it does not support the "
                "naive_best_fit strategy"));

        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
        InitNaiveBestFitNPUPinnedAllocator();
#endif
#ifdef PADDLE_WITH_MLU
        for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
          InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
        for (const auto& dev_type : device_types) {
          for (size_t dev_id = 0;
               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
               ++dev_id) {
            InitNaiveBestFitCustomDeviceAllocator(
                platform::CustomPlace(dev_type, dev_id));
          }
        }
#endif
        break;
      }

      case AllocatorStrategy::kAutoGrowth: {
        InitNaiveBestFitCPUAllocator();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        allow_free_idle_chunk_ = allow_free_idle_chunk;
        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
                                      allow_free_idle_chunk_);
        }

        // Note(Ruibiao): In the GPU multi-stream case, the 'allocators_' map
        // (place -> Allocator) holds the StreamSafeCUDAAllocator related to
        // the default stream (i.e., the stream obtained directly from the
        // DeviceContext), while the 'cuda_allocators_' map
        // (place -> map(stream -> Allocator)) holds the
        // StreamSafeCUDAAllocators related to non-default streams (i.e.,
        // streams that users pass in). The default-stream allocator is built
        // when AllocatorFacadePrivate is constructed, while
        // non-default-stream allocators are built lazily by GetAllocator
        // with 'create_if_not_found = true'. The default stream is treated
        // specially for performance: most Alloc calls in an application
        // target the default stream, so handling it separately avoids the
        // overhead of looking up the default stream and taking the
        // read-write lock on every allocation.
        if (FLAGS_use_stream_safe_cuda_allocator) {
          WrapStreamSafeCUDAAllocatorForDefault();
        }

        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
        InitNaiveBestFitNPUPinnedAllocator();
#endif
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_MLU
        for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
          InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
        auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
        for (const auto& dev_type : device_types) {
          for (size_t dev_id = 0;
               dev_id < phi::DeviceManager::GetDeviceCount(dev_type);
               ++dev_id) {
            InitAutoGrowthCustomDeviceAllocator(
                platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk);
          }
        }
#endif
        break;
      }

      case AllocatorStrategy::kThreadLocal: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        PADDLE_ENFORCE_EQ(
            FLAGS_use_stream_safe_cuda_allocator, false,
            paddle::platform::errors::Unimplemented(
                "StreamSafeCUDAAllocator is only implemented for the "
                "auto_growth strategy; it does not support the "
                "thread_local strategy"));

        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
          InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_MLU
        for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
          InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
        }
#endif
        break;
      }

      default: {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
      }
    }
    InitZeroSizeAllocators();
    InitSystemAllocators();

    if (FLAGS_gpu_allocator_retry_time > 0) {
      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
    }

    CheckAllocThreadSafe();

#ifdef PADDLE_WITH_CUDA
    if (FLAGS_use_stream_safe_cuda_allocator == false &&
        UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
      WrapCUDAGraphAllocator();
    }
#endif
  }

  inline const std::shared_ptr<Allocator>& GetAllocator(
      const platform::Place& place, size_t size) {
    VLOG(6) << "GetAllocator"
            << " " << place << " " << size;
    const auto& allocators =
        (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                          : GetAllocatorMap())
                  : zero_size_allocators_);
    auto iter = allocators.find(place);
    PADDLE_ENFORCE_NE(iter, allocators.end(),
                      platform::errors::NotFound(
                          "No allocator found for the place, %s", place));
    return iter->second;
  }

  void* GetBasePtr(const std::shared_ptr<phi::Allocation>& allocation) {
    return static_cast<Allocation*>(allocation.get())->base_ptr();
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  bool HasCUDAAllocator(const platform::CUDAPlace& place,
                        const gpuStream_t& stream) {
    auto it = cuda_allocators_.find(place);
    if (it == cuda_allocators_.end()) {
      return false;
    }
    const std::map<gpuStream_t, std::shared_ptr<Allocator>>& allocator_map =
        it->second;
    return allocator_map.find(stream) != allocator_map.end();
  }

  const std::shared_ptr<Allocator>& GetAllocator(
      const platform::CUDAPlace& place, const gpuStream_t& stream,
      bool create_if_not_found = false) {
    if (stream == GetDefaultStream(place)) {
      VLOG(7) << "Get Allocator by passing in a default stream";
      return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
    }

    /* shared_lock_guard */ {
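      // Fast path: under a shared (read) lock, return the allocator if it
      // already exists. Otherwise fall through to the unique (write) lock
      // below and build it; InitStreamSafeCUDAAllocator re-checks existence,
      // so a thread that loses the race reuses the allocator built first.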
      std::shared_lock<std::shared_timed_mutex> lock_guard(
          cuda_allocator_mutex_);
      if (LIKELY(HasCUDAAllocator(place, stream))) {
        return cuda_allocators_[place][stream];
      } else {
        PADDLE_ENFORCE_NE(create_if_not_found, false,
                          platform::errors::NotFound(
                              "No allocator found for stream %s in place %s "
                              "with create_if_not_found = false",
                              stream, place));
      }
    }

    /* unique_lock_guard */ {
      std::unique_lock<std::shared_timed_mutex> lock_guard(
          cuda_allocator_mutex_);
      InitStreamSafeCUDAAllocator(place, stream);
      return cuda_allocators_[place][stream];
    }
  }

  const std::shared_ptr<StreamSafeCUDAAllocator>
  GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
    const auto iter = default_stream_safe_cuda_allocators_.find(place);
    PADDLE_ENFORCE_NE(
        iter, default_stream_safe_cuda_allocators_.end(),
        platform::errors::NotFound(
            "No StreamSafeCUDAAllocator found for the place, %s", place));
    return iter->second;
  }

  const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const {
    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
        GetDefaultStreamSafeCUDAAllocator(place);
    return allocator->GetDefaultStream();
  }

  void SetDefaultStream(const platform::CUDAPlace& place,
                        const gpuStream_t& stream) {
    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
        GetDefaultStreamSafeCUDAAllocator(place);
    allocator->SetDefaultStream(stream);
    VLOG(8) << "Set default stream to " << stream
            << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
            << place;
  }

  void SetDefaultStreamFromDeviceContext() {
    VLOG(8) << "Set default stream from DeviceContext";
    for (auto& pair : default_stream_safe_cuda_allocators_) {
      platform::DeviceContextPool& pool =
          platform::DeviceContextPool::Instance();
      pair.second->SetDefaultStream(
          static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
    }
  }

  void RecordStream(std::shared_ptr<phi::Allocation> allocation,
                    const gpuStream_t& stream) {
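    // Mark 'allocation' as used on 'stream' so that the underlying memory is
    // not returned to the allocator until every recorded stream has finished
    // with it.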
    if (allocation->size() == 0) {
      return;
    }

    StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
        dynamic_cast<StreamSafeCUDAAllocation*>(allocation.get());
    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
                            platform::errors::InvalidArgument(
                                "Failed to dynamic cast %p from Allocation* to "
                                "StreamSafeCUDAAllocation*",
                                allocation.get()));
    stream_safe_cuda_allocation->RecordStream(stream);
  }

  const gpuStream_t& GetStream(
      const std::shared_ptr<phi::Allocation>& allocation) const {
    const StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
        dynamic_cast<const StreamSafeCUDAAllocation*>(allocation.get());
    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
                            platform::errors::InvalidArgument(
                                "Failed to dynamic cast %p from Allocation* to "
                                "StreamSafeCUDAAllocation*",
                                allocation.get()));
    return stream_safe_cuda_allocation->GetOwningStream();
  }
#endif

 private:
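  // Allocator for zero-byte requests: it returns a dummy Allocation holding
  // nullptr, so empty tensors never touch a real allocator.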
  class ZeroSizeAllocator : public Allocator {
   public:
    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
    bool IsAllocThreadSafe() const override { return true; }

   protected:
    phi::Allocation* AllocateImpl(size_t size) override {
      return new Allocation(nullptr, 0, place_);
    }
    void FreeImpl(phi::Allocation* allocation) override { delete allocation; }

   private:
    platform::Place place_;
  };

  const AllocatorMap& GetAllocatorMap() { return allocators_; }

  void InitNaiveBestFitCPUAllocator() {
    allocators_[platform::CPUPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void InitNaiveBestFitCUDAPinnedAllocator() {
    allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
  }

  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  // Create a new CUDAAllocator or CUDAManagedAllocator for the given device
  std::shared_ptr<Allocator> CreateCUDAAllocator(platform::CUDAPlace p) {
    if (FLAGS_use_cuda_managed_memory) {
      PADDLE_ENFORCE_EQ(
          strategy_, AllocatorStrategy::kAutoGrowth,
          platform::errors::InvalidArgument(
              "CUDA managed memory is only implemented for the auto_growth "
              "strategy; it does not support the %s strategy.\n"
              "Please use auto_growth strategy by command `export "
              "FLAGS_allocator_strategy=\"auto_growth\"`, or disable managed "
              "memory by command `export FLAGS_use_cuda_managed_memory=false`",
              FLAGS_allocator_strategy));

      if (!platform::IsGPUManagedMemorySupported(p.device)) {
        PADDLE_THROW(platform::errors::Unavailable(
            "Failed to create CUDAManagedAllocator on GPU %d.\n\n"
            "You have enabled CUDA managed memory, but the GPU device does "
            "not support allocating managed memory.\n"
            "If you don't actually need to use managed memory, please disable "
            "it with command `export FLAGS_use_cuda_managed_memory=false`.\n"
            "Otherwise, use a GPU device that supports managed memory.",
            p.device));
      }
      return std::make_shared<CUDAManagedAllocator>(p);
    }
    return std::make_shared<CUDAAllocator>(p);
  }

  void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    PADDLE_ENFORCE_EQ(
        strategy_, AllocatorStrategy::kAutoGrowth,
        platform::errors::Unimplemented(
            "Only the auto_growth strategy is supported for "
            "StreamSafeCUDAAllocator; allocator strategy %d does not support "
            "multi-stream",
            static_cast<int>(strategy_)));
    if (LIKELY(!HasCUDAAllocator(p, stream))) {
      VLOG(8) << "Init CUDA allocator for stream " << stream << " in place "
              << p;
      InitAutoGrowthCUDAAllocator(p, stream);
      WrapStreamSafeCUDAAllocator(p, stream);
      WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
    }
  }

  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
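    // Choose the underlying allocator for this (place, stream):
    // - HIP builds use AutoGrowthBestFitAllocator directly;
    // - CUDA >= 10.2 uses the virtual-memory-based auto-growth allocator
    //   when the device supports virtual address management and
    //   FLAGS_use_virtual_memory_auto_growth is set;
    // - otherwise, AutoGrowthBestFitAllocator over an (optionally aligned)
    //   allocator from CreateCUDAAllocator.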
#if defined(PADDLE_WITH_HIP)
    auto cuda_allocator = CreateCUDAAllocator(p);
    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
    CUdevice device;
    int val;
    try {
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGetAttribute(
              &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
              device));
    } catch (...) {
      val = 0;
    }

    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
      cuda_allocators_[p][stream] =
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
      auto cuda_allocator = CreateCUDAAllocator(p);
      cuda_allocators_[p][stream] =
          std::make_shared<AutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(),
              allow_free_idle_chunk_);
    }
#else
    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: the CUDA runtime does not survive fork(), so calling any CUDA
    // API in a forked child may return cuda error(3), i.e.,
    // cudaErrorInitializationError, even though the CUDAAllocator is only
    // initialized and never actually used there. The try-catch below handles
    // the case where GetDeviceProperties() fails in a subprocess (for
    // example, in a dataloader with num_workers > 0).
    try {
      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
      need_addr_align = prop.textureAlignment < alignment;
      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
              << prop.textureAlignment
              << ", set need_addr_align=" << need_addr_align;
    } catch (...) {
      need_addr_align = true;
      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
    }
    // The address returned is aligned already,
    // ref:
    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
    std::shared_ptr<Allocator> underlying_allocator{nullptr};
    if (need_addr_align) {
      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
      underlying_allocator =
          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
    } else {
      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
      underlying_allocator = cuda_allocator;
    }

    cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
        underlying_allocator, alignment, 0, allow_free_idle_chunk_);
#endif
#endif
  }

  // NOTE(Ruibiao): Old single-stream version, will be removed later
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                   bool allow_free_idle_chunk) {
#if defined(PADDLE_WITH_HIP)
    auto cuda_allocator = CreateCUDAAllocator(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
    CUdevice device;
    int val;
    try {
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cuDeviceGetAttribute(
              &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
              device));
    } catch (...) {
      val = 0;
    }

    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
      allocators_[p] =
          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
              cuda_allocator, platform::GpuMinChunkSize(), p);
    } else {
      auto cuda_allocator = CreateCUDAAllocator(p);
      allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
          cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
    }

#else
    auto cuda_allocator = CreateCUDAAllocator(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: the CUDA runtime does not survive fork(), so calling any CUDA
    // API in a forked child may return cuda error(3), i.e.,
    // cudaErrorInitializationError, even though the CUDAAllocator is only
    // initialized and never actually used there. The try-catch below handles
    // the case where GetDeviceProperties() fails in a subprocess (for
    // example, in a dataloader with num_workers > 0).
    try {
      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
      need_addr_align = prop.textureAlignment < alignment;
      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
              << prop.textureAlignment
              << ", set need_addr_align=" << need_addr_align;
    } catch (...) {
      need_addr_align = true;
      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
    }
    // The address returned is aligned already,
    // ref:
    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
    std::shared_ptr<Allocator> underlying_allocator{nullptr};
    if (need_addr_align) {
      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
      underlying_allocator =
          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
    } else {
      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
      underlying_allocator = cuda_allocator;
    }
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        underlying_allocator, alignment, 0, allow_free_idle_chunk);
#endif
#endif
  }

  void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
  }

  void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
    allocator = std::make_shared<StreamSafeCUDAAllocator>(
        allocator, p, stream,
        /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
  }

  void WrapStreamSafeCUDAAllocatorForDefault() {
    for (auto& pair : allocators_) {
      auto& place = pair.first;
      if (platform::is_gpu_place(place)) {
        std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
            std::make_shared<StreamSafeCUDAAllocator>(
                pair.second, place, /* default_stream = */ nullptr,
                /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
        pair.second = allocator;

        // NOTE(Ruibiao): A tricky implementation that gives
        // StreamSafeCUDAAllocator the ability to interact with the outside
        // world, i.e., to have its default stream changed from outside.
        default_stream_safe_cuda_allocators_[place] = allocator;
        VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
                << ", allocator address = " << pair.second.get();
      }
    }
  }

  void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
                              size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time, 0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
    allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
  }

#ifdef PADDLE_WITH_CUDA
  void WrapCUDAGraphAllocator() {
    for (auto& item : allocators_) {
      auto& allocator = item.second;
      allocator = CUDAGraphAllocator::Create(allocator);
    }
  }
#endif

  static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
    for (auto& place_pair : allocators) {
      for (auto& stream_pair : place_pair.second) {
        PADDLE_ENFORCE_EQ(stream_pair.second->IsAllocThreadSafe(), true,
                          platform::errors::InvalidArgument(
                              "Public allocators must be thread safe"));
      }
    }
  }
#endif

#ifdef PADDLE_WITH_XPU
  void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_IPU
  void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_MLU
  void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_ASCEND_CL
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitNaiveBestFitNPUPinnedAllocator() {
    allocators_[platform::NPUPinnedPlace()] =
        std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
  }
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
  void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
                                           bool allow_free_idle_chunk) {
    auto custom_allocator =
        std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        custom_allocator, phi::DeviceManager::GetMinChunkSize(p),
        allow_free_idle_chunk);
  }
#endif

  void InitSystemAllocators() {
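    // System allocators bypass the configured strategy and are returned by
    // GetAllocator when FLAGS_use_system_allocator is set (unittests only).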
    if (!system_allocators_.empty()) return;
    system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::XPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#ifdef PADDLE_WITH_IPU
    int device_count = platform::GetIPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::IPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    system_allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<CPUPinnedAllocator>();
    int device_count = platform::GetGPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::CUDAPlace p(i);
      system_allocators_[p] = CreateCUDAAllocator(p);
    }
#endif
#ifdef PADDLE_WITH_MLU
    int device_count = platform::GetMLUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::MLUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
  }

  void InitZeroSizeAllocators() {
    if (!zero_size_allocators_.empty()) return;
    std::vector<platform::Place> places;
    places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    int device_count = platform::GetGPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::CUDAPlace(dev_id));
    }
    places.emplace_back(platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::XPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
    int device_count = platform::GetNPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::NPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_IPU
    int device_count = platform::GetIPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::IPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_MLU
    int device_count = platform::GetMLUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::MLUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
    for (const auto& dev_type : device_types) {
      for (size_t dev_id = 0;
           dev_id < phi::DeviceManager::GetDeviceCount(dev_type); dev_id++) {
        places.emplace_back(platform::CustomPlace(dev_type, dev_id));
      }
    }
#endif

    for (auto& p : places) {
      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
    }
  }

  static void CheckAllocThreadSafe(const AllocatorMap& allocators) {
    for (auto& pair : allocators) {
      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true,
                        platform::errors::InvalidArgument(
                            "Public allocators must be thread safe"));
    }
  }

  void CheckAllocThreadSafe() const {
    CheckAllocThreadSafe(allocators_);
    CheckAllocThreadSafe(zero_size_allocators_);
    CheckAllocThreadSafe(system_allocators_);
882 883 884 885 886
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (FLAGS_use_stream_safe_cuda_allocator) {
      CheckCUDAAllocThreadSafe(cuda_allocators_);
    }
#endif
  }

  void WrapCUDARetryAllocator(size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time, 0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    for (auto& pair : allocators_) {
      if (platform::is_gpu_place(pair.first)) {
        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
      }
    }
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // A standalone CUDA allocator to support multi-stream GC in the new
  // executor.
  std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
      default_stream_safe_cuda_allocators_;
  CUDAAllocatorMap cuda_allocators_;
  std::shared_timed_mutex cuda_allocator_mutex_;
#endif
  AllocatorStrategy strategy_;
  AllocatorMap allocators_;
  static AllocatorMap zero_size_allocators_;
  static AllocatorMap system_allocators_;
  bool allow_free_idle_chunk_;
};
AllocatorFacadePrivate::AllocatorMap
    AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;

// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// Deleting m_ here may cause a core dump when Python's teardown conflicts
// with C++ static destruction, so the pointer is intentionally leaked.
AllocatorFacade::~AllocatorFacade() {}

AllocatorFacade& AllocatorFacade::Instance() {
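  // Heap-allocate the singleton and never free it, to avoid the static
  // destruction-order problems described in the destructor note above.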
  static AllocatorFacade* instance = new AllocatorFacade;
  return *instance;
}

AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
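  // While this thread is capturing a CUDA Graph, route allocations to the
  // graph's private memory pool instead of the global one.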
#ifdef PADDLE_WITH_CUDA
  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
    auto id = platform::CUDAGraph::CapturingID();
    auto iter = cuda_graph_map_.find(id);
    PADDLE_ENFORCE_NE(
        iter, cuda_graph_map_.end(),
        platform::errors::PermissionDenied(
            "No memory pool is prepared for CUDA Graph capturing."));
    VLOG(10) << "Choose CUDA Graph memory pool";
    return iter->second.get();
  }
#endif
  return m_;
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return GetPrivate()->GetAllocator(
      place, /* A non-zero num to choose allocator_ */ 1);
}

void* AllocatorFacade::GetBasePtr(
    const std::shared_ptr<phi::Allocation>& allocation) {
  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
                    paddle::platform::errors::Unimplemented(
                        "GetBasePtr() is only implemented for the auto_growth "
                        "strategy; it does not support allocator strategy: %d",
                        static_cast<int>(GetAllocatorStrategy())));
  PADDLE_ENFORCE_EQ(platform::is_gpu_place(allocation->place()), true,
                    paddle::platform::errors::Unimplemented(
                        "GetBasePtr() is only implemented for CUDAPlace(); it "
                        "does not support place: %s",
                        allocation->place()));
  return GetPrivate()->GetBasePtr(allocation);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
    const platform::Place& place) {
  return GetPrivate()->GetAllocator(place, /* zero size */ 0);
}

std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size) {
  return std::shared_ptr<phi::Allocation>(Alloc(place, size));
}

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                     size_t size) {
  return GetPrivate()->GetAllocator(place, size)->Allocate(size);
}

uint64_t AllocatorFacade::Release(const platform::Place& place) {
  return GetPrivate()
      ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
      ->Release(place);
}

std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size, const phi::Stream& stream) {
  PADDLE_ENFORCE_EQ(
      FLAGS_use_stream_safe_cuda_allocator, true,
      platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "multi-stream 'AllocShared' function. To enable it, you can enter "
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
          "terminal."));
  return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
}
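
// Illustrative sketch of the multi-stream path above (not part of this
// file's API; 'user_stream' and 'other_stream' are assumed to be created by
// the caller, e.g. via cudaStreamCreate):
//
//   platform::CUDAPlace place(0);
//   phi::Stream stream(reinterpret_cast<phi::StreamId>(user_stream));
//   auto buf = AllocatorFacade::Instance().AllocShared(place, 1024, stream);
//   // If 'buf' is later used on another stream, record that stream so the
//   // memory is not reused until it has finished:
//   AllocatorFacade::Instance().RecordStream(buf, other_stream);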

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
                                     const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  PADDLE_ENFORCE_EQ(
      FLAGS_use_stream_safe_cuda_allocator, true,
      platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "multi-stream 'Alloc' function. To enable it, you can enter "
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
          "terminal."));

  platform::CUDAPlace p(place.GetDeviceId());
  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
    gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
    return GetPrivate()
        ->GetAllocator(p, s, /* create_if_not_found = */ true)
        ->Allocate(size);
  } else {
    return GetPrivate()->GetAllocator(p, size)->Allocate(size);
  }
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
}

bool AllocatorFacade::InSameStream(
    const std::shared_ptr<phi::Allocation>& allocation,
    const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  PADDLE_ENFORCE_EQ(
      FLAGS_use_stream_safe_cuda_allocator, true,
      platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "multi-stream 'InSameStream' function. To enable it, you can enter "
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
          "terminal."));
  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
  return s == GetStream(allocation);
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                  const gpuStream_t& stream) {
  PADDLE_ENFORCE_EQ(
      FLAGS_use_stream_safe_cuda_allocator, true,
      platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "multi-stream 'Release' function. To enable it, you can enter "
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
          "terminal."));
  return GetPrivate()->GetAllocator(place, stream)->Release(place);
}

void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
                                   const gpuStream_t& stream) {
  PADDLE_ENFORCE_EQ(
      FLAGS_use_stream_safe_cuda_allocator, true,
      platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "'RecordStream' function. To enable it, you can enter "
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
          "terminal."));
  GetPrivate()->RecordStream(allocation, stream);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place, const gpuStream_t& stream) {
  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
      FLAGS_use_system_allocator == false) {
    return GetPrivate()->GetAllocator(place, stream,
                                      /*create_if_not_found=*/true);
  }
  return GetPrivate()->GetAllocator(
      place, /* A non-zero num to choose allocator_ */ 1);
}

const gpuStream_t& AllocatorFacade::GetStream(
    const std::shared_ptr<phi::Allocation>& allocation) const {
  PADDLE_ENFORCE_EQ(
      FLAGS_use_stream_safe_cuda_allocator, true,
      platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "'GetStream' function. To enable it, you can enter "
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
          "terminal."));
  return GetPrivate()->GetStream(allocation);
}

void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
                                       const gpuStream_t& stream) {
  if (FLAGS_use_stream_safe_cuda_allocator) {
    GetPrivate()->SetDefaultStream(place, stream);
  }
}

#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
                    platform::errors::InvalidArgument(
                        "CUDA Graph is only supported when the "
                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
                        "FLAGS_allocator_strategy=\"%s\"",
                        FLAGS_allocator_strategy));
  auto& allocator = cuda_graph_map_[id];
  PADDLE_ENFORCE_EQ(
      allocator.get(), nullptr,
      platform::errors::InvalidArgument(
          "The memory pool of the CUDA Graph with ID %d has already been "
          "prepared.",
          id));
  allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
  allocator->SetDefaultStreamFromDeviceContext();

  VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}

void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
  auto iter = cuda_graph_map_.find(id);
  PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(),
                    platform::errors::InvalidArgument(
                        "Cannot find CUDA Graph with ID = %d", id));
  cuda_graph_map_.erase(iter);
  VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif
#endif
}  // namespace allocation
}  // namespace memory
}  // namespace paddle