// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator_facade.h"

#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu/xpu_info.h"
#endif
#include "paddle/fluid/platform/npu_info.h"

PADDLE_DEFINE_EXPORTED_int64(
    gpu_allocator_retry_time, 10000,
    "The retry time (in milliseconds) when the allocator fails "
    "to allocate memory. No retry if this value is not greater than 0.");

PADDLE_DEFINE_EXPORTED_bool(
    use_system_allocator, false,
    "Whether to use system allocator to allocate CPU and GPU memory. "
    "Only used for unittests.");

DECLARE_string(allocator_strategy);

namespace paddle {
namespace memory {
namespace allocation {

#ifdef PADDLE_WITH_CUDA
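// Decorates an existing allocator for use while a CUDA Graph is being
// captured: each allocation handed out keeps a reference to this allocator and
// to the underlying allocation, so the memory pool backing the captured graph
// stays alive as long as its allocations do.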
class CUDAGraphAllocator
    : public Allocator,
      public std::enable_shared_from_this<CUDAGraphAllocator> {
 private:
  class PrivateAllocation : public Allocation {
   public:
    PrivateAllocation(CUDAGraphAllocator* allocator,
                      AllocationPtr underlying_allocation)
        : Allocation(underlying_allocation->ptr(),
                     underlying_allocation->size(),
                     underlying_allocation->place()),
          allocator_(allocator->shared_from_this()),
          underlying_allocation_(std::move(underlying_allocation)) {}

   private:
    std::shared_ptr<Allocator> allocator_;
    AllocationPtr underlying_allocation_;
  };

  explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
      : underlying_allocator_(allocator) {}

 public:
  static std::shared_ptr<Allocator> Create(
      const std::shared_ptr<Allocator>& allocator) {
    return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
  }

 protected:
  Allocation* AllocateImpl(size_t size) {
    VLOG(10) << "Allocate " << size << " for CUDA Graph";
    return new PrivateAllocation(this, underlying_allocator_->Allocate(size));
  }

  void FreeImpl(Allocation* allocation) {
    VLOG(10) << "delete for CUDA Graph";
    delete allocation;
  }

 private:
  std::shared_ptr<Allocator> underlying_allocator_;
};
#endif

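// Builds and owns the per-place allocator maps used by AllocatorFacade,
// according to the allocator strategy selected at runtime.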
class AllocatorFacadePrivate {
 public:
  using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

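  // Builds the allocator map for the strategy selected by
  // FLAGS_allocator_strategy. `allow_free_idle_chunk` is forwarded to the
  // auto_growth allocators; it is disabled for CUDA Graph memory pools.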
  explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
    strategy_ = GetAllocatorStrategy();
    switch (strategy_) {
      case AllocatorStrategy::kNaiveBestFit: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
             ++dev_id) {
          InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
        InitNaiveBestFitNPUPinnedAllocator();
#endif
        break;
      }

      case AllocatorStrategy::kAutoGrowth: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
             ++dev_id) {
          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
                                      allow_free_idle_chunk);
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
        break;
      }

      case AllocatorStrategy::kThreadLocal: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
             ++dev_id) {
          InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
        break;
      }

      default: {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
      }
    }
    InitZeroSizeAllocators();
    InitSystemAllocators();

    if (FLAGS_gpu_allocator_retry_time > 0) {
      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
    }

    CheckAllocThreadSafe();
  }

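  // Returns the allocator map for the current context: while a CUDA Graph is
  // being captured, the map prepared for that capture is used; otherwise the
  // default allocators_ map is returned.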
  inline const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
      auto id = platform::CUDAGraph::CapturingID();
      auto iter = cuda_graph_allocator_map_.find(id);
      PADDLE_ENFORCE_NE(
          iter, cuda_graph_allocator_map_.end(),
          platform::errors::PermissionDenied(
              "No memory pool is prepared for CUDA Graph capturing."));
      return iter->second->allocators_;
    } else {
      return allocators_;
    }
#else
    return allocators_;
#endif
  }

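  // Selects the allocator for `place`: zero-sized requests are served by the
  // zero-size allocators, requests issued with FLAGS_use_system_allocator
  // enabled go to the system allocators, and everything else goes to the map
  // built for the configured strategy.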
  inline const std::shared_ptr<Allocator>& GetAllocator(
      const platform::Place& place, size_t size) {
    VLOG(4) << "GetAllocator"
            << " " << place << " " << size;
    const auto& allocators =
        (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                          : GetAllocatorMap())
                  : zero_size_allocators_);
    auto iter = allocators.find(place);
    PADDLE_ENFORCE_NE(iter, allocators.end(),
                      platform::errors::NotFound(
                          "No allocator found for the place %s", place));
    return iter->second;
  }

 private:
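  // Builds the allocators used when FLAGS_use_system_allocator is enabled
  // (mainly for unittests); they are created once and shared through the
  // static system_allocators_ map.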
  void InitSystemAllocators() {
    if (!system_allocators_.empty()) return;
    system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::XPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    system_allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<CPUPinnedAllocator>();
    int device_count = platform::GetCUDADeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::CUDAPlace p(i);
      system_allocators_[p] = std::make_shared<CUDAAllocator>(p);
    }
#endif
  }

  void InitNaiveBestFitCPUAllocator() {
    allocators_[platform::CPUPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void InitNaiveBestFitCUDAPinnedAllocator() {
    allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
  }

  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
  }

  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                   bool allow_free_idle_chunk) {
    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
    auto alignment = platform::GpuMinChunkSize();
    bool need_addr_align = true;
    // NOTE: Since the CUDA runtime cannot be forked, calling any CUDA API in a
    // forked subprocess may return cuda error(3), i.e.
    // cudaErrorInitializationError, even though the CUDAAllocator is only
    // initialized there and never really used.
    // The try-catch block handles the case where GetDeviceProperties() may
    // fail in multi-process settings (for example, in a dataloader with
    // num_worker > 0).
    try {
      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
      need_addr_align = prop.textureAlignment < alignment;
      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
              << prop.textureAlignment
              << ", set need_addr_align=" << need_addr_align;
    } catch (...) {
      need_addr_align = true;
      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
    }
    // Addresses returned by cudaMalloc are already aligned (to at least 256
    // bytes), ref:
    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
    // so extra alignment is only needed when the device's textureAlignment is
    // smaller than the requested alignment.
    std::shared_ptr<Allocator> underlying_allocator{nullptr};
    if (need_addr_align) {
      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
      underlying_allocator =
          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
    } else {
      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
      underlying_allocator = cuda_allocator;
    }
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        underlying_allocator, alignment, 0, allow_free_idle_chunk);
  }
#endif

#ifdef PADDLE_WITH_XPU
  void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_ASCEND_CL
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitNaiveBestFitNPUPinnedAllocator() {
    allocators_[platform::NPUPinnedPlace()] =
        std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
  }

#endif

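  // Serves zero-sized allocation requests with an Allocation that wraps a
  // nullptr, so no real device memory is touched.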
  class ZeroSizeAllocator : public Allocator {
   public:
    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}

    bool IsAllocThreadSafe() const override { return true; }

   protected:
    Allocation* AllocateImpl(size_t size) override {
      return new Allocation(nullptr, 0, place_);
    }

    void FreeImpl(Allocation* allocation) override { delete allocation; }

   private:
    platform::Place place_;
  };

  void InitZeroSizeAllocators() {
    if (!zero_size_allocators_.empty()) return;
    std::vector<platform::Place> places;
    places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    int device_count = platform::GetCUDADeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::CUDAPlace(dev_id));
    }
    places.emplace_back(platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::XPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
    int device_count = platform::GetNPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::NPUPlace(dev_id));
    }
#endif

    for (auto& p : places) {
      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
    }
  }

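  // Every public allocator must be thread safe, since allocations may be
  // requested from multiple threads concurrently.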
  static void CheckAllocThreadSafe(const AllocatorMap& allocators) {
    for (auto& pair : allocators) {
      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true,
                        platform::errors::InvalidArgument(
                            "Public allocators must be thread safe"));
    }
  }

  void CheckAllocThreadSafe() const {
    CheckAllocThreadSafe(allocators_);
    CheckAllocThreadSafe(zero_size_allocators_);
    CheckAllocThreadSafe(system_allocators_);
  }

  // Wraps every GPU allocator in a RetryAllocator so that a failed allocation
  // is retried for up to `retry_time` milliseconds before giving up.
  void WrapCUDARetryAllocator(size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time, 0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    for (auto& pair : allocators_) {
      if (platform::is_gpu_place(pair.first)) {
        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
      }
    }
  }

#ifdef PADDLE_WITH_CUDA

 public:
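  // Creates a dedicated memory pool for a CUDA Graph capture: a fresh
  // AllocatorFacadePrivate with idle-chunk freeing disabled, whose allocators
  // are all wrapped by CUDAGraphAllocator. Only the auto_growth strategy is
  // supported.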
  void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
    PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
                      platform::errors::InvalidArgument(
                          "CUDA Graph is only supported when the "
                          "FLAGS_allocator_strategy=\"auto_growth\", but got "
                          "FLAGS_allocator_strategy=\"%s\"",
                          FLAGS_allocator_strategy));
    auto& allocator = cuda_graph_allocator_map_[id];
    PADDLE_ENFORCE_EQ(
        allocator.get(), nullptr,
        platform::errors::InvalidArgument(
            "The memory pool of the CUDA Graph with ID %d has been prepared.",
            id));
    allocator.reset(
        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
    for (auto& item : allocator->allocators_) {
      auto& old_allocator = item.second;
      old_allocator = CUDAGraphAllocator::Create(old_allocator);
    }
    VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
  }

  void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
    auto iter = cuda_graph_allocator_map_.find(id);
    PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
                      platform::errors::InvalidArgument(
                          "Cannot find CUDA Graph with ID = %d", id));
    cuda_graph_allocator_map_.erase(iter);
    VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
  }
#endif

 private:
  AllocatorMap allocators_;
#ifdef PADDLE_WITH_CUDA
  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
      cuda_graph_allocator_map_;
#endif
  AllocatorStrategy strategy_;

  static AllocatorMap zero_size_allocators_;
  static AllocatorMap system_allocators_;
};

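// Definitions of the static allocator maps shared by every
// AllocatorFacadePrivate instance (including the per-CUDA-Graph pools).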
AllocatorFacadePrivate::AllocatorMap
    AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;

// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// Deleting m_ here may cause a core dump when the Python destructor conflicts
// with the C++ one, so m_ is intentionally leaked.
AllocatorFacade::~AllocatorFacade() {}

AllocatorFacade& AllocatorFacade::Instance() {
  static AllocatorFacade instance;
  return instance;
}

std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size) {
  return std::shared_ptr<Allocation>(Alloc(place, size));
}

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                     size_t size) {
  return m_->GetAllocator(place, size)->Allocate(size);
}
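
// Illustrative caller-side sketch (hypothetical usage, not part of this file):
// allocate 1 MB on GPU 0 through the facade singleton.
//   auto allocation =
//       paddle::memory::allocation::AllocatorFacade::Instance().Alloc(
//           paddle::platform::CUDAPlace(0), 1 << 20);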

uint64_t AllocatorFacade::Release(const platform::Place& place) {
  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
      ->Release(place);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
  return m_->PrepareMemoryPoolForCUDAGraph(id);
}

void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
  return m_->RemoveMemoryPoolOfCUDAGraph(id);
}
#endif

}  // namespace allocation
}  // namespace memory
}  // namespace paddle