// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator_facade.h"

#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu/xpu_info.h"
#endif
#include "paddle/fluid/platform/npu_info.h"

PADDLE_DEFINE_EXPORTED_int64(
    gpu_allocator_retry_time, 10000,
    "The retry time (milliseconds) when allocator fails "
    "to allocate memory. No retry if this value is not greater than 0");

PADDLE_DEFINE_EXPORTED_bool(
    use_system_allocator, false,
    "Whether to use system allocator to allocate CPU and GPU memory. "
    "Only used for unittests.");

DECLARE_string(allocator_strategy);

namespace paddle {
namespace memory {
namespace allocation {

#ifdef PADDLE_WITH_CUDA
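// Allocator used while a CUDA Graph is being captured: it forwards requests
// to an underlying allocator, and every allocation it hands out holds a
// reference to this wrapper, keeping the wrapper (and the wrapped allocator)
// alive for as long as any of its allocations exists.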
class CUDAGraphAllocator
    : public Allocator,
      public std::enable_shared_from_this<CUDAGraphAllocator> {
 private:
  class PrivateAllocation : public Allocation {
   public:
    PrivateAllocation(CUDAGraphAllocator* allocator,
                      AllocationPtr underlying_allocation)
        : Allocation(underlying_allocation->ptr(),
                     underlying_allocation->size(),
                     underlying_allocation->place()),
          allocator_(allocator->shared_from_this()),
          underlying_allocation_(std::move(underlying_allocation)) {}

   private:
    std::shared_ptr<Allocator> allocator_;
    AllocationPtr underlying_allocation_;
  };

  explicit CUDAGraphAllocator(const std::shared_ptr<Allocator>& allocator)
      : underlying_allocator_(allocator) {}

 public:
  static std::shared_ptr<Allocator> Create(
      const std::shared_ptr<Allocator>& allocator) {
    return std::shared_ptr<Allocator>(new CUDAGraphAllocator(allocator));
  }

 protected:
  Allocation* AllocateImpl(size_t size) {
    VLOG(10) << "Allocate " << size << " for CUDA Graph";
    return new PrivateAllocation(this, underlying_allocator_->Allocate(size));
  }

  void FreeImpl(Allocation* allocation) {
    VLOG(10) << "delete for CUDA Graph";
    delete allocation;
  }

 private:
  std::shared_ptr<Allocator> underlying_allocator_;
};
#endif

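// Builds and owns the place -> allocator maps for the allocator strategy
// selected by FLAGS_allocator_strategy.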
class AllocatorFacadePrivate {
 public:
  using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

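  // Build one allocator per available place for the chosen strategy, then set
  // up the shared zero-size and system allocators, optionally wrap the GPU
  // allocators with retry logic, and verify that everything is thread safe.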
  explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
    strategy_ = GetAllocatorStrategy();
    switch (strategy_) {
      case AllocatorStrategy::kNaiveBestFit: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
             ++dev_id) {
          InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
        InitNaiveBestFitNPUPinnedAllocator();
#endif
        break;
      }

      case AllocatorStrategy::kAutoGrowth: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
             ++dev_id) {
          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
                                      allow_free_idle_chunk);
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
        break;
      }

      case AllocatorStrategy::kThreadLocal: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
             ++dev_id) {
          InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
        break;
      }

      default: {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Unsupported allocator strategy: %d", static_cast<int>(strategy_)));
      }
    }
    InitZeroSizeAllocators();
    InitSystemAllocators();

    if (FLAGS_gpu_allocator_retry_time > 0) {
      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
    }

    CheckAllocThreadSafe();
  }

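  // During CUDA Graph capture, allocations must come from the memory pool
  // prepared for the graph being captured; otherwise the default map is used.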
  inline const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
      auto id = platform::CUDAGraph::CapturingID();
      auto iter = cuda_graph_allocator_map_.find(id);
      PADDLE_ENFORCE_NE(
          iter, cuda_graph_allocator_map_.end(),
          platform::errors::PermissionDenied(
              "No memory pool is prepared for CUDA Graph capturing."));
      return iter->second->allocators_;
    } else {
      return allocators_;
    }
#else
    return allocators_;
#endif
  }

  inline const std::shared_ptr<Allocator>& GetAllocator(
      const platform::Place& place, size_t size) {
    const auto& allocators =
        (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                          : GetAllocatorMap())
                  : zero_size_allocators_);
    auto iter = allocators.find(place);
    PADDLE_ENFORCE_NE(iter, allocators.end(),
                      platform::errors::NotFound(
                          "No allocator found for the place, %s", place));
    return iter->second;
  }

 private:
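  // System allocators bypass the configured strategy; they are only selected
  // when FLAGS_use_system_allocator is set (see GetAllocator above).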
  void InitSystemAllocators() {
    if (!system_allocators_.empty()) return;
    system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::XPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    system_allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<CPUPinnedAllocator>();
    int device_count = platform::GetCUDADeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::CUDAPlace p(i);
      system_allocators_[p] = std::make_shared<CUDAAllocator>(p);
    }
#endif
  }

  void InitNaiveBestFitCPUAllocator() {
    allocators_[platform::CPUPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void InitNaiveBestFitCUDAPinnedAllocator() {
    allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
  }

  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
  }

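  // The auto-growth allocator requests memory from the raw CUDAAllocator on
  // demand instead of reserving a fixed pool up front; allow_free_idle_chunk
  // controls whether idle chunks may be released back to the device.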
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                   bool allow_free_idle_chunk) {
    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
  }
#endif

#ifdef PADDLE_WITH_XPU
  void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_ASCEND_CL
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitNaiveBestFitNPUPinnedAllocator() {
    allocators_[platform::NPUPinnedPlace()] =
        std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
  }

#endif

  class ZeroSizeAllocator : public Allocator {
   public:
    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}

    bool IsAllocThreadSafe() const override { return true; }

   protected:
    Allocation* AllocateImpl(size_t size) override {
      return new Allocation(nullptr, 0, place_);
    }

    void FreeImpl(Allocation* allocation) override { delete allocation; }

   private:
    platform::Place place_;
  };

  void InitZeroSizeAllocators() {
    if (!zero_size_allocators_.empty()) return;
    std::vector<platform::Place> places;
    places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    int device_count = platform::GetCUDADeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::CUDAPlace(dev_id));
    }
    places.emplace_back(platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
    int device_count = platform::GetXPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::XPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
    int device_count = platform::GetNPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::NPUPlace(dev_id));
    }
#endif

    for (auto& p : places) {
      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
    }
  }

  static void CheckAllocThreadSafe(const AllocatorMap& allocators) {
    for (auto& pair : allocators) {
      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true,
                        platform::errors::InvalidArgument(
                            "Public allocators must be thread safe"));
    }
  }

  void CheckAllocThreadSafe() const {
    CheckAllocThreadSafe(allocators_);
    CheckAllocThreadSafe(zero_size_allocators_);
    CheckAllocThreadSafe(system_allocators_);
  }

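  // Wrap every GPU allocator with a RetryAllocator so that a failed
  // allocation is retried for up to retry_time milliseconds before giving up.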
  void WrapCUDARetryAllocator(size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time, 0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    for (auto& pair : allocators_) {
      if (platform::is_gpu_place(pair.first)) {
        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
      }
    }
  }

#ifdef PADDLE_WITH_CUDA

 public:
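  // Each CUDA Graph gets its own memory pool: a separate AllocatorFacadePrivate
  // (built with allow_free_idle_chunk disabled) whose allocators are wrapped
  // with CUDAGraphAllocator. The pool stays alive until
  // RemoveMemoryPoolOfCUDAGraph() is called with the same id.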
  void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
    PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
                      platform::errors::InvalidArgument(
                          "CUDA Graph is only supported when the "
                          "FLAGS_allocator_strategy=\"auto_growth\", but got "
                          "FLAGS_allocator_strategy=\"%s\"",
                          FLAGS_allocator_strategy));
    auto& allocator = cuda_graph_allocator_map_[id];
    PADDLE_ENFORCE_EQ(
        allocator.get(), nullptr,
        platform::errors::InvalidArgument(
            "The memory pool of the CUDA Graph with ID %d have been prepared.",
            id));
    allocator.reset(
        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
    for (auto& item : allocator->allocators_) {
      auto& old_allocator = item.second;
      old_allocator = CUDAGraphAllocator::Create(old_allocator);
    }
    VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
  }

  void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
    auto iter = cuda_graph_allocator_map_.find(id);
    PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
                      platform::errors::InvalidArgument(
                          "Cannot find CUDA Graph with ID = %d", id));
    cuda_graph_allocator_map_.erase(iter);
    VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
  }
#endif

 private:
  AllocatorMap allocators_;
#ifdef PADDLE_WITH_CUDA
  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
      cuda_graph_allocator_map_;
#endif
  AllocatorStrategy strategy_;

  static AllocatorMap zero_size_allocators_;
  static AllocatorMap system_allocators_;
};

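// The zero-size and system allocators are shared by every
// AllocatorFacadePrivate instance (including the per-CUDA-Graph pools), hence
// the static definitions and the empty() guards in their Init functions.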
AllocatorFacadePrivate::AllocatorMap
    AllocatorFacadePrivate::zero_size_allocators_;
AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_;

// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// Deleting m_ may cause a core dump when the destructor of Python conflicts
// with that of C++, so the destructor intentionally leaves m_ untouched.
AllocatorFacade::~AllocatorFacade() {}

AllocatorFacade& AllocatorFacade::Instance() {
  static AllocatorFacade instance;
  return instance;
}

std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size) {
  return std::shared_ptr<Allocation>(Alloc(place, size));
}

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                     size_t size) {
  return m_->GetAllocator(place, size)->Allocate(size);
}

uint64_t AllocatorFacade::Release(const platform::Place& place) {
  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
      ->Release(place);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return m_->GetAllocator(place, /* a non-zero size to choose from allocators_ */ 1);
}

#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
  return m_->PrepareMemoryPoolForCUDAGraph(id);
}

void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
  return m_->RemoveMemoryPoolOfCUDAGraph(id);
}
#endif

}  // namespace allocation
}  // namespace memory
}  // namespace paddle