// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator.h"
#include <gflags/gflags.h>
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/conditional_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif

DEFINE_int64(
    gpu_allocator_retry_time, 0,
    "The retry time (milliseconds) when allocator fails "
    "to allocate memory. No retry if this value is not greater than 0");

DEFINE_bool(enable_buffered_allocator, false, "Enable buffered_allocator");

namespace paddle {
namespace memory {
namespace allocation {

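// Optionally wraps an allocator with RetryAllocator (when retry_time > 0) and
// then with MultiBinBufferedAllocator (when enable_buffered is set).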
static inline std::shared_ptr<Allocator> WrapRetryAndBufferedAllocator(
    std::shared_ptr<Allocator> allocator, int64_t retry_time,
    bool enable_buffered) {
  if (retry_time > 0) {
    auto* retry_allocator =
        new RetryAllocator(std::move(allocator), retry_time);
    allocator.reset(retry_allocator);
  }

  if (enable_buffered) {
    allocator.reset(new MultiBinBufferedAllocator(allocator));
  }
  return allocator;
}

// TODO(yy): Dirty code here. This class should be configurable at runtime.
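// Thread-safe wrapper that forwards every request to an underlying
// CPUAllocator.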
class CPUManagedAllocator : public Allocator {
 public:
  CPUManagedAllocator() : normal_allocator_(new CPUAllocator()) {}

  bool IsAllocThreadSafe() const override { return true; }

 protected:
  Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
    return normal_allocator_->Allocate(size, attr).release();
  }

 private:
  std::shared_ptr<Allocator> normal_allocator_;
};

// TODO(yy): Dirty code here. This class should be configurable at runtime.
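// Carves fixed-size chunks out of the system allocator and serves requests
// smaller than max_chunk_size_ from best-fit allocators built over those
// chunks; larger requests fall through to the raw system allocator.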
class ChunkedAllocator : public Allocator {
 public:
  explicit ChunkedAllocator(std::unique_ptr<Allocator> system_allocator,
                            size_t max_chunk_size, size_t capacity = 1,
                            int64_t retry_time = -1)
      : max_chunk_size_(max_chunk_size), retry_time_(retry_time) {
    raw_allocator_ = std::move(system_allocator);

    if (max_chunk_size_ == 0) {
      default_allocator_ = raw_allocator_;
    } else {
      if (capacity == 1) {
        VLOG(1) << "Create BestFitAllocator with chunk_size "
                << max_chunk_size_;
        default_allocator_ = CreateAllocatorWithChunk();
      } else {
        VLOG(1) << "Create AutoIncrementAllocator with chunk_size "
                << max_chunk_size_ << " and capacity " << capacity;
        default_allocator_ = std::make_shared<AutoIncrementAllocator>(
            [this] { return CreateAllocatorWithChunk(); }, capacity);
      }
    }

    auto* cond_allocator = new ConditionalAllocator();
    cond_allocator
        ->AddAllocator(
            [this](size_t size, Attr attr) { return size < max_chunk_size_; },
            default_allocator_)
        .AddAllocator(
            [](size_t size, Attr attr) {
              return true;  // default case
            },
            raw_allocator_);
    default_allocator_.reset(cond_allocator);
  }

  ~ChunkedAllocator() override {
    // Specify destruct order.
    default_allocator_.reset();
    chunks_.clear();
    raw_allocator_.reset();
  }

  std::shared_ptr<Allocator> CreateAllocatorWithChunk() {
    chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
    auto* allocation = chunks_.back().get();
    std::shared_ptr<Allocator> allocator(new LockedAllocator(
        std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));

    allocator = WrapRetryAndBufferedAllocator(allocator, retry_time_,
                                              FLAGS_enable_buffered_allocator);

    return std::make_shared<AlignedAllocator<4096>>(std::move(allocator));
  }

  bool IsAllocThreadSafe() const override { return true; }

 protected:
  Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
    return default_allocator_->Allocate(size, attr).release();
  }

 protected:
  size_t max_chunk_size_;
  int64_t retry_time_;
  std::vector<AllocationPtr> chunks_;
  std::shared_ptr<Allocator> raw_allocator_;
  std::shared_ptr<Allocator> default_allocator_;
};

#ifdef PADDLE_WITH_CUDA

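// Chunked allocation for a single CUDA device: the chunk size comes from
// platform::GpuMaxChunkSize(), the capacity from the device's currently
// available memory, and the retry behavior from FLAGS_gpu_allocator_retry_time.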
class CUDAChunkedAllocator : public ChunkedAllocator {
 public:
  explicit CUDAChunkedAllocator(int dev_id)
      : ChunkedAllocator(std::unique_ptr<Allocator>(
                             new CUDAAllocator(platform::CUDAPlace(dev_id))),
                         GetMaxChunkSize(dev_id), GetCapacity(dev_id),
                         GetRetryTime()) {}

 private:
  static size_t GetMaxChunkSize(int dev_id) {
    platform::CUDADeviceGuard guard(dev_id);
    return platform::GpuMaxChunkSize();
  }

  static size_t GetCapacity(int dev_id) {
    platform::CUDADeviceGuard guard(dev_id);
    size_t available, total;
    platform::GpuMemoryUsage(&available, &total);
    size_t max_chunk_size = platform::GpuMaxChunkSize();
    return max_chunk_size == 0 ? 0 : available / max_chunk_size;
  }

  static int64_t GetRetryTime() { return FLAGS_gpu_allocator_retry_time; }
};

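// Chunked allocation for page-locked (pinned) host memory: the capacity is
// derived from total physical host memory and retries are disabled.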
class CUDAPinnedChunkedAllocator : public ChunkedAllocator {
 public:
  CUDAPinnedChunkedAllocator()
      : ChunkedAllocator(std::unique_ptr<Allocator>(new CPUPinnedAllocator()),
                         platform::CUDAPinnedMaxChunkSize(), GetCapacity(),
                         -1) {}  // never retry

 private:
  static size_t GetCapacity() {
    size_t total = platform::CpuTotalPhysicalMemory();
    size_t max_chunk_size = platform::CUDAPinnedMaxChunkSize();
    return max_chunk_size == 0 ? 0 : total / max_chunk_size;
  }
};

#endif

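// Holds the per-place allocator table; the constructor populates it according
// to the strategy returned by GetAllocatorStrategy().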
class AllocatorFacadePrivate {
 public:
  std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;

  ~AllocatorFacadePrivate() = default;

  AllocatorFacadePrivate() {
    auto strategy = GetAllocatorStrategy();
    switch (strategy) {
      case AllocatorStrategy::kLegacy: {
        InitLegacyAllocator();
        break;
      }
      case AllocatorStrategy::kNaiveBestFit: {
        InitCPUAllocator();
        InitCUDAAllocator();
        InitCUDAPinnedAllocator();
        WrapZeroSizeAllocator();
        break;
      }
      case AllocatorStrategy::kAutoGrowthBestFit: {
        InitAutoGrowthCPUAllocator();
        InitAutoGrowthCUDAAllocator();
        InitAutoGrowthCUDAPinnedAllocator();
        WrapZeroSizeAllocator();
        break;
      }
      default: {
        PADDLE_THROW("Unsupported allocator strategy: %d",
                     static_cast<int>(strategy));
      }
    }
  }

 private:
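  // kAutoGrowthBestFit: each place gets an AutoGrowthBestFitAllocator layered
  // on a 4096-byte-aligned system allocator; the CUDA path additionally honors
  // FLAGS_gpu_allocator_retry_time.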
  void InitAutoGrowthCPUAllocator() {
    auto cpu_allocator = std::make_shared<AlignedAllocator<4096>>(
        std::make_shared<CPUAllocator>());
    allocators_[platform::CPUPlace()] =
        std::make_shared<AutoGrowthBestFitAllocator>(
            cpu_allocator, platform::CpuMaxChunkSize(), 4096);
  }

  void InitAutoGrowthCUDAAllocator() {
#ifdef PADDLE_WITH_CUDA
    int dev_cnt = platform::GetCUDADeviceCount();
    for (int dev_id = 0; dev_id < dev_cnt; ++dev_id) {
      auto cuda_allocator = std::make_shared<AlignedAllocator<4096>>(
          std::make_shared<CUDAAllocator>(platform::CUDAPlace(dev_id)));
      auto allocator = std::make_shared<AutoGrowthBestFitAllocator>(
          cuda_allocator, platform::GpuMaxChunkSize(), 4096);

      allocators_[platform::CUDAPlace(dev_id)] = WrapRetryAndBufferedAllocator(
          allocator, FLAGS_gpu_allocator_retry_time, false);
    }
#endif
  }

  void InitAutoGrowthCUDAPinnedAllocator() {
#ifdef PADDLE_WITH_CUDA
    auto cuda_pinned_allocator = std::make_shared<AlignedAllocator<4096>>(
        std::make_shared<CPUPinnedAllocator>());
    allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<AutoGrowthBestFitAllocator>(
            cuda_pinned_allocator, platform::CUDAPinnedMaxChunkSize(), 4096);
#endif
  }

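  // kLegacy: every place is served by LegacyAllocator, which forwards to the
  // pre-existing legacy allocation path.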
  void InitLegacyAllocator() {
    std::vector<platform::Place> places{platform::CPUPlace()};
#ifdef PADDLE_WITH_CUDA
    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
      places.emplace_back(platform::CUDAPlace(dev_id));
    }
    places.emplace_back(platform::CUDAPinnedPlace());
#endif
    for (auto& p : places) {
      allocators_[p] = std::make_shared<LegacyAllocator>(p);
    }
  }

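  // kNaiveBestFit: the CPU place uses CPUManagedAllocator; each CUDA device
  // and the CUDA pinned place use chunked best-fit allocators.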
  void InitCPUAllocator() {
    allocators_[platform::CPUPlace()] = std::make_shared<CPUManagedAllocator>();
  }

  void InitCUDAAllocator() {
#ifdef PADDLE_WITH_CUDA
    int device_count = platform::GetCUDADeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      allocators_[platform::CUDAPlace(dev_id)] =
          std::make_shared<CUDAChunkedAllocator>(dev_id);
    }
#endif
  }

  void InitCUDAPinnedAllocator() {
#ifdef PADDLE_WITH_CUDA
    allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<CUDAPinnedChunkedAllocator>();
#endif
  }

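  // Wraps each registered allocator with ZeroSizeAllocator so that zero-byte
  // requests are handled without hitting the underlying allocator.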
  void WrapZeroSizeAllocator() {
    for (auto& pair : allocators_) {
      pair.second =
          std::make_shared<ZeroSizeAllocator>(pair.second, pair.first);
    }
  }
};

// Pimpl. Make interface clean.
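//
// A minimal usage sketch (omitting the Allocator::Attr argument, assuming the
// default declared in allocator_facade.h):
//
//   auto allocation =
//       AllocatorFacade::Instance().Alloc(platform::CPUPlace(), 1024);
//   auto shared = AllocatorFacade::Instance().AllocShared(place, size);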
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
AllocatorFacade::~AllocatorFacade() { delete m_; }

AllocatorFacade& AllocatorFacade::Instance() {
  static AllocatorFacade instance;
  return instance;
}

std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size, Allocator::Attr attr) {
  return std::shared_ptr<Allocation>(Alloc(place, size, attr));
}

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
                                     Allocator::Attr attr) {
  auto it = m_->allocators_.find(place);
  if (it == m_->allocators_.end()) {
    throw BadAlloc(
        string::Sprintf("No such allocator for the place, %s", place));
  }
  return m_->allocators_.at(place)->Allocate(size, attr);
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle