// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator.h"

#include <gflags/gflags.h>
#include <map>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/conditional_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#endif

// How long (in milliseconds) a GPU allocator waits before retrying a failed
// allocation. Read by CUDAManagedAllocator::BestFitAllocatorCreator below;
// values <= 0 disable retrying entirely.
DEFINE_int32(
    gpu_allocator_retry_time, 0,
    "The retry time (milliseconds) when allocator fails "
    "to allocate memory. No retry if this value is not greater than 0");

namespace paddle {
namespace memory {
namespace allocation {

// TODO(yy): Dirty code here. This class should be configurable in runtime.
Y
Yu Yang 已提交
48 49 50 51 52 53 54 55 56
class CPUManagedAllocator : public ManagedAllocator {
 public:
  CPUManagedAllocator()
      : normal_allocator_(NaiveManagedAllocator::Create(
            std::unique_ptr<Allocator>(new CPUAllocator()))),
        communication_allocator_(NaiveManagedAllocator::Create(
            std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}

  std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
Y
Yu Yang 已提交
57
    if (attr == kCrossDevice) {
Y
Yu Yang 已提交
58 59 60 61 62 63 64
      return communication_allocator_->Allocate(size, attr);
    } else {
      return normal_allocator_->Allocate(size, attr);
    }
  }

  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
Y
Yu Yang 已提交
65
    if (attr == kCrossDevice) {
Y
Yu Yang 已提交
66 67 68 69 70
      return communication_allocator_->AllocateShared(size, attr);
    } else {
      return normal_allocator_->AllocateShared(size, attr);
    }
  }
S
sneaxiy 已提交
71

Y
Yu Yang 已提交
72
  bool IsAllocThreadSafe() const override { return true; }
Y
Yu Yang 已提交
73 74 75 76 77 78

 private:
  std::shared_ptr<ManagedAllocator> normal_allocator_;
  std::shared_ptr<ManagedAllocator> communication_allocator_;
};

#ifdef PADDLE_WITH_CUDA
// TODO(yy): Dirty code here. This class should be configurable in runtime.
//
// Managed allocator for a single CUDA device. Large chunks of
// `max_chunk_size_` bytes are obtained from the raw CUDA allocator and
// carved up by best-fit sub-allocators; requests that cannot fit inside one
// chunk fall through to the raw allocator directly.
class CUDAManagedAllocator : public ManagedAllocator {
 public:
  explicit CUDAManagedAllocator(int dev_id) {
    platform::CUDADeviceGuard guard(dev_id);
    max_chunk_size_ = platform::GpuMaxChunkSize();

    raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
        new CUDAAllocator(platform::CUDAPlace(dev_id))));

    if (max_chunk_size_ == 0) {
      // Chunking disabled: serve every request straight from the device.
      default_allocator_ = raw_allocator_;
    } else {
      size_t available, total;
      platform::GpuMemoryUsage(&available, &total);
      size_t capacity = available / max_chunk_size_;

      if (capacity == 1) {
        VLOG(10) << "Create BestFitAllocator with chunk_size "
                 << max_chunk_size_;
        default_allocator_ = BestFitAllocatorCreator();
      } else {
        VLOG(10) << "Create AutoIncrementAllocator with chunk_size "
                 << max_chunk_size_ << " and capacity " << capacity;
        // No std::move around the call: the creator already returns an
        // rvalue, and a redundant move would only inhibit copy elision.
        default_allocator_ = std::make_shared<AutoIncrementAllocator>(
            [this] { return BestFitAllocatorCreator(); }, capacity);
      }
    }

    // Requests smaller than one chunk go through the chunked default
    // allocator; anything at least one chunk large bypasses it and is
    // served directly by the raw allocator.
    auto* cond_allocator = new ConditionalAllocator();
    cond_allocator
        ->AddAllocator(
            [this](size_t size, Attr attr) { return size < max_chunk_size_; },
            default_allocator_)
        .AddAllocator(
            [](size_t size, Attr attr) {
              return true;  // default case
            },
            raw_allocator_);
    default_allocator_.reset(cond_allocator);
  }

  ~CUDAManagedAllocator() {
    // Specify destruct order: release the users of the chunks first, then
    // the chunks themselves, and finally the allocator owning their memory.
    default_allocator_.reset();
    chunks_.clear();
    raw_allocator_.reset();
  }

  std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
    return default_allocator_->Allocate(size, attr);
  }

  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
    return default_allocator_->AllocateShared(size, attr);
  }

  // Carves a fresh chunk out of the raw allocator and wraps it in a locked
  // (thread-safe), 64-byte-aligned best-fit allocator. Retries on failure
  // when FLAGS_gpu_allocator_retry_time > 0.
  std::shared_ptr<ManagedAllocator> BestFitAllocatorCreator() {
    chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
    auto* allocation = chunks_.back().get();
    std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
        std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));

    if (FLAGS_gpu_allocator_retry_time <= 0) {
      VLOG(10) << "Create NaiveManagedAllocator without retry";
      return std::make_shared<AlignedAllocator<64u>>(
          NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
    } else {
      VLOG(10) << "Create RetryAllocator with retry_time "
               << FLAGS_gpu_allocator_retry_time << "ms";
      return std::make_shared<AlignedAllocator<64u>>(RetryAllocator::Create(
          std::move(unmanaged_allocator),
          static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
    }
  }

  bool IsAllocThreadSafe() const override { return true; }

 private:
  size_t max_chunk_size_;
  // Owned chunks; cleared before raw_allocator_ in the destructor.
  std::vector<std::unique_ptr<Allocation>> chunks_;
  std::shared_ptr<ManagedAllocator> raw_allocator_;
  std::shared_ptr<ManagedAllocator> default_allocator_;
};
#endif

class AllocatorFacadePrivate {
 public:
S
sneaxiy 已提交
168 169
  std::unordered_map<platform::Place, std::shared_ptr<ManagedAllocator>>
      allocators_;
Y
Yu Yang 已提交
170

Y
Refine  
Yu Yang 已提交
171
  ~AllocatorFacadePrivate() = default;
172 173 174 175

  AllocatorFacadePrivate() {
    InitCPUAllocator();
    InitCUDAAllocator();
Y
Yu Yang 已提交
176
    WrapZeroSizeAllocator();
177 178 179 180
  }

 private:
  void InitCPUAllocator() {
Y
Yu Yang 已提交
181
    allocators_[platform::CPUPlace()] = std::make_shared<CPUManagedAllocator>();
182 183 184 185 186 187
  }

  void InitCUDAAllocator() {
#ifdef PADDLE_WITH_CUDA
    for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
      allocators_[platform::CUDAPlace(dev_id)] =
Y
Yu Yang 已提交
188
          std::make_shared<CUDAManagedAllocator>(dev_id);
189 190 191
    }
#endif
  }
Y
Yu Yang 已提交
192 193 194 195 196 197 198

  void WrapZeroSizeAllocator() {
    for (auto& pair : allocators_) {
      pair.second =
          std::make_shared<ZeroSizeAllocator>(pair.second, pair.first);
    }
  }
199 200
};

// Pimpl. Make interface clean.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
AllocatorFacade::~AllocatorFacade() { delete m_; }

// Meyers singleton: constructed on first use, destroyed at program exit.
AllocatorFacade& AllocatorFacade::Instance() {
  static AllocatorFacade instance;
  return instance;
}

std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size, Allocator::Attr attr) {
  // at() throws std::out_of_range if no allocator was registered for place.
  auto& allocator = m_->allocators_.at(place);
  return allocator->AllocateShared(size, attr);
}

std::unique_ptr<Allocation> AllocatorFacade::Alloc(const platform::Place& place,
                                                   size_t size,
                                                   Allocator::Attr attr) {
  auto& allocator = m_->allocators_.at(place);
  return allocator->Allocate(size, attr);
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle