// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/allocator_facade.h"

#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu/xpu_info.h"
#endif
#include "paddle/fluid/platform/npu_info.h"

PADDLE_DEFINE_EXPORTED_int64(
    gpu_allocator_retry_time, 10000,
    "The retry time (milliseconds) when allocator fails "
    "to allocate memory. No retry if this value is not greater than 0");

PADDLE_DEFINE_EXPORTED_bool(
    use_system_allocator, false,
    "Whether to use system allocator to allocate CPU and GPU memory. "
    "Only used for unittests.");

namespace paddle {
namespace memory {
namespace allocation {

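// Pimpl body of AllocatorFacade: builds one Allocator per platform::Place
// according to the active AllocatorStrategy and owns the resulting
// Place -> Allocator maps.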
class AllocatorFacadePrivate {
 public:
  using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;

  AllocatorFacadePrivate() {
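    // Build one allocator per place according to the strategy chosen by
    // GetAllocatorStrategy(): kNaiveBestFit, kAutoGrowth, or kThreadLocal.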
    auto strategy = GetAllocatorStrategy();
    switch (strategy) {
      case AllocatorStrategy::kNaiveBestFit: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
             ++dev_id) {
          InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
        InitNaiveBestFitNPUPinnedAllocator();
#endif
        break;
      }

      case AllocatorStrategy::kAutoGrowth: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
             ++dev_id) {
          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
        break;
      }

      case AllocatorStrategy::kThreadLocal: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
             ++dev_id) {
          InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
        }
        InitNaiveBestFitCUDAPinnedAllocator();
#endif
        break;
      }

      default: {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Unsupported allocator strategy: %d", static_cast<int>(strategy)));
      }
    }
    InitZeroSizeAllocators();
    InitSystemAllocators();

    if (FLAGS_gpu_allocator_retry_time > 0) {
      WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
    }

    CheckAllocThreadSafe();
  }

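  // Routes a request to the proper map: zero-size requests use the dummy
  // zero-size allocators, FLAGS_use_system_allocator redirects to the raw
  // system allocators (unit tests only), and everything else uses the
  // strategy-specific allocators_.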
  inline const std::shared_ptr<Allocator>& GetAllocator(
      const platform::Place& place, size_t size) {
    const auto& allocators =
        (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                          : allocators_)
                  : zero_size_allocators_);
    auto iter = allocators.find(place);
    PADDLE_ENFORCE_NE(iter, allocators.end(),
                      platform::errors::NotFound(
                          "No allocator found for the place, %s", place));
    return iter->second;
  }

 private:
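  // System allocators bypass the configured strategy and allocate directly
  // from the device/OS; they back FLAGS_use_system_allocator.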
  void InitSystemAllocators() {
    system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
    int xpu_device_count = platform::GetXPUDeviceCount();
    for (int i = 0; i < xpu_device_count; ++i) {
      platform::XPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    system_allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<CPUPinnedAllocator>();
    int cuda_device_count = platform::GetCUDADeviceCount();
    for (int i = 0; i < cuda_device_count; ++i) {
      platform::CUDAPlace p(i);
      system_allocators_[p] = std::make_shared<CUDAAllocator>(p);
    }
#endif
  }

  void InitNaiveBestFitCPUAllocator() {
    allocators_[platform::CPUPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void InitNaiveBestFitCUDAPinnedAllocator() {
    allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
  }

  void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) {
    allocators_[p] = std::make_shared<ThreadLocalCUDAAllocator>(p);
  }

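  // Chains a raw CUDAAllocator behind an AutoGrowthBestFitAllocator, which
  // grabs chunks from the device on demand and serves sub-allocations from
  // a best-fit free list.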
  void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) {
    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
        cuda_allocator, platform::GpuMinChunkSize());
  }
#endif

#ifdef PADDLE_WITH_XPU
  void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_ASCEND_CL
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitNaiveBestFitNPUPinnedAllocator() {
    allocators_[platform::NPUPinnedPlace()] =
        std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
  }

#endif

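  // Allocator for zero-size requests: returns an Allocation that holds a
  // nullptr, so callers never have to special-case size == 0.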
  class ZeroSizeAllocator : public Allocator {
   public:
    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}

    bool IsAllocThreadSafe() const override { return true; }

   protected:
    Allocation* AllocateImpl(size_t size) override {
      return new Allocation(nullptr, 0, place_);
    }

    void FreeImpl(Allocation* allocation) override { delete allocation; }

   private:
    platform::Place place_;
  };

  void InitZeroSizeAllocators() {
    std::vector<platform::Place> places;
    places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    int cuda_device_count = platform::GetCUDADeviceCount();
    for (int dev_id = 0; dev_id < cuda_device_count; ++dev_id) {
      places.emplace_back(platform::CUDAPlace(dev_id));
    }
    places.emplace_back(platform::CUDAPinnedPlace());
#endif
#ifdef PADDLE_WITH_XPU
    int xpu_device_count = platform::GetXPUDeviceCount();
    for (int dev_id = 0; dev_id < xpu_device_count; ++dev_id) {
      places.emplace_back(platform::XPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
    int npu_device_count = platform::GetNPUDeviceCount();
    for (int dev_id = 0; dev_id < npu_device_count; ++dev_id) {
      places.emplace_back(platform::NPUPlace(dev_id));
    }
#endif

    for (auto& p : places) {
      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
    }
  }

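  // All allocators exposed through the facade may be called from multiple
  // threads, so thread safety is verified up front.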
  static void CheckAllocThreadSafe(const AllocatorMap& allocators) {
    for (auto& pair : allocators) {
      PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true,
                        platform::errors::InvalidArgument(
                            "Public allocators must be thread safe"));
    }
  }

  void CheckAllocThreadSafe() const {
    CheckAllocThreadSafe(allocators_);
    CheckAllocThreadSafe(zero_size_allocators_);
    CheckAllocThreadSafe(system_allocators_);
  }

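  // Wraps every GPU allocator in a RetryAllocator so that a failed
  // allocation is retried for up to retry_time milliseconds before the
  // failure propagates.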
  void WrapCUDARetryAllocator(size_t retry_time) {
    PADDLE_ENFORCE_GT(
        retry_time, 0,
        platform::errors::InvalidArgument(
            "Retry time should be larger than 0, but got %d", retry_time));
    for (auto& pair : allocators_) {
      if (platform::is_gpu_place(pair.first)) {
        pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
      }
    }
  }

 private:
  AllocatorMap allocators_;
  AllocatorMap zero_size_allocators_;
  AllocatorMap system_allocators_;
};

// Pimpl idiom: keeps the public interface clean and the implementation hidden.
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
// Deleting m_ may cause a core dump when the Python destructor conflicts
// with the C++ teardown order, so m_ is intentionally leaked here.
AllocatorFacade::~AllocatorFacade() {}

AllocatorFacade& AllocatorFacade::Instance() {
  static AllocatorFacade instance;
  return instance;
}

std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
    const platform::Place& place, size_t size) {
  return std::shared_ptr<Allocation>(Alloc(place, size));
}

AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                     size_t size) {
  return m_->GetAllocator(place, size)->Allocate(size);
}

uint64_t AllocatorFacade::Release(const platform::Place& place) {
  return m_->GetAllocator(place, /* a non-zero size to select allocators_ */ 1)
      ->Release(place);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return m_->GetAllocator(place, /* a non-zero size to select allocators_ */ 1);
}

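// A minimal usage sketch (illustrative only; CPUPlace works in every build):
//
//   auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
//   AllocationPtr buf = facade.Alloc(platform::CPUPlace(), 1024);
//   std::shared_ptr<Allocation> shared =
//       facade.AllocShared(platform::CPUPlace(), 1024);
//
// Both results release their memory automatically when destroyed.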
}  // namespace allocation
}  // namespace memory
}  // namespace paddle