Commit 58ed412f authored by Yu Yang

refactor(memory): rewrite memory allocation and make it extensible

Use OO style to rewrite memory allocation.
Parent 643b6faa
......@@ -30,6 +30,8 @@ class ExceptionHolder {
Catch(exp);
} catch (platform::EnforceNotMet exp) {
Catch(exp);
} catch (std::exception& ex) {
LOG(FATAL) << "std::exception caught, " << ex.what();
} catch (...) {
LOG(FATAL) << "Unknown exception caught";
}
......
......@@ -395,11 +395,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
if (!erase_tensors.empty()) gc->Add(erase_tensors);
}
}
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
}
if (gc != nullptr) {
......@@ -421,13 +416,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
scope->DropKids();
}
}
if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
VLOG(2) << "-------------------------------------------------------";
}
}
void Executor::RunPreparedContext(
......
......@@ -111,9 +111,6 @@ class LoDTensor : public Tensor {
public:
LoDTensor() : Tensor() {}
/* Constructor with place should only be used in pybind */
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {}
void set_lod(const LoD& lod) { lod_ = lod; }
......
......@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h"
......@@ -31,46 +32,6 @@ namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
private:
void ClearMemory() const {
if (data_ != nullptr) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
......@@ -103,8 +64,6 @@ class Vector {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
details::CUDABuffer null;
gpu_.Swap(null);
return *this;
}
......@@ -199,7 +158,7 @@ class Vector {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return reinterpret_cast<T *>(gpu_.data_);
return reinterpret_cast<T *>(gpu_->ptr());
}
// get cuda ptr. mutable
......@@ -234,13 +193,11 @@ class Vector {
std::mutex &Mutex() const { return mtx_; }
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
if (gpu_.data_ == nullptr) {
return nullptr;
} else {
return std::unique_ptr<platform::CUDAPlace>(
new platform::CUDAPlace(gpu_.place_));
}
boost::optional<platform::CUDAPlace> CUDAPlace() const {
return gpu_ == nullptr
? boost::none
: boost::optional<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(gpu_->place()));
}
private:
......@@ -254,13 +211,12 @@ class Vector {
void CopyToCPU() const {
// COPY GPU Data To CPU
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::Place(gpu_.place_)));
platform::DeviceContextPool::Instance().Get(gpu_->place()));
auto stream = dev_ctx->stream();
void *src = gpu_.data_;
void *src = gpu_->ptr();
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
stream);
memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
gpu_->size(), stream);
dev_ctx->Wait();
}
......@@ -277,8 +233,7 @@ class Vector {
CopyCPUDataToCUDA(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() &&
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
} else if (IsInCUDA() && !(place == gpu_->place())) {
PADDLE_THROW("This situation should not happen");
// Still dirty
} else {
......@@ -290,7 +245,7 @@ class Vector {
// Even data is not dirty. However, data is not in CUDA. Copy data.
CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
} else if (!(place == gpu_->place())) {
PADDLE_THROW("This situation should not happen.");
} else {
// Not Dirty && DataInCUDA && Device is same
......@@ -301,13 +256,13 @@ class Vector {
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T));
void *dst = gpu_->ptr();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
gpu_->size(), stream);
}
void ImmutableCPU() const {
......@@ -329,7 +284,7 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable std::unique_ptr<memory::Allocation> gpu_;
mutable int flag_;
mutable std::mutex mtx_;
......@@ -428,8 +383,8 @@ class Vector {
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
if (cuda_place == boost::none ||
cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place);
}
}
......@@ -444,8 +399,8 @@ class Vector {
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
if (cuda_place == boost::none ||
cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place);
}
}
......
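A hedged usage sketch of framework::Vector<T> after this change: the device copy now lives in a memory::Allocation (gpu_) instead of the removed details::CUDABuffer, and CUDAData() still performs the implicit CPU-to-GPU sync. The header path, the std::vector-style push_back, and the CUDAPlace(0) device are assumptions for illustration; this only builds with CUDA enabled.

#include "paddle/fluid/framework/mixed_vector.h"  // assumed header path

void VectorSketch() {
  paddle::framework::Vector<int64_t> rows;
  rows.push_back(1);  // mutates the CPU copy and marks it dirty
  rows.push_back(2);
  paddle::platform::CUDAPlace gpu(0);
  // Lazily allocates gpu_ via memory::Alloc and copies the CPU data over.
  const int64_t* d_rows = rows.CUDAData(gpu);
  (void)d_rows;
}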
......@@ -33,9 +33,7 @@ size_t Tensor::memory_size() const {
void* Tensor::mutable_data(platform::Place place, std::type_index type,
size_t requested_size) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
type_ = type;
PADDLE_ENFORCE_GE(numel(), 0,
"When calling this method, the Tensor's numel must be "
"equal or larger than zero. "
......@@ -48,25 +46,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
/* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW(
"CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
}
#else
if (platform::is_gpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type));
} else if (platform::is_cuda_pinned_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
boost::get<platform::CUDAPinnedPlace>(place), size, type));
}
}
#endif
holder_ = memory::AllocShared(place, size);
offset_ = 0;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
......@@ -76,7 +56,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing.");
return mutable_data(place, holder_->type(), requested_size);
return mutable_data(place, type_, requested_size);
}
Tensor& Tensor::ShareDataWith(const Tensor& src) {
......@@ -101,6 +81,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const {
Tensor dst;
dst.holder_ = holder_;
dst.set_layout(layout_);
dst.type_ = type_;
DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx;
dst.Resize(dst_dims);
......
......@@ -67,12 +67,7 @@ class Tensor {
friend struct EigenVector;
public:
Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
Tensor() : type_(typeid(float)), offset_(0) {}
/*! Return a pointer to mutable memory block. */
template <typename T>
......@@ -139,7 +134,7 @@ class Tensor {
std::type_index type() const {
PADDLE_ENFORCE_NOT_NULL(
holder_, "Tensor not initialized yet when Tensor::type() is called.");
return holder_->type();
return type_;
}
// memory size returns the holding memory size in byte.
......@@ -154,55 +149,9 @@ class Tensor {
void clear() { holder_ = nullptr; }
private:
/**
* @note Placeholder hides type T, so it doesn't appear as a template
* parameter of Variable.
*/
struct Placeholder {
virtual ~Placeholder() = default;
virtual void* ptr() const = 0;
virtual size_t size() const = 0;
virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0;
virtual void set_place(platform::Place place) = 0;
};
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size),
type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
virtual size_t size() const { return size_; }
virtual platform::Place place() const { return place_; }
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
/*! the place of memory block. */
platform::Place place_;
/*! the size of memory block. */
size_t size_;
/* the current type of memory */
std::type_index type_;
};
/*! holds the memory block if allocated. */
std::shared_ptr<Placeholder> holder_;
std::shared_ptr<memory::Allocation> holder_;
std::type_index type_;
/**
* @brief points to elements dimensions.
*
......
......@@ -23,10 +23,10 @@ namespace framework {
template <typename T>
inline const T* Tensor::data() const {
check_memory_size();
bool valid = std::is_same<T, void>::value ||
holder_->type() == std::type_index(typeid(T));
bool valid =
std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
type_.name());
return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
......@@ -37,10 +37,10 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
template <typename T>
inline T* Tensor::data() {
check_memory_size();
bool valid = std::is_same<T, void>::value ||
holder_->type() == std::type_index(typeid(T));
bool valid =
std::is_same<T, void>::value || type_ == std::type_index(typeid(T));
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
type_.name());
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
......
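A hedged sketch of the new type bookkeeping: mutable_data now records the element type in Tensor::type_ and allocates the holder through memory::AllocShared, and data<T>() validates T against type_ instead of asking the holder. The make_ddim call and CPUPlace are illustrative assumptions.

#include "paddle/fluid/framework/tensor.h"

void TensorSketch() {
  paddle::framework::Tensor t;
  t.Resize(paddle::framework::make_ddim({2, 3}));
  // Allocates 2 * 3 * sizeof(float) bytes via memory::AllocShared and sets type_.
  float* p = t.mutable_data<float>(paddle::platform::CPUPlace());
  p[0] = 1.0f;
  const float* q = t.data<float>();  // passes the type_ check
  // t.data<double>() would trip PADDLE_ENFORCE: "Tensor holds the wrong type ..."
  (void)q;
}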
add_subdirectory(detail)
cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
add_subdirectory(allocation)
cc_library(malloc SRCS malloc.cc DEPS allocator_facade)
cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(memory
DEPS
malloc
memcpy)
cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
#if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
#endif()
cc_library(allocator SRCS allocator.cc DEPS place)
cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator gpu_info)
if (WITH_GPU)
nv_test(best_fit_allocator_test
SRCS best_fit_allocator_test.cc
best_fit_allocator_test.cu
DEPS best_fit_allocator
locked_allocator
cpu_allocator
cuda_allocator
device_context
memcpy)
else()
cc_test(best_fit_allocator_test
SRCS best_fit_allocator_test.cc
DEPS best_fit_allocator
locked_allocator
cpu_allocator)
endif()
cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocator)
cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator)
else ()
set(AllocatorFacadeDeps)
endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS
${AllocatorFacadeDeps}
cpu_allocator
locked_allocator
best_fit_allocator
naive_managed_allocator
aligned_allocator)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
ThinAlignedAllocator::ThinAlignedAllocator(
std::shared_ptr<ManagedAllocator> underlying_allocator)
: underlying_allocator_(std::move(underlying_allocator)) {}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
template <size_t kAlignment>
class AlignedAllocation : public Allocation {
public:
AlignedAllocation(std::unique_ptr<Allocation>&& underlying_allocation,
size_t size)
: Allocation(AlignedPtr(underlying_allocation->ptr()), size,
underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)) {}
private:
static void* AlignedPtr(void* ptr) {
auto ptr_addr = reinterpret_cast<uintptr_t>(ptr);
ptr_addr = (ptr_addr & ~(kAlignment - 1)) + kAlignment;
return reinterpret_cast<void*>(ptr_addr);
}
std::unique_ptr<Allocation> underlying_allocation_;
};
class ThinAlignedAllocator : public ManagedAllocator {
public:
explicit ThinAlignedAllocator(
std::shared_ptr<ManagedAllocator> underlying_allocator);
protected:
std::shared_ptr<ManagedAllocator> underlying_allocator_;
};
template <size_t kAlignment>
class AlignedAllocator : public ThinAlignedAllocator {
public:
using ThinAlignedAllocator::ThinAlignedAllocator;
std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
auto raw_allocation =
underlying_allocator_->Allocate(size + kAlignment, attr);
return std::unique_ptr<Allocation>(
new AlignedAllocation<kAlignment>(std::move(raw_allocation), size));
}
std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
return std::shared_ptr<Allocation>(Allocate(size, attr).release());
}
};
} // namespace allocation
} // namespace memory
} // namespace paddle
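For reference, the AlignedPtr arithmetic above always bumps the pointer past the next kAlignment boundary, even when the input is already aligned, which is why Allocate requests size + kAlignment bytes from the underlying allocator. A minimal standalone sketch of the same computation (the Align64 helper is illustrative only):

#include <cstdint>
#include <cstdio>

// Mirrors AlignedAllocation::AlignedPtr for kAlignment == 64: clear the low
// bits, then unconditionally add one full alignment unit.
static void* Align64(void* ptr) {
  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
  addr = (addr & ~static_cast<std::uintptr_t>(63)) + 64;
  return reinterpret_cast<void*>(addr);
}

int main() {
  // 0x1001 -> 0x1040, and the already-aligned 0x1000 also moves to 0x1040.
  std::printf("%p %p\n", Align64(reinterpret_cast<void*>(0x1001)),
              Align64(reinterpret_cast<void*>(0x1000)));
  return 0;
}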
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
Allocation::~Allocation() {}
Allocator::~Allocator() {}
bool Allocator::IsAllocThreadSafe() const { return false; }
const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class BadAlloc : public std::exception {
public:
explicit BadAlloc(const std::string& msg) : msg_(msg) {}
const char* what() const noexcept override;
private:
std::string msg_;
};
class Allocation {
public:
Allocation(void* ptr, size_t size, platform::Place place)
: ptr_(ptr), size_(size), place_(place) {}
Allocation(const Allocation& o) = delete;
Allocation& operator=(const Allocation& o) = delete;
void* ptr() const { return ptr_; }
size_t size() const { return size_; }
const platform::Place& place() const { return place_; }
virtual ~Allocation();
private:
void* ptr_;
size_t size_;
platform::Place place_;
};
class Allocator {
public:
enum Attr {
kDefault = 0,
kTiny = 1,
kFixedHuge = 2,
kFluxHuge = 3,
kTmp = 4,
NumOfAttrs = 5
};
virtual ~Allocator();
virtual std::unique_ptr<Allocation> Allocate(
size_t size, Allocator::Attr attr = kDefault) = 0;
virtual bool IsAllocThreadSafe() const;
};
// Users need to invoke `Free` or `FreeUniquePtr` manually if the allocation
// comes from a manually managed allocator.
class UnmanagedAllocator : public Allocator {
public:
virtual void Free(Allocation* allocation) = 0;
void FreeUniquePtr(std::unique_ptr<Allocation> allocation) {
Free(allocation.get());
}
};
// The allocation will be managed by smart pointers
class ManagedAllocator : public Allocator {
public:
virtual std::shared_ptr<Allocation> AllocateShared(
size_t size, Allocator::Attr attr = kDefault) = 0;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
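A hedged sketch of the two ownership models defined above: an UnmanagedAllocator requires an explicit Free/FreeUniquePtr, while a ManagedAllocator returns a shared_ptr that releases the memory when the last reference dies. The two function parameters are placeholders for any concrete allocators, not types introduced by this commit.

#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"

void OwnershipSketch(paddle::memory::allocation::UnmanagedAllocator* unmanaged,
                     paddle::memory::allocation::ManagedAllocator* managed) {
  auto a = unmanaged->Allocate(256);       // std::unique_ptr<Allocation>
  unmanaged->FreeUniquePtr(std::move(a));  // caller frees explicitly
  auto b = managed->AllocateShared(256);   // std::shared_ptr<Allocation>
  b.reset();                               // freed by the managing allocator
}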
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/allocator.h"
#include <map>
#include <vector>
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#endif
namespace paddle {
namespace memory {
namespace allocation {
class AllocatorFacadePrivate {
public:
std::map<platform::Place, std::shared_ptr<ManagedAllocator>> allocators_;
std::vector<std::unique_ptr<Allocation>> pre_allocations_;
std::vector<std::shared_ptr<Allocator>> holding_allocators_;
~AllocatorFacadePrivate() {
// Specify destruct order.
pre_allocations_.clear();
allocators_.clear();
holding_allocators_.clear();
}
AllocatorFacadePrivate() {
InitCPUAllocator();
InitCUDAAllocator();
}
private:
void InitCPUAllocator() {
auto all = NaiveManagedAllocator::Create(
std::unique_ptr<Allocator>(new CPUAllocator()));
allocators_[platform::CPUPlace()] = all;
}
void InitCUDAAllocator() {
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
auto cuda_allocator =
NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
new CUDAAllocator(platform::CUDAPlace(dev_id))));
auto allocation = cuda_allocator->Allocate(platform::GpuMaxChunkSize());
auto allocator = NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
new LockedAllocator(std::unique_ptr<Allocator>(
new BestFitAllocator(allocation.get())))));
pre_allocations_.emplace_back(std::move(allocation));
holding_allocators_.emplace_back(cuda_allocator);
allocators_[platform::CUDAPlace(dev_id)] =
std::make_shared<AlignedAllocator<64>>(std::move(allocator));
}
#endif
}
};
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
AllocatorFacade::~AllocatorFacade() { delete m_; }
AllocatorFacade& AllocatorFacade::Instance() {
static AllocatorFacade instance;
return instance;
}
std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, Allocator::Attr attr) {
return m_->allocators_[place]->AllocateShared(size, attr);
}
std::unique_ptr<Allocation> AllocatorFacade::Alloc(const platform::Place& place,
size_t size,
Allocator::Attr attr) {
return m_->allocators_[place]->Allocate(size, attr);
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class AllocatorFacadePrivate;
class AllocatorFacade {
public:
~AllocatorFacade();
AllocatorFacade(const AllocatorFacade& o) = delete;
const AllocatorFacade& operator=(const AllocatorFacade& o) = delete;
static AllocatorFacade& Instance();
std::shared_ptr<Allocation> AllocShared(
const platform::Place& place, size_t size,
Allocator::Attr attr = Allocator::kDefault);
std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
Allocator::Attr attr = Allocator::kDefault);
private:
AllocatorFacade();
AllocatorFacadePrivate* m_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
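A hedged usage sketch of the facade, mirroring what the rewritten memory::Alloc/AllocShared in malloc.cc (further below) delegate to; the 1024-byte size and CPUPlace are arbitrary examples.

#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h"

void FacadeSketch() {
  namespace alloc = paddle::memory::allocation;
  paddle::platform::CPUPlace cpu;
  // Unique ownership: released through the allocator chain when the pointer dies.
  std::unique_ptr<alloc::Allocation> a =
      alloc::AllocatorFacade::Instance().Alloc(cpu, 1024);
  // Shared ownership, e.g. what Tensor::holder_ now stores.
  std::shared_ptr<alloc::Allocation> b =
      alloc::AllocatorFacade::Instance().AllocShared(cpu, 1024);
  void* raw = b->ptr();  // raw pointer into the allocation
  (void)a;
  (void)raw;
}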
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include <cmath>
#include <list>
#include <map>
#include <string>
namespace paddle {
namespace memory {
namespace allocation {
static int HighestBitPos(size_t N) {
if (UNLIKELY(N == 0)) {
return 0;
} else {
// NOTE: here we can use __builtin_clz in GCC.
// However, let's use std::log2 for better readability
// and trust std::log2's performance.
return static_cast<int>(std::log2(N) + 1);
}
}
BestFitAllocator::BestFitAllocator(Allocation* allocation)
: allocation_(allocation) {
details::Chunk chunk;
chunk.size_ = allocation_->size();
chunk.offset_ = 0;
chunk.is_free = true;
chunks_.emplace_back(chunk);
free_chunks_[HighestBitPos(chunk.size_)].insert(
{chunk.size_, chunks_.begin()});
}
std::unique_ptr<Allocation> BestFitAllocator::Allocate(size_t size, Attr attr) {
auto highest_set_bit = static_cast<size_t>(HighestBitPos(size));
MapIt map_it;
for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) {
map_it = free_chunks_[highest_set_bit].lower_bound(size);
if (map_it != free_chunks_[highest_set_bit].end()) {
break;
}
}
if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
throw BadAlloc(string::Sprintf(
"Cannot allocate %d, All fragments size is %d", size, FreeSize()));
}
auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
return std::unique_ptr<Allocation>(new BestFitAllocation(this, chunk_it));
}
size_t BestFitAllocator::FreeSize() const {
size_t acc = 0;
for (auto& array_item : free_chunks_) {
for (auto& pair : array_item) {
acc += pair.second->size_;
}
}
return acc;
}
BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size,
size_t free_chunk_offset,
MapIt bin_iterator) {
auto to_split_it = bin_iterator->second;
free_chunks_[free_chunk_offset].erase(bin_iterator);
PADDLE_ENFORCE(to_split_it->is_free);
PADDLE_ENFORCE_GE(to_split_it->size_, request_size);
auto remaining_size = to_split_it->size_ - request_size;
details::Chunk to_use;
details::Chunk remaining;
to_use.size_ = request_size;
to_use.is_free = false;
remaining.size_ = remaining_size;
remaining.is_free = true;
// calc offsets
to_use.offset_ = to_split_it->offset_;
remaining.offset_ = to_use.offset_ + to_use.size_;
// insert to chunk list
auto to_use_it = chunks_.insert(to_split_it, to_use);
if (remaining.size_ != 0) {
auto bit_size = static_cast<size_t>(HighestBitPos(remaining.size_));
free_chunks_[bit_size].insert(
{remaining.size_, chunks_.insert(to_split_it, remaining)});
}
chunks_.erase(to_split_it);
return to_use_it;
}
void BestFitAllocator::Free(Allocation* allocation) {
auto* bf_allocation = dynamic_cast<BestFitAllocation*>(allocation);
auto chunk_it = bf_allocation->ChunkIterator();
PADDLE_ENFORCE(!chunk_it->is_free);
chunk_it->is_free = true;
if (chunk_it != chunks_.begin()) {
auto prev_it = chunk_it;
--prev_it;
if (prev_it->is_free) {
// Merge Left.
EraseFreeNode(prev_it);
prev_it->size_ += chunk_it->size_;
chunks_.erase(chunk_it);
chunk_it = prev_it;
}
}
auto next_it = chunk_it;
++next_it;
if (next_it != chunks_.end() && next_it->is_free) {
EraseFreeNode(next_it);
chunk_it->size_ += next_it->size_;
chunks_.erase(next_it);
}
InsertFreeNode(chunk_it);
}
void BestFitAllocator::InsertFreeNode(const ListIt& it) {
auto pos = static_cast<size_t>(HighestBitPos(it->size_));
auto& free_map = free_chunks_[pos];
free_map.insert({it->size_, it});
}
void BestFitAllocator::EraseFreeNode(const ListIt& it) {
size_t pos = static_cast<size_t>(HighestBitPos(it->size_));
auto& free_map = free_chunks_[pos];
auto map_it = free_map.find(it->size_);
while (map_it != free_map.end() && map_it->second != it) {
++map_it;
}
PADDLE_ENFORCE(map_it != free_map.end());
free_map.erase(map_it);
}
size_t BestFitAllocator::NumFreeChunks() const {
size_t num = 0;
for (auto& array_item : free_chunks_) {
num += array_item.size();
}
return num;
}
BestFitAllocation::BestFitAllocation(
paddle::memory::allocation::BestFitAllocator* allocator,
typename details::ChunkList::iterator chunk_it)
: Allocation(reinterpret_cast<void*>(
reinterpret_cast<uintptr_t>(allocator->BasePtr()) +
chunk_it->offset_),
chunk_it->size_, allocator->Place()),
allocator_(allocator),
chunk_it_(chunk_it) {}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <array>
#include <list>
#include <map>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
namespace details {
struct Chunk {
bool is_free{true};
// Offset to the base allocation.
uintptr_t offset_;
size_t size_;
};
// Here we use std::list to maintain chunk list.
// NOTE(yy): The traditional implementation of a ChunkList is to add `prev`/`next`
// pointers to `Chunk` and split each allocation into a `ChunkHeader` and a
// `Payload`, such as
// *-------*---------------*---------------*--------------*
// | Chunk | prev_ pointer | next_ pointer | payload .... |
// *-------*---------------*---------------*--------------*
// That implementation can return a raw pointer, and the list structure can be
// recovered from it. However, the same code cannot be used for GPU memory,
// since the CPU cannot access GPU memory directly.
//
// So we choose to use `std::list` and return an allocation instance, which
// contains the list node iterator, then we can unify CPU/GPU code.
//
// Returning an allocation is not a bad idea, since Tensor/Vector should hold
// an allocation instead of a raw pointer directly.
using ChunkList = std::list<Chunk>;
// Here we use a multi-level map of free chunks.
// the map is
// MSB offset --> size --> [ChunkList::iterator]
//
// The time complexities:
// find a free chunk:
// O(logN),
// where N is the number of free nodes with the same MSB offset.
// find the position of a chunk iterator:
// O(logN + K),
// where N is the number of free nodes with the same MSB offset
// and K is the number of free nodes with the same size.
// insert a free chunk:
// O(logN),
// where N is the number of free nodes with the same MSB offset.
// erase a free chunk:
// O(1)
using FreeChunkBin =
std::array<std::multimap<size_t, ChunkList::iterator>, sizeof(size_t) * 8>;
} // namespace details
class BestFitAllocator;
// The BestFitAllocation maintains the list node iterator.
class BestFitAllocation : public Allocation {
private:
using ListIt = typename details::ChunkList::iterator;
public:
BestFitAllocation(BestFitAllocator* allocator, ListIt chunk_it);
const ListIt& ChunkIterator() const { return chunk_it_; }
private:
BestFitAllocator* allocator_;
typename details::ChunkList::iterator chunk_it_;
};
// TODO(yy): The current BestFitAllocator is not thread-safe. To make it
// thread-safe, it must be wrapped in a LockedAllocator. However, a thread-safe
// allocator could also lock each bin and the chunk list independently, which
// would make BestFitAllocator faster in multi-threaded situations.
//
// This allocator implements a best-fit strategy and merges adjacent free nodes.
//
// To allocate a buffer, it finds the best-fit chunk. If that chunk is larger
// than the requested size, the original block is split into two chunks: the
// first is handed out and the second is returned to the free chunks.
//
// To free an allocation, it marks the allocation's chunk as free and merges it
// with the previous and next chunks when possible.
class BestFitAllocator : public UnmanagedAllocator {
public:
explicit BestFitAllocator(Allocation* allocation);
void* BasePtr() const { return allocation_->ptr(); }
const platform::Place& Place() const { return allocation_->place(); }
std::unique_ptr<Allocation> Allocate(size_t size,
Attr attr = kDefault) override;
void Free(Allocation* allocation) override;
size_t NumFreeChunks() const;
private:
size_t FreeSize() const;
using MapIt = typename details::FreeChunkBin::value_type::iterator;
using ListIt = typename details::ChunkList::iterator;
ListIt SplitChunk(size_t request_size, size_t free_chunk_offset,
MapIt bin_iterator);
void EraseFreeNode(const ListIt& it);
void InsertFreeNode(const ListIt& it);
Allocation* allocation_; // not owned
details::ChunkList chunks_;
details::FreeChunkBin free_chunks_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
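To make the FreeChunkBin indexing above concrete, here is a small standalone sketch of the MSB bucketing performed by HighestBitPos in best_fit_allocator.cc: a free chunk of size N is filed under bin floor(log2(N)) + 1, so Allocate only scans bins large enough to possibly satisfy a request. The sample sizes are arbitrary.

#include <cmath>
#include <cstddef>
#include <cstdio>

// Same formula as HighestBitPos in best_fit_allocator.cc.
static int BinIndex(std::size_t n) {
  return n == 0 ? 0 : static_cast<int>(std::log2(n) + 1);
}

int main() {
  // e.g. sizes 1, 80, 1024 and 4096 land in bins 1, 7, 11 and 13.
  for (std::size_t n : {1u, 80u, 1024u, 4096u}) {
    std::printf("size %zu -> bin %d\n", n, BinIndex(n));
  }
  return 0;
}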
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class StubAllocation : public Allocation {
public:
explicit StubAllocation(size_t size)
: Allocation(nullptr, size, platform::CPUPlace()) {}
};
TEST(BestFitAllocator, test_allocation) {
StubAllocation stub(4UL * 1024 * 1024 * 1024);
BestFitAllocator allocator(&stub);
{
auto allocation = allocator.Allocate(64);
allocator.FreeUniquePtr(std::move(allocation));
}
{
auto allocation = allocator.Allocate(80);
{
auto best_fit_allocation =
dynamic_cast<BestFitAllocation*>(allocation.get());
ASSERT_NE(best_fit_allocation, nullptr);
ASSERT_FALSE(best_fit_allocation->ChunkIterator()->is_free);
ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0);
ASSERT_EQ(allocation->size(), 80);
ASSERT_EQ(allocation->ptr(), nullptr);
}
auto allocation2 = allocator.Allocate(60);
auto allocation3 = allocator.Allocate(90);
allocator.FreeUniquePtr(std::move(allocation2));
allocation2 = allocator.Allocate(30);
{
auto best_fit_allocation =
dynamic_cast<BestFitAllocation*>(allocation2.get());
ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80);
}
allocator.FreeUniquePtr(std::move(allocation2));
allocation2 = allocator.Allocate(60);
{
auto best_fit_allocation =
dynamic_cast<BestFitAllocation*>(allocation2.get());
ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80);
}
allocator.FreeUniquePtr(std::move(allocation));
allocator.FreeUniquePtr(std::move(allocation2));
allocation = allocator.Allocate(80 + 60);
{
auto best_fit_allocation =
dynamic_cast<BestFitAllocation*>(allocation.get());
ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0);
}
allocator.FreeUniquePtr(std::move(allocation));
allocation = allocator.Allocate(80);
allocation2 = allocator.Allocate(60);
allocator.FreeUniquePtr(std::move(allocation));
allocator.FreeUniquePtr(std::move(allocation3));
allocator.FreeUniquePtr(std::move(allocation2));
ASSERT_EQ(allocator.NumFreeChunks(), 1U);
}
}
TEST(BestFitAllocator, test_concurrent_cpu_allocation) {
CPUAllocator allocator;
auto global_allocation = allocator.Allocate(256UL * 1024 * 1024);
std::unique_ptr<Allocator> best_fit_allocator(
new BestFitAllocator(global_allocation.get()));
LockedAllocator locked_allocator(std::move(best_fit_allocator));
auto th_main = [&] {
std::random_device dev;
std::default_random_engine engine(dev());
std::uniform_int_distribution<size_t> dist(1U, 1024U);
for (size_t i = 0; i < 128; ++i) {
size_t allocate_size = dist(engine);
auto allocation =
locked_allocator.Allocate(sizeof(size_t) * allocate_size);
size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
for (size_t j = 0; j < allocate_size; ++j) {
data[j] = j;
}
std::this_thread::yield();
for (size_t j = 0; j < allocate_size; ++j) {
ASSERT_EQ(data[j], j);
}
locked_allocator.FreeUniquePtr(std::move(allocation));
}
};
{
std::vector<std::thread> threads;
for (size_t i = 0; i < 1024; ++i) {
threads.emplace_back(th_main);
}
for (auto& th : threads) {
th.join();
}
}
allocator.FreeUniquePtr(std::move(global_allocation));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace memory {
namespace allocation {
struct ForEachFill {
size_t* ptr_;
explicit ForEachFill(size_t* ptr) : ptr_(ptr) {}
__device__ void operator()(size_t i) { ptr_[i] = i; }
};
TEST(BestFitAllocator, concurrent_cuda) {
CUDAAllocator allocator(platform::CUDAPlace(0));
// 256 MB
auto cuda_allocation = allocator.Allocate(256U * 1024 * 1024);
LockedAllocator concurrent_allocator(
std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));
auto th_main = [&] {
std::random_device dev;
std::default_random_engine engine(dev());
std::uniform_int_distribution<size_t> dist(1U, 1024U);
platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu);
std::array<size_t, 1024> buf;
for (size_t i = 0; i < 128; ++i) {
size_t allocate_size = dist(engine);
auto allocation =
concurrent_allocator.Allocate(sizeof(size_t) * allocate_size);
size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
ForEachFill fill(data);
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
allocate_size);
for_range(fill);
memory::Copy(platform::CPUPlace(), buf.data(), gpu, data,
sizeof(size_t) * allocate_size, dev_ctx.stream());
dev_ctx.Wait();
for (size_t j = 0; j < allocate_size; ++j) {
ASSERT_EQ(buf[j], j);
}
concurrent_allocator.FreeUniquePtr(std::move(allocation));
}
};
{
std::vector<std::thread> threads;
for (size_t i = 0; i < 1024; ++i) {
threads.emplace_back(th_main);
}
for (auto& th : threads) {
th.join();
}
}
allocator.FreeUniquePtr(std::move(cuda_allocation));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include <stdlib.h>
#include <string>
namespace paddle {
namespace memory {
namespace allocation {
std::unique_ptr<Allocation> CPUAllocator::Allocate(size_t size, Attr attr) {
void* ptr;
auto status = posix_memalign(&ptr, kAlignment, size);
if (UNLIKELY(status != 0)) {
throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d",
size, status));
}
return std::unique_ptr<Allocation>(new CPUAllocation(ptr, size));
}
void CPUAllocator::Free(Allocation* allocation) {
PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUAllocation*>(allocation));
free(allocation->ptr());
}
bool CPUAllocator::IsAllocThreadSafe() const { return true; }
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class CPUAllocation : public Allocation {
public:
CPUAllocation(void* ptr, size_t size)
: Allocation(ptr, size, platform::CPUPlace()) {}
};
class CPUAllocator : public UnmanagedAllocator {
public:
constexpr static size_t kAlignment = 64u;
std::unique_ptr<Allocation> Allocate(size_t size,
Attr attr = kDefault) override;
void Free(Allocation* allocation) override;
bool IsAllocThreadSafe() const override;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <string>
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace memory {
namespace allocation {
class CUDADeviceGuard {
public:
explicit CUDADeviceGuard(int dev_id) {
int prev_id = platform::GetCurrentDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
platform::SetDeviceId(dev_id);
}
}
~CUDADeviceGuard() {
if (prev_id_ != -1) {
platform::SetDeviceId(prev_id_);
}
}
private:
int prev_id_{-1};
};
std::unique_ptr<Allocation> CUDAAllocator::Allocate(size_t size, Attr attr) {
CUDADeviceGuard guard(place_.device);
void* ptr;
auto status = cudaMalloc(&ptr, size);
if (UNLIKELY(status != cudaSuccess)) {
throw BadAlloc(string::Sprintf(
"Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
status, cudaGetErrorString(status)));
}
return std::unique_ptr<Allocation>(
new CUDAAllocation(ptr, size, platform::Place(place_)));
}
void CUDAAllocator::Free(Allocation* allocation) {
auto* cuda_allocation = dynamic_cast<CUDAAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(cuda_allocation);
PADDLE_ENFORCE_EQ(boost::get<platform::CUDAPlace>(cuda_allocation->place()),
place_);
PADDLE_ENFORCE(cudaFree(allocation->ptr()));
}
bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
// Just a flag type.
class CUDAAllocation : public Allocation {
public:
using Allocation::Allocation;
};
class CUDAAllocator : public UnmanagedAllocator {
public:
explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {}
explicit CUDAAllocator(const platform::Place& place)
: place_(boost::get<platform::CUDAPlace>(place)) {}
std::unique_ptr<Allocation> Allocate(size_t size,
Attr attr = kDefault) override;
void Free(Allocation* allocation) override;
bool IsAllocThreadSafe() const override;
private:
platform::CUDAPlace place_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/locked_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
std::unique_ptr<Allocation> LockedAllocator::Allocate(size_t size, Attr attr) {
if (underlying_allocator_->IsAllocThreadSafe()) {
return underlying_allocator_->Allocate(size, attr);
} else {
std::lock_guard<std::mutex> guard(mtx_);
return underlying_allocator_->Allocate(size, attr);
}
}
void LockedAllocator::Free(Allocation *allocation) {
if (underlying_allocator_->IsAllocThreadSafe()) {
return underlying_allocator_->Free(allocation);
} else {
std::lock_guard<std::mutex> guard(mtx_);
return underlying_allocator_->Free(allocation);
}
}
bool LockedAllocator::IsAllocThreadSafe() const { return true; }
LockedAllocator::LockedAllocator(
std::unique_ptr<Allocator> &&underlying_allocator) {
auto *allocator =
dynamic_cast<UnmanagedAllocator *>(underlying_allocator.get());
PADDLE_ENFORCE_NOT_NULL(allocator);
underlying_allocator.release();
underlying_allocator_.reset(allocator);
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <thread> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class LockedAllocator : public UnmanagedAllocator {
public:
explicit LockedAllocator(std::unique_ptr<Allocator>&& underlying_allocator);
std::unique_ptr<Allocation> Allocate(size_t size,
Attr attr = kDefault) override;
void Free(Allocation* allocation) override;
bool IsAllocThreadSafe() const override;
private:
std::unique_ptr<UnmanagedAllocator> underlying_allocator_;
std::mutex mtx_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
NaiveManagedAllocator::NaiveManagedAllocator(
std::unique_ptr<Allocator> &&allocator) {
auto *underlying_allocator =
dynamic_cast<UnmanagedAllocator *>(allocator.get());
PADDLE_ENFORCE_NOT_NULL(underlying_allocator);
allocator.release();
Init(std::unique_ptr<UnmanagedAllocator>(underlying_allocator));
}
NaiveManagedAllocator::NaiveManagedAllocator(
std::unique_ptr<UnmanagedAllocator> &&allocator) {
Init(std::move(allocator));
}
void NaiveManagedAllocator::Init(
std::unique_ptr<UnmanagedAllocator> &&allocator) {
underlying_allocator_ = std::move(allocator);
}
bool NaiveManagedAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
std::unique_ptr<Allocation> NaiveManagedAllocator::Allocate(size_t size,
Attr attr) {
std::unique_ptr<Allocation> allocation =
underlying_allocator_->Allocate(size, attr);
return std::unique_ptr<Allocation>(
new NaiveManagedAllocation(std::move(allocation), shared_from_this()));
}
std::shared_ptr<Allocation> NaiveManagedAllocator::AllocateShared(size_t size,
Attr attr) {
std::unique_ptr<Allocation> allocation =
underlying_allocator_->Allocate(size, attr);
return std::shared_ptr<Allocation>(
new NaiveManagedAllocation(std::move(allocation), shared_from_this()));
}
NaiveManagedAllocation::~NaiveManagedAllocation() {
auto allocator = allocator_.lock();
if (UNLIKELY(allocator == nullptr)) {
// The allocator was destroyed before its allocations; do nothing.
return;
}
// invoke Free
allocator->UnderlyingAllocator().FreeUniquePtr(
std::move(underlying_allocation_));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class NaiveManagedAllocator;
class NaiveManagedAllocation : public Allocation {
public:
NaiveManagedAllocation(std::unique_ptr<Allocation>&& underlying_allocation,
std::shared_ptr<NaiveManagedAllocator> allocator)
: Allocation(underlying_allocation->ptr(), underlying_allocation->size(),
underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)),
allocator_(allocator) {}
~NaiveManagedAllocation() final;
private:
std::unique_ptr<Allocation> underlying_allocation_;
std::weak_ptr<NaiveManagedAllocator> allocator_;
};
class NaiveManagedAllocator
: public ManagedAllocator,
public std::enable_shared_from_this<NaiveManagedAllocator> {
public:
template <typename... ARGS>
static std::shared_ptr<ManagedAllocator> Create(ARGS... args) {
return std::static_pointer_cast<ManagedAllocator>(
std::shared_ptr<NaiveManagedAllocator>(
new NaiveManagedAllocator(std::move(args)...)));
}
inline UnmanagedAllocator& UnderlyingAllocator() {
return *underlying_allocator_;
}
bool IsAllocThreadSafe() const override;
std::unique_ptr<Allocation> Allocate(size_t size,
Attr attr = kDefault) override;
std::shared_ptr<Allocation> AllocateShared(size_t size,
Attr attr = kDefault) override;
private:
explicit NaiveManagedAllocator(std::unique_ptr<Allocator>&& allocator);
explicit NaiveManagedAllocator(
std::unique_ptr<UnmanagedAllocator>&& allocator);
void Init(std::unique_ptr<UnmanagedAllocator>&& allocator);
std::unique_ptr<UnmanagedAllocator> underlying_allocator_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
#include <atomic> // NOLINT
#include <random>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
namespace paddle {
namespace memory {
namespace allocation {
class StubAllocator : public UnmanagedAllocator {
public:
std::unique_ptr<Allocation> Allocate(size_t size,
Attr attr = kDefault) override {
counter_.fetch_add(1);
return std::unique_ptr<Allocation>(
new Allocation(nullptr, size, platform::CPUPlace()));
}
void Free(Allocation* allocation) override { counter_.fetch_sub(1); }
bool IsAllocThreadSafe() const override { return true; }
std::atomic<int> counter_{0};
};
TEST(NaiveManagedAllocator, main) {
auto allocator = NaiveManagedAllocator::Create(
std::unique_ptr<Allocator>(new StubAllocator()));
auto th_main = [=] {
std::random_device dev;
std::default_random_engine engine(dev());
std::uniform_int_distribution<int> dist(0, 1);
std::vector<std::shared_ptr<Allocation>> allocations;
for (int j = 0; j < 1024; ++j) {
bool to_insert = static_cast<bool>(dist(engine));
if (to_insert) {
allocations.emplace_back(allocator->AllocateShared(10));
} else {
if (!allocations.empty()) {
allocations.pop_back();
}
}
}
};
{
std::vector<std::thread> threads;
for (size_t i = 0; i < 1024; ++i) {
threads.emplace_back(th_main);
}
for (auto& th : threads) {
th.join();
}
}
ASSERT_EQ(reinterpret_cast<StubAllocator&>(
std::dynamic_pointer_cast<NaiveManagedAllocator>(allocator)
->UnderlyingAllocator())
.counter_,
0);
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -14,13 +14,9 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/malloc.h"
#include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by "
......@@ -33,172 +29,14 @@ DECLARE_double(fraction_of_gpu_memory_to_use);
namespace paddle {
namespace memory {
using BuddyAllocator = detail::BuddyAllocator;
BuddyAllocator* GetCPUBuddyAllocator() {
static std::once_flag init_flag;
static detail::BuddyAllocator* a = nullptr;
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
});
return a;
}
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p;
return p;
}
template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
template <>
size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
return GetCPUBuddyAllocator()->Used();
}
#ifdef PADDLE_WITH_CUDA
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag;
static detail::BuddyAllocator** a_arr = nullptr;
std::call_once(init_flag, [gpu_id]() {
int gpu_num = platform::GetCUDADeviceCount();
PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
gpu_num);
a_arr = new BuddyAllocator*[gpu_num];
for (int i = 0; i < gpu_num; i++) {
a_arr[i] = nullptr;
platform::SetDeviceId(i);
a_arr[i] = new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
});
platform::SetDeviceId(gpu_id);
return a_arr[gpu_id];
}
template <>
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
size_t size, Allocator::Attr attr) {
return allocation::AllocatorFacade::Instance().AllocShared(place, size, attr);
}
template <>
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
platform::GpuMemoryUsage(&avail, &total);
LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
<< place.device << ", available " << avail << " bytes";
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize();
LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize();
LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
platform::SetDeviceId(cur_dev);
}
if (FLAGS_init_allocated_mem) {
cudaMemset(ptr, 0xEF, size);
}
return ptr;
std::unique_ptr<Allocation> Alloc(const platform::Place& place, size_t size,
Allocator::Attr attr) {
return allocation::AllocatorFacade::Instance().Alloc(place, size, attr);
}
template <>
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
}
BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
static std::once_flag init_flag;
static BuddyAllocator* ba = nullptr;
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::CUDAPinnedAllocator),
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize());
});
return ba;
}
template <>
size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
return GetCUDAPinnedBuddyAllocator()->Used();
}
template <>
void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
size_t size) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
void* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace";
}
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr;
}
template <>
void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
GetCUDAPinnedBuddyAllocator()->Free(p);
}
#endif
size_t Usage::operator()(const platform::CPUPlace& cpu) const {
return Used(cpu);
}
size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
return Used(gpu);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
return Used(cuda_pinned);
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
size_t memory_usage(const platform::Place& p) {
return boost::apply_visitor(Usage(), p);
}
} // namespace memory
} // namespace paddle
......@@ -14,91 +14,21 @@ limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
using allocation::Allocation;
using allocation::Allocator;
/**
* \brief Allocate memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] size Allocation size.
*
* \return Allocated memory block address.
*
* \note If return nullptr, it indicates memory allocation failed
* because insufficient memory in current system. When Alloc
* function is invoked, you must check the returned memory
* address is valid or not.
*/
template <typename Place>
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] ptr Memory block address to free.
*
*/
template <typename Place>
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
*
* \param[in] place Allocation place (CPU or GPU).
*
*/
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
/**
* \brief Free memory block in one place.
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
};
/**
* \brief Free memory block in one place does not meet POD
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
extern std::shared_ptr<Allocation> AllocShared(
const platform::Place& place, size_t size,
Allocator::Attr attr = Allocator::kDefault);
private:
Place place_;
};
extern std::unique_ptr<Allocation> Alloc(
const platform::Place& place, size_t size,
Allocator::Attr attr = Allocator::kDefault);
} // namespace memory
} // namespace paddle
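A minimal sketch of how callers are expected to use the new interface declared above; the function name and the 4096-byte sizes are illustrative only.
#include <memory>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"
void MallocApiSketch() {
  paddle::platform::CPUPlace cpu;
  // Exclusive ownership: the memory is returned automatically when the
  // unique_ptr releases the Allocation.
  std::unique_ptr<paddle::memory::Allocation> a =
      paddle::memory::Alloc(cpu, 4096);
  void* raw = a->ptr();  // raw pointer for kernels / memcpy
  // Shared ownership: useful when several consumers alias one buffer.
  std::shared_ptr<paddle::memory::Allocation> s =
      paddle::memory::AllocShared(cpu, 4096);
  (void)raw;
  (void)s;
}
Unlike the removed Alloc<Place>/Free<Place> templates, the lifetime of the memory is tied to the returned Allocation object, so explicit Free calls and the PODDeleter/PlainDeleter helpers are no longer needed.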
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
inline bool is_aligned(void const *p) {
return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
}
size_t align(size_t size, paddle::platform::CPUPlace place) {
size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CpuMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
TEST(BuddyAllocator, CPUAllocation) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::CPUPlace cpu;
p = paddle::memory::Alloc(cpu, 4096);
EXPECT_NE(p, nullptr);
paddle::platform::Place place = cpu;
EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
paddle::memory::Free(cpu, p);
}
TEST(BuddyAllocator, CPUMultAlloc) {
paddle::platform::CPUPlace cpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(cpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(size, cpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(cpu, p.first);
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(p.second, cpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
}
#ifdef PADDLE_WITH_CUDA
size_t align(size_t size, paddle::platform::CUDAPlace place) {
size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::GpuMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
TEST(BuddyAllocator, GPUAllocation) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::CUDAPlace gpu(0);
p = paddle::memory::Alloc(gpu, 4096);
EXPECT_NE(p, nullptr);
paddle::platform::Place place = gpu;
EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place));
paddle::memory::Free(gpu, p);
}
TEST(BuddyAllocator, GPUMultAlloc) {
paddle::platform::CUDAPlace gpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(gpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(gpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(gpu) == total_size) continue;
size_t aligned_size = align(size, gpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(gpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(gpu, p.first);
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(gpu) == total_size) continue;
size_t aligned_size = align(p.second, gpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(gpu));
}
}
size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
TEST(BuddyAllocator, CUDAPinnedAllocator) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::CUDAPinnedPlace cpu;
p = paddle::memory::Alloc(cpu, 4096);
EXPECT_NE(p, nullptr);
paddle::platform::Place place = cpu;
EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
paddle::memory::Free(cpu, p);
}
TEST(BuddyAllocator, CUDAPinnedMultAllocator) {
paddle::platform::CUDAPinnedPlace cpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(cpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(size, cpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(cpu, p.first);
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(p.second, cpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
}
#endif
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/gather.cu.h"
......@@ -57,22 +58,18 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
T *keys_out = value_out->mutable_data<T>({num}, ctx.GetPlace());
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
// Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
d_temp_storage = memory::Alloc(place, temp_storage_bytes);
auto d_temp_storage =
memory::Alloc(place, temp_storage_bytes, memory::Allocator::kTmp);
// Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>(
d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out,
num);
memory::Free(place, d_temp_storage);
d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
idx_out, num);
}
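The hunk above keeps CUB's query-then-run convention: the first call receives a null temporary buffer and only fills temp_storage_bytes, the second call runs with real scratch storage, which is now owned by an Allocation instead of a raw Alloc/Free pair. A condensed sketch of the same pattern in a .cu translation unit; SortKeysDescending and the helper name here are illustrative, and stream handling is omitted.
// Sketch of the query-then-run CUB pattern with RAII temporary storage.
#include "cub/cub.cuh"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"
void CubTempStorageSketch(const int* d_keys_in, int* d_keys_out, int num,
                          const paddle::platform::CUDAPlace& place) {
  size_t temp_bytes = 0;
  // First call: temp storage is nullptr, only temp_bytes is written.
  cub::DeviceRadixSort::SortKeysDescending(nullptr, temp_bytes, d_keys_in,
                                           d_keys_out, num);
  // Second call: scratch memory frees itself when `tmp` goes out of scope.
  // kTmp is the attribute used for this scratch buffer in the hunk above.
  auto tmp = paddle::memory::Alloc(place, temp_bytes,
                                   paddle::memory::Allocator::kTmp);
  cub::DeviceRadixSort::SortKeysDescending(tmp->ptr(), temp_bytes, d_keys_in,
                                           d_keys_out, num);
}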
template <typename T>
......@@ -248,11 +245,12 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const T *boxes = proposals.data<T>();
auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
uint64_t *d_mask =
reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
auto d_mask_allocation = memory::Alloc(place, size_bytes);
uint64_t *d_mask = reinterpret_cast<uint64_t *>(d_mask_allocation->ptr());
NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
uint64_t *h_mask = reinterpret_cast<uint64_t *>(
memory::Alloc(platform::CPUPlace(), size_bytes));
auto h_mask_allocation = memory::Alloc(platform::CPUPlace(), size_bytes);
uint64_t *h_mask = reinterpret_cast<uint64_t *>(h_mask_allocation->ptr());
memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
std::vector<uint64_t> remv(col_blocks);
......
......@@ -87,13 +87,16 @@ TEST(StridedMemcpy, GPUCrop) {
platform::CUDADeviceContext ctx(gpu0);
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
auto src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
framework::DDim src_stride({5, 1});
int dst[4];
int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
auto dst_allocation = memory::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({2, 1});
......@@ -108,9 +111,6 @@ TEST(StridedMemcpy, GPUCrop) {
ASSERT_EQ(2, dst[1]);
ASSERT_EQ(3, dst[2]);
ASSERT_EQ(4, dst[3]);
memory::Free(gpu0, gpu_dst);
memory::Free(gpu0, gpu_src);
}
TEST(StridedMemcpy, GPUConcat) {
......@@ -124,12 +124,13 @@ TEST(StridedMemcpy, GPUConcat) {
platform::CUDAPlace gpu0(0);
platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
int dst[8];
int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
auto gpu_dst_allocation = memory::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
framework::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2});
......@@ -151,9 +152,6 @@ TEST(StridedMemcpy, GPUConcat) {
for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
ASSERT_EQ(expect_dst[i], dst[i]);
}
memory::Free(gpu0, gpu_dst);
memory::Free(gpu0, gpu_src);
}
#endif
......
......@@ -112,11 +112,15 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
}
void* allocate(size_t num_bytes) const override {
return paddle::memory::Alloc(place_, num_bytes);
auto buf =
paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kTiny);
void* retv = buf->ptr();
allocations_[buf->ptr()] = std::move(buf);
return retv;
}
void deallocate(void* buffer) const override {
paddle::memory::Free(place_, buffer);
allocations_.erase(allocations_.find(buffer));
}
void* scratchpad() const override {
......@@ -143,12 +147,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
const cudaDeviceProp* device_prop_; // not owned;
mutable void* scratch_;
mutable unsigned int* semaphore_;
mutable std::unordered_map<void*, std::unique_ptr<memory::Allocation>>
allocations_;
};
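Eigen's StreamInterface hands back only a raw pointer in deallocate(), so the class above keeps each Allocation in allocations_, keyed by its ptr(), to recover ownership later. A stripped-down sketch of that adapter pattern; RawPointerAdapterSketch is illustrative, not a class from this patch.
#include <memory>
#include <unordered_map>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"
// Maps a raw-pointer allocate/free interface onto RAII Allocations.
class RawPointerAdapterSketch {
 public:
  explicit RawPointerAdapterSketch(paddle::platform::CUDAPlace place)
      : place_(place) {}
  void* allocate(size_t bytes) {
    auto buf = paddle::memory::Alloc(place_, bytes);
    void* p = buf->ptr();
    owned_[p] = std::move(buf);  // keep ownership until deallocate(p)
    return p;
  }
  void deallocate(void* p) {
    owned_.erase(p);  // destroys the Allocation, returning the memory
  }
 private:
  paddle::platform::CUDAPlace place_;
  std::unordered_map<void*, std::unique_ptr<paddle::memory::Allocation>> owned_;
};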
class CudnnHolder {
public:
CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place)
: workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) {
: workspace_(nullptr), stream_(stream), place_(place) {
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
}
......@@ -158,36 +164,38 @@ class CudnnHolder {
void RunFunc(const std::function<void(void*)>& cudnn_func,
size_t required_workspace_len) {
std::lock_guard<std::mutex> lock(mtx_);
if (required_workspace_len > workspace_len_) {
if (required_workspace_len > WorkspaceSize()) {
ReallocateWorkspace(required_workspace_len);
}
cudnn_func(workspace_);
cudnn_func(workspace_->ptr());
}
~CudnnHolder() {
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
if (workspace_ != nullptr) {
paddle::memory::Free(place_, workspace_);
~CudnnHolder() { PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); }
private:
size_t WorkspaceSize() const {
if (workspace_ == nullptr) {
return 0;
} else {
return workspace_->size();
}
}
private:
void ReallocateWorkspace(size_t required_workspace_len) {
if (required_workspace_len <= workspace_len_) {
if (required_workspace_len <= WorkspaceSize()) {
return;
}
if (workspace_ != nullptr) {
// Maybe someone is using the current workspace
PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
paddle::memory::Free(place_, workspace_);
workspace_.reset();
}
workspace_ = paddle::memory::Alloc(place_, required_workspace_len);
workspace_len_ = required_workspace_len;
workspace_ = paddle::memory::Alloc(place_, required_workspace_len,
memory::Allocator::kFluxHuge);
}
cudnnHandle_t cudnn_handle_;
void* workspace_;
size_t workspace_len_;
std::unique_ptr<memory::Allocation> workspace_;
const cudaStream_t* stream_; // not owned;
const CUDAPlace place_;
......
......@@ -39,7 +39,6 @@ class Multiply {
} // namespace
using paddle::memory::Alloc;
using paddle::memory::Free;
using paddle::memory::Copy;
using paddle::platform::CPUPlace;
......@@ -63,13 +62,13 @@ TEST(Transform, GPUUnary) {
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
Transform<CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx.Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
}
......@@ -89,13 +88,13 @@ TEST(Transform, GPUBinary) {
int buf[4] = {1, 2, 3, 4};
CUDAPlace gpu0(0);
CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
auto gpu_allocation = Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
Transform<CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx.Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]);
}
......
......@@ -41,4 +41,5 @@ limitations under the License. */
#include <boost/any.hpp>
#include <boost/mpl/comparison.hpp>
#include <boost/mpl/less_equal.hpp>
#include <boost/optional.hpp>
#include <boost/variant.hpp>
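boost/optional.hpp is added here, presumably for interfaces that may or may not have a CUDA place to report. A minimal, patch-independent reminder of the type's contract:
#include <boost/optional.hpp>
#include "paddle/fluid/platform/place.h"
boost::optional<paddle::platform::CUDAPlace> MaybeCudaPlace(bool on_gpu) {
  if (on_gpu) {
    return paddle::platform::CUDAPlace(0);
  }
  return boost::none;  // empty optional: no place available
}
void UseMaybeCudaPlace() {
  auto place = MaybeCudaPlace(true);
  if (place) {                      // test whether a value is present
    int device_id = place->device;  // access members through operator->
    (void)device_id;
  }
}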
......@@ -27,8 +27,7 @@ int main(int argc, char** argv) {
new_argv.push_back(argv[i]);
}
#ifdef PADDLE_WITH_CUDA
new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
new_argv.push_back(strdup("--tryfromenv=fraction_of_gpu_memory_to_use"));
#else
new_argv.push_back(strdup(
"--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb"));
......@@ -37,12 +36,6 @@ int main(int argc, char** argv) {
int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data();
google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
paddle::memory::Used(paddle::platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
paddle::memory::Used(paddle::platform::CUDAPlace(0));
#endif
paddle::framework::InitDevices(true);
return RUN_ALL_TESTS();
}
......@@ -110,10 +110,10 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
"dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb'
'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope',
'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic',
'eager_delete_tensor_gb'
]
if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline')
......