From 58ed412f68049096421db2fa2c87b045877b81a5 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 28 Sep 2018 11:16:30 +0800 Subject: [PATCH] refactor(memory): rewrite memory allocation and make it extentable Use OO style to rewrite memory allocation. --- .../framework/details/exception_holder.h | 2 + paddle/fluid/framework/executor.cc | 12 -- paddle/fluid/framework/lod_tensor.h | 3 - paddle/fluid/framework/mixed_vector.h | 89 ++------ paddle/fluid/framework/tensor.cc | 27 +-- paddle/fluid/framework/tensor.h | 59 +----- paddle/fluid/framework/tensor_impl.h | 12 +- paddle/fluid/memory/CMakeLists.txt | 7 +- paddle/fluid/memory/allocation/CMakeLists.txt | 43 ++++ .../memory/allocation/aligned_allocator.cc | 26 +++ .../memory/allocation/aligned_allocator.h | 68 ++++++ paddle/fluid/memory/allocation/allocator.cc | 29 +++ paddle/fluid/memory/allocation/allocator.h | 93 ++++++++ .../memory/allocation/allocator_facade.cc | 102 +++++++++ .../memory/allocation/allocator_facade.h | 47 +++++ .../memory/allocation/best_fit_allocator.cc | 169 +++++++++++++++ .../memory/allocation/best_fit_allocator.h | 132 ++++++++++++ .../allocation/best_fit_allocator_test.cc | 144 +++++++++++++ .../allocation/best_fit_allocator_test.cu | 88 ++++++++ .../fluid/memory/allocation/cpu_allocator.cc | 40 ++++ .../fluid/memory/allocation/cpu_allocator.h | 38 ++++ .../fluid/memory/allocation/cuda_allocator.cc | 69 ++++++ .../fluid/memory/allocation/cuda_allocator.h | 45 ++++ .../memory/allocation/locked_allocator.cc | 49 +++++ .../memory/allocation/locked_allocator.h | 38 ++++ .../allocation/naive_managed_allocator.cc | 69 ++++++ .../allocation/naive_managed_allocator.h | 71 +++++++ .../naive_managed_allocator_test.cc | 80 +++++++ paddle/fluid/memory/malloc.cc | 178 +--------------- paddle/fluid/memory/malloc.h | 90 +------- paddle/fluid/memory/malloc_test.cc | 198 ------------------ .../detection/generate_proposals_op.cu | 24 +-- paddle/fluid/operators/strided_memcpy_test.cc | 20 +- paddle/fluid/platform/device_context.cc | 40 ++-- paddle/fluid/platform/transform_test.cu | 9 +- paddle/fluid/platform/variant.h | 1 + paddle/testing/paddle_gtest_main.cc | 9 +- python/paddle/fluid/__init__.py | 8 +- 38 files changed, 1552 insertions(+), 676 deletions(-) create mode 100644 paddle/fluid/memory/allocation/CMakeLists.txt create mode 100644 paddle/fluid/memory/allocation/aligned_allocator.cc create mode 100644 paddle/fluid/memory/allocation/aligned_allocator.h create mode 100644 paddle/fluid/memory/allocation/allocator.cc create mode 100644 paddle/fluid/memory/allocation/allocator.h create mode 100644 paddle/fluid/memory/allocation/allocator_facade.cc create mode 100644 paddle/fluid/memory/allocation/allocator_facade.h create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator.cc create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator.h create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator_test.cc create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator_test.cu create mode 100644 paddle/fluid/memory/allocation/cpu_allocator.cc create mode 100644 paddle/fluid/memory/allocation/cpu_allocator.h create mode 100644 paddle/fluid/memory/allocation/cuda_allocator.cc create mode 100644 paddle/fluid/memory/allocation/cuda_allocator.h create mode 100644 paddle/fluid/memory/allocation/locked_allocator.cc create mode 100644 paddle/fluid/memory/allocation/locked_allocator.h create mode 100644 paddle/fluid/memory/allocation/naive_managed_allocator.cc create mode 100644 paddle/fluid/memory/allocation/naive_managed_allocator.h create mode 100644 paddle/fluid/memory/allocation/naive_managed_allocator_test.cc delete mode 100644 paddle/fluid/memory/malloc_test.cc diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index c97b364de1e..1b1afce04eb 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -30,6 +30,8 @@ class ExceptionHolder { Catch(exp); } catch (platform::EnforceNotMet exp) { Catch(exp); + } catch (std::exception& ex) { + LOG(FATAL) << "std::exception caught, " << ex.what(); } catch (...) { LOG(FATAL) << "Unknown exception caught"; } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8d8042a0563..59389f5c074 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -395,11 +395,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, if (!erase_tensors.empty()) gc->Add(erase_tensors); } } - - if (FLAGS_benchmark) { - VLOG(2) << "Memory used after operator " + op->Type() + " running: " - << memory::memory_usage(place_); - } } if (gc != nullptr) { @@ -421,13 +416,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, scope->DropKids(); } } - - if (FLAGS_benchmark) { - VLOG(2) << "-------------------------------------------------------"; - VLOG(2) << "Memory used after deleting local scope: " - << memory::memory_usage(place_); - VLOG(2) << "-------------------------------------------------------"; - } } void Executor::RunPreparedContext( diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index e9b473d5472..fb6e781fd07 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -111,9 +111,6 @@ class LoDTensor : public Tensor { public: LoDTensor() : Tensor() {} - /* Constructor with place should only be used in pybind */ - explicit LoDTensor(const platform::Place& place) : Tensor(place) {} - explicit LoDTensor(const LoD& lod) : lod_(lod) {} void set_lod(const LoD& lod) { lod_ = lod; } diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 77386f4f069..cbaa80dffac 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" #include "glog/logging.h" @@ -31,46 +32,6 @@ namespace paddle { namespace framework { #if defined(PADDLE_WITH_CUDA) -namespace details { -struct CUDABuffer { - void *data_{nullptr}; - size_t size_{0}; - platform::CUDAPlace place_; - - CUDABuffer() {} - CUDABuffer(platform::Place place, size_t size) - : size_(size), place_(boost::get(place)) { - data_ = memory::Alloc(place_, size); - } - - ~CUDABuffer() { ClearMemory(); } - - CUDABuffer(const CUDABuffer &o) = delete; - CUDABuffer &operator=(const CUDABuffer &o) = delete; - - void Resize(platform::Place place, size_t size) { - ClearMemory(); - place_ = boost::get(place); - data_ = memory::Alloc(place_, size); - PADDLE_ENFORCE_NOT_NULL(data_); - size_ = size; - } - - void Swap(CUDABuffer &o) { - std::swap(data_, o.data_); - std::swap(place_, o.place_); - std::swap(size_, o.size_); - } - - private: - void ClearMemory() const { - if (data_ != nullptr) { - memory::Free(place_, data_); - } - } -}; -} // namespace details - // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template @@ -103,8 +64,6 @@ class Vector { o.ImmutableCPU(); cpu_ = o.cpu_; flag_ = kDataInCPU; - details::CUDABuffer null; - gpu_.Swap(null); return *this; } @@ -199,7 +158,7 @@ class Vector { PADDLE_ENFORCE(platform::is_gpu_place(place), "CUDA Data must on CUDA place"); ImmutableCUDA(place); - return reinterpret_cast(gpu_.data_); + return reinterpret_cast(gpu_->ptr()); } // get cuda ptr. mutable @@ -234,13 +193,11 @@ class Vector { std::mutex &Mutex() const { return mtx_; } - std::unique_ptr CUDAPlace() const { - if (gpu_.data_ == nullptr) { - return nullptr; - } else { - return std::unique_ptr( - new platform::CUDAPlace(gpu_.place_)); - } + boost::optional CUDAPlace() const { + return gpu_ == nullptr + ? boost::none + : boost::optional( + boost::get(gpu_->place())); } private: @@ -254,13 +211,12 @@ class Vector { void CopyToCPU() const { // COPY GPU Data To CPU auto *dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get( - platform::Place(gpu_.place_))); + platform::DeviceContextPool::Instance().Get(gpu_->place())); auto stream = dev_ctx->stream(); - void *src = gpu_.data_; + void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_, - stream); + memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -277,8 +233,7 @@ class Vector { CopyCPUDataToCUDA(place); UnsetFlag(kDirty); SetFlag(kDataInCUDA); - } else if (IsInCUDA() && - !(boost::get(place) == gpu_.place_)) { + } else if (IsInCUDA() && !(place == gpu_->place())) { PADDLE_THROW("This situation should not happen"); // Still dirty } else { @@ -290,7 +245,7 @@ class Vector { // Even data is not dirty. However, data is not in CUDA. Copy data. CopyCPUDataToCUDA(place); SetFlag(kDataInCUDA); - } else if (!(boost::get(place) == gpu_.place_)) { + } else if (!(place == gpu_->place())) { PADDLE_THROW("This situation should not happen."); } else { // Not Dirty && DataInCUDA && Device is same @@ -301,13 +256,13 @@ class Vector { void CopyCPUDataToCUDA(const platform::Place &place) const { void *src = cpu_.data(); - gpu_.Resize(place, cpu_.size() * sizeof(T)); - void *dst = gpu_.data_; + gpu_ = memory::Alloc(place, cpu_.size() * sizeof(T)); + void *dst = gpu_->ptr(); auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_, - stream); + memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -329,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable details::CUDABuffer gpu_; + mutable std::unique_ptr gpu_; mutable int flag_; mutable std::mutex mtx_; @@ -428,8 +383,8 @@ class Vector { auto &mtx = m_.Data().Mutex(); std::lock_guard guard(mtx); auto cuda_place = m_.Data().CUDAPlace(); - if (cuda_place == nullptr || - *cuda_place == boost::get(place)) { + if (cuda_place == boost::none || + cuda_place == boost::get(place)) { return m_.Data().CUDAData(place); } } @@ -444,8 +399,8 @@ class Vector { auto &mtx = m_.Data().Mutex(); std::lock_guard guard(mtx); auto cuda_place = m_.Data().CUDAPlace(); - if (cuda_place == nullptr || - *cuda_place == boost::get(place)) { + if (cuda_place == boost::none || + cuda_place == boost::get(place)) { return m_.MutableData()->CUDAMutableData(place); } } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index b6ba0df033a..48d300eba95 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -33,9 +33,7 @@ size_t Tensor::memory_size() const { void* Tensor::mutable_data(platform::Place place, std::type_index type, size_t requested_size) { - if (holder_ != nullptr) { - holder_->set_type(type); - } + type_ = type; PADDLE_ENFORCE_GE(numel(), 0, "When calling this method, the Tensor's numel must be " "equal or larger than zero. " @@ -48,25 +46,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type, /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { - if (platform::is_cpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } else if (platform::is_gpu_place(place) || - platform::is_cuda_pinned_place(place)) { -#ifndef PADDLE_WITH_CUDA - PADDLE_THROW( - "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode."); - } -#else - if (platform::is_gpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } else if (platform::is_cuda_pinned_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } - } -#endif + holder_ = memory::AllocShared(place, size); offset_ = 0; } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + @@ -76,7 +56,7 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type, void* Tensor::mutable_data(platform::Place place, size_t requested_size) { PADDLE_ENFORCE(this->holder_ != nullptr, "Cannot invoke mutable data if current hold nothing."); - return mutable_data(place, holder_->type(), requested_size); + return mutable_data(place, type_, requested_size); } Tensor& Tensor::ShareDataWith(const Tensor& src) { @@ -101,6 +81,7 @@ Tensor Tensor::Slice(int begin_idx, int end_idx) const { Tensor dst; dst.holder_ = holder_; dst.set_layout(layout_); + dst.type_ = type_; DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; dst.Resize(dst_dims); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index f1d26854857..232b5a67a0a 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -67,12 +67,7 @@ class Tensor { friend struct EigenVector; public: - Tensor() : offset_(0) {} - - /*! Constructor with place should only be used in pybind. */ - explicit Tensor(const platform::Place& place) : offset_(0) { - holder_->set_place(place); - } + Tensor() : type_(typeid(float)), offset_(0) {} /*! Return a pointer to mutable memory block. */ template @@ -139,7 +134,7 @@ class Tensor { std::type_index type() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor not initialized yet when Tensor::type() is called."); - return holder_->type(); + return type_; } // memory size returns the holding memory size in byte. @@ -154,55 +149,9 @@ class Tensor { void clear() { holder_ = nullptr; } private: - /** - * @note Placeholder hides type T, so it doesn't appear as a template - * parameter of Variable. - */ - struct Placeholder { - virtual ~Placeholder() = default; - virtual void* ptr() const = 0; - virtual size_t size() const = 0; - virtual std::type_index type() const = 0; - virtual platform::Place place() const = 0; - virtual void set_type(std::type_index type) = 0; - virtual void set_place(platform::Place place) = 0; - }; - - template - struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(Place place, size_t size, std::type_index type) - : ptr_(static_cast(memory::Alloc(place, size)), - memory::PODDeleter(place)), - place_(place), - size_(size), - type_(type) { - PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", - (is_cpu_place(place_) ? "CPU" : "GPU")); - } - - virtual size_t size() const { return size_; } - virtual platform::Place place() const { return place_; } - virtual void* ptr() const { return static_cast(ptr_.get()); } - virtual std::type_index type() const { return type_; } - virtual void set_type(std::type_index type) { type_ = type; } - virtual void set_place(platform::Place place) { place_ = place; } - - /*! the pointer of memory block. */ - std::unique_ptr> ptr_; - - /*! the place of memory block. */ - platform::Place place_; - - /*! the size of memory block. */ - size_t size_; - - /* the current type of memory */ - std::type_index type_; - }; - /*! holds the memory block if allocated. */ - std::shared_ptr holder_; - + std::shared_ptr holder_; + std::type_index type_; /** * @brief points to elements dimensions. * diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 6d3047c95d6..dfa251c02da 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -23,10 +23,10 @@ namespace framework { template inline const T* Tensor::data() const { check_memory_size(); - bool valid = std::is_same::value || - holder_->type() == std::type_index(typeid(T)); + bool valid = + std::is_same::value || type_ == std::type_index(typeid(T)); PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", - this->holder_->type().name()); + type_.name()); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); @@ -37,10 +37,10 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } template inline T* Tensor::data() { check_memory_size(); - bool valid = std::is_same::value || - holder_->type() == std::type_index(typeid(T)); + bool valid = + std::is_same::value || type_ == std::type_index(typeid(T)); PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", - this->holder_->type().name()); + type_.name()); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 709fc7e12e1..bdf8325d150 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -1,15 +1,12 @@ add_subdirectory(detail) - -cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce) +add_subdirectory(allocation) +cc_library(malloc SRCS malloc.cc DEPS allocator_facade) cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memory DEPS malloc memcpy) - -cc_test(malloc_test SRCS malloc_test.cc DEPS malloc) - #if (WITH_GPU) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) #endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt new file mode 100644 index 00000000000..a932b164401 --- /dev/null +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -0,0 +1,43 @@ +cc_library(allocator SRCS allocator.cc DEPS place) +cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) +cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) +cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) +nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator gpu_info) + +if (WITH_GPU) + nv_test(best_fit_allocator_test + SRCS best_fit_allocator_test.cc + best_fit_allocator_test.cu + DEPS best_fit_allocator + locked_allocator + cpu_allocator + cuda_allocator + device_context + memcpy) +else() + cc_test(best_fit_allocator_test + SRCS best_fit_allocator_test.cc + DEPS best_fit_allocator + locked_allocator + cpu_allocator) +endif() + + +cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocator) +cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator) + +if (WITH_GPU) + set(AllocatorFacadeDeps gpu_info cuda_allocator) +else () + set(AllocatorFacadeDeps) +endif() + +cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) + +cc_library(allocator_facade SRCS allocator_facade.cc DEPS + ${AllocatorFacadeDeps} + cpu_allocator + locked_allocator + best_fit_allocator + naive_managed_allocator + aligned_allocator) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc new file mode 100644 index 00000000000..a805e19bc9f --- /dev/null +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/aligned_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +ThinAlignedAllocator::ThinAlignedAllocator( + std::shared_ptr underlyning_allocator) + : underlying_allocator_(std::move(underlyning_allocator)) {} +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h new file mode 100644 index 00000000000..d9eb7870c9b --- /dev/null +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +template +class AlignedAllocation : public Allocation { + public: + AlignedAllocation(std::unique_ptr&& underlying_allocation, + size_t size) + : Allocation(AlignedPtr(underlying_allocation->ptr()), size, + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)) {} + + private: + static void* AlignedPtr(void* ptr) { + auto ptr_addr = reinterpret_cast(ptr); + ptr_addr = (ptr_addr & ~(kAlignment - 1)) + kAlignment; + return reinterpret_cast(ptr_addr); + } + + std::unique_ptr underlying_allocation_; +}; + +class ThinAlignedAllocator : public ManagedAllocator { + public: + explicit ThinAlignedAllocator( + std::shared_ptr underlyning_allocator); + + protected: + std::shared_ptr underlying_allocator_; +}; + +template +class AlignedAllocator : public ThinAlignedAllocator { + public: + using ThinAlignedAllocator::ThinAlignedAllocator; + std::unique_ptr Allocate(size_t size, Attr attr) override { + auto raw_allocation = + underlying_allocator_->Allocate(size + kAlignment, attr); + return std::unique_ptr( + new AlignedAllocation(std::move(raw_allocation), size)); + } + std::shared_ptr AllocateShared(size_t size, Attr attr) override { + return std::shared_ptr(Allocate(size, attr).release()); + } +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc new file mode 100644 index 00000000000..8833b4e1cd6 --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator.h" +namespace paddle { +namespace memory { +namespace allocation { +Allocation::~Allocation() {} + +Allocator::~Allocator() {} + +bool Allocator::IsAllocThreadSafe() const { return false; } + +const char* BadAlloc::what() const noexcept { return msg_.c_str(); } + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h new file mode 100644 index 00000000000..500fc28645b --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class BadAlloc : public std::exception { + public: + explicit BadAlloc(const std::string& msg) : msg_(msg) {} + const char* what() const noexcept override; + + private: + std::string msg_; +}; + +class Allocation { + public: + Allocation(void* ptr, size_t size, platform::Place place) + : ptr_(ptr), size_(size), place_(place) {} + + Allocation(const Allocation& o) = delete; + Allocation& operator=(const Allocation& o) = delete; + + void* ptr() const { return ptr_; } + + size_t size() const { return size_; } + + const platform::Place& place() const { return place_; } + + virtual ~Allocation(); + + private: + void* ptr_; + size_t size_; + platform::Place place_; +}; + +class Allocator { + public: + enum Attr { + kDefault = 0, + kTiny = 1, + kFixedHuge = 2, + kFluxHuge = 3, + kTmp = 4, + NumOfAttrs = 5 + }; + + virtual ~Allocator(); + virtual std::unique_ptr Allocate( + size_t size, Allocator::Attr attr = kDefault) = 0; + + virtual bool IsAllocThreadSafe() const; +}; + +// User need to invoke `Free` or `FreeUniquePtr` manually if allocated by +// a manally managed allocator. +class UnmanagedAllocator : public Allocator { + public: + virtual void Free(Allocation* allocation) = 0; + + void FreeUniquePtr(std::unique_ptr allocation) { + Free(allocation.get()); + } +}; + +// The allocation will be managed by smart pointers +class ManagedAllocator : public Allocator { + public: + virtual std::shared_ptr AllocateShared( + size_t size, Allocator::Attr attr = kDefault) = 0; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc new file mode 100644 index 00000000000..fc508e75f1c --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator.h" +#include +#include +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include "paddle/fluid/memory/allocation/cpu_allocator.h" +#include "paddle/fluid/memory/allocation/locked_allocator.h" +#include "paddle/fluid/memory/allocation/naive_managed_allocator.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_allocator.h" +#endif + +namespace paddle { +namespace memory { +namespace allocation { + +class AllocatorFacadePrivate { + public: + std::map> allocators_; + std::vector> pre_allocations_; + std::vector> holding_allocators_; + + ~AllocatorFacadePrivate() { + // Specify destruct order. + pre_allocations_.clear(); + allocators_.clear(); + holding_allocators_.clear(); + } + + AllocatorFacadePrivate() { + InitCPUAllocator(); + InitCUDAAllocator(); + } + + private: + void InitCPUAllocator() { + auto all = NaiveManagedAllocator::Create( + std::unique_ptr(new CPUAllocator())); + + allocators_[platform::CPUPlace()] = all; + } + + void InitCUDAAllocator() { +#ifdef PADDLE_WITH_CUDA + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { + auto cuda_allocator = + NaiveManagedAllocator::Create(std::unique_ptr( + new CUDAAllocator(platform::CUDAPlace(dev_id)))); + + auto allocation = cuda_allocator->Allocate(platform::GpuMaxChunkSize()); + auto allocator = NaiveManagedAllocator::Create(std::unique_ptr( + new LockedAllocator(std::unique_ptr( + new BestFitAllocator(allocation.get()))))); + + pre_allocations_.emplace_back(std::move(allocation)); + holding_allocators_.emplace_back(cuda_allocator); + allocators_[platform::CUDAPlace(dev_id)] = + std::make_shared>(std::move(allocator)); + } +#endif + } +}; + +AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} +AllocatorFacade::~AllocatorFacade() { delete m_; } + +AllocatorFacade& AllocatorFacade::Instance() { + static AllocatorFacade instance; + return instance; +} + +std::shared_ptr AllocatorFacade::AllocShared( + const platform::Place& place, size_t size, Allocator::Attr attr) { + return m_->allocators_[place]->AllocateShared(size, attr); +} + +std::unique_ptr AllocatorFacade::Alloc(const platform::Place& place, + size_t size, + Allocator::Attr attr) { + return m_->allocators_[place]->Allocate(size, attr); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h new file mode 100644 index 00000000000..d780fb6e64b --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class AllocatorFacadePrivate; +class AllocatorFacade { + public: + ~AllocatorFacade(); + AllocatorFacade(const AllocatorFacade& o) = delete; + const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; + + static AllocatorFacade& Instance(); + + std::shared_ptr AllocShared( + const platform::Place& place, size_t size, + Allocator::Attr attr = Allocator::kDefault); + + std::unique_ptr Alloc(const platform::Place& place, size_t size, + Allocator::Attr attr = Allocator::kDefault); + + private: + AllocatorFacade(); + AllocatorFacadePrivate* m_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc new file mode 100644 index 00000000000..aa338f46756 --- /dev/null +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include +#include +#include +#include + +namespace paddle { +namespace memory { +namespace allocation { + +static int HighestBitPos(size_t N) { + if (UNLIKELY(N == 0)) { + return 0; + } else { + // NOTE: here we can use __builtin_clz in GCC. + // However, let's use std::log2 for better readability + // and trust std::log2's performance. + return static_cast(std::log2(N) + 1); + } +} + +BestFitAllocator::BestFitAllocator(Allocation* allocation) + : allocation_(allocation) { + details::Chunk chunk; + chunk.size_ = allocation_->size(); + chunk.offset_ = 0; + chunk.is_free = true; + chunks_.emplace_back(chunk); + free_chunks_[HighestBitPos(chunk.size_)].insert( + {chunk.size_, chunks_.begin()}); +} + +std::unique_ptr BestFitAllocator::Allocate(size_t size, Attr attr) { + auto highest_set_bit = static_cast(HighestBitPos(size)); + MapIt map_it; + for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) { + map_it = free_chunks_[highest_set_bit].lower_bound(size); + if (map_it != free_chunks_[highest_set_bit].end()) { + break; + } + } + if (UNLIKELY(highest_set_bit == free_chunks_.size())) { + throw BadAlloc(string::Sprintf( + "Cannot allocate %d, All fragments size is %d", size, FreeSize())); + } + auto chunk_it = SplitChunk(size, highest_set_bit, map_it); + return std::unique_ptr(new BestFitAllocation(this, chunk_it)); +} + +size_t BestFitAllocator::FreeSize() const { + size_t acc = 0; + for (auto& array_item : free_chunks_) { + for (auto& pair : array_item) { + acc += pair.second->size_; + } + } + return acc; +} + +BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size, + size_t free_chunk_offset, + MapIt bin_iterator) { + auto to_split_it = bin_iterator->second; + free_chunks_[free_chunk_offset].erase(bin_iterator); + + PADDLE_ENFORCE(to_split_it->is_free); + PADDLE_ENFORCE_GE(to_split_it->size_, request_size); + + auto remaining_size = to_split_it->size_ - request_size; + details::Chunk to_use; + details::Chunk remaining; + to_use.size_ = request_size; + to_use.is_free = false; + remaining.size_ = remaining_size; + remaining.is_free = true; + + // calc offsets + to_use.offset_ = to_split_it->offset_; + remaining.offset_ = to_use.offset_ + to_use.size_; + + // insert to chunk list + auto to_use_it = chunks_.insert(to_split_it, to_use); + if (remaining.size_ != 0) { + auto bit_size = static_cast(HighestBitPos(remaining.size_)); + free_chunks_[bit_size].insert( + {remaining.size_, chunks_.insert(to_split_it, remaining)}); + } + chunks_.erase(to_split_it); + return to_use_it; +} + +void BestFitAllocator::Free(Allocation* allocation) { + auto* bf_allocation = dynamic_cast(allocation); + auto chunk_it = bf_allocation->ChunkIterator(); + PADDLE_ENFORCE(!chunk_it->is_free); + chunk_it->is_free = true; + if (chunk_it != chunks_.begin()) { + auto prev_it = chunk_it; + --prev_it; + + if (prev_it->is_free) { + // Merge Left. + EraseFreeNode(prev_it); + prev_it->size_ += chunk_it->size_; + chunks_.erase(chunk_it); + chunk_it = prev_it; + } + } + + auto next_it = chunk_it; + ++next_it; + if (next_it != chunks_.end() && next_it->is_free) { + EraseFreeNode(next_it); + chunk_it->size_ += next_it->size_; + chunks_.erase(next_it); + } + + InsertFreeNode(chunk_it); +} + +void BestFitAllocator::InsertFreeNode(const ListIt& it) { + auto pos = static_cast(HighestBitPos(it->size_)); + auto& free_map = free_chunks_[pos]; + free_map.insert({it->size_, it}); +} +void BestFitAllocator::EraseFreeNode(const ListIt& it) { + size_t pos = static_cast(HighestBitPos(it->size_)); + auto& free_map = free_chunks_[pos]; + auto map_it = free_map.find(it->size_); + while (map_it->second != it && map_it != free_map.end()) { + ++map_it; + } + PADDLE_ENFORCE(map_it != free_map.end()); + free_map.erase(map_it); +} +size_t BestFitAllocator::NumFreeChunks() const { + size_t num = 0; + for (auto& array_item : free_chunks_) { + num += array_item.size(); + } + return num; +} + +BestFitAllocation::BestFitAllocation( + paddle::memory::allocation::BestFitAllocator* allocator, + typename details::ChunkList::iterator chunk_it) + : Allocation(reinterpret_cast( + reinterpret_cast(allocator->BasePtr()) + + chunk_it->offset_), + chunk_it->size_, allocator->Place()), + allocator_(allocator), + chunk_it_(chunk_it) {} +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h new file mode 100644 index 00000000000..309a2a77088 --- /dev/null +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -0,0 +1,132 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { +namespace details { +struct Chunk { + bool is_free{true}; + // Offset to the base allocation. + uintptr_t offset_; + size_t size_; +}; + +// Here we use std::list to maintain chunk list. +// NOTE(yy): The traditional implementation of ChunkList is add `prev`/`next` +// pointers in `Chunk`, and split the allocation as `ChunkHeader` and +// `Payload`. Such as +// *-------*---------------*---------------*--------------* +// | Chunk | prev_ pointer | next_ pointer | payload .... | +// *-------*---------------*---------------*--------------* +// This implementation can just return a raw pointer, and we can get the list +// structure by it. However, we cannot use the same code on GPU since CPU +// cannot access GPU memory directly. +// +// So we choose to use `std::list` and return an allocation instance, which +// contains the list node iterator, then we can unify CPU/GPU code. +// +// To return an allocation is not a bad idea, since Tensor/Vector should holds +// an allocation instead of raw pointer directly. +using ChunkList = std::list; + +// Here we use a multi-level map of free chunks. +// the map is +// MSB offset --> size --> [ChunkList::iterator] +// +// The time complexities: +// find a free chunk: +// O(logN), +// where N is the number of free nodes with the same MSB offset. +// find the position of a chunk iterator: +// O(logN + K), +// where N is the number of free nodes with the same MSB offset. +// where K is the number of free nodes with the same size. +// insert a free chunk: +// O(logN), +// where N is the number of free nodes with the same MSB offset. +// erase a free chunk: +// O(1) +using FreeChunkBin = + std::array, sizeof(size_t) * 8>; +} // namespace details + +class BestFitAllocator; + +// The BestFitAllocation maintain the List Node iterator. +class BestFitAllocation : public Allocation { + private: + using ListIt = typename details::ChunkList::iterator; + + public: + BestFitAllocation(BestFitAllocator* allocator, ListIt chunk_it); + + const ListIt& ChunkIterator() const { return chunk_it_; } + + private: + BestFitAllocator* allocator_; + typename details::ChunkList::iterator chunk_it_; +}; + +// TODO(yy): Current BestFitAllocator is not thread-safe. To make it thread +// safe, we must wrap a locked_allocator. However, we can implement a thread +// safe allocator by locking each bin and chunks list independently. It will +// make BestFitAllocator faster in multi-thread situation. +// +// This allocator implements a best-fit allocator with merging the free nodes. +// +// To allocate a buffer, it will find the best-fit chunk. If the best-fit chunk +// is larger than request size, the original block will be split into two +// chunks. The first block will be used and the second block will be put into +// free chunks. +// +// To free an allocation, it will set the chunk of allocation to free and merge +// the prev-chunk and the next-chunk when possible. +class BestFitAllocator : public UnmanagedAllocator { + public: + explicit BestFitAllocator(Allocation* allocation); + + void* BasePtr() const { return allocation_->ptr(); } + + const platform::Place& Place() const { return allocation_->place(); } + + std::unique_ptr Allocate(size_t size, + Attr attr = kDefault) override; + void Free(Allocation* allocation) override; + + size_t NumFreeChunks() const; + + private: + size_t FreeSize() const; + using MapIt = typename details::FreeChunkBin::value_type::iterator; + using ListIt = typename details::ChunkList::iterator; + + ListIt SplitChunk(size_t request_size, size_t free_chunk_offset, + MapIt bin_iterator); + void EraseFreeNode(const ListIt& it); + void InsertFreeNode(const ListIt& it); + + Allocation* allocation_; // not owned + details::ChunkList chunks_; + details::FreeChunkBin free_chunks_; +}; +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc new file mode 100644 index 00000000000..9af903a128d --- /dev/null +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include // NOLINT +#include +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/cpu_allocator.h" +#include "paddle/fluid/memory/allocation/locked_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class StubAllocation : public Allocation { + public: + explicit StubAllocation(size_t size) + : Allocation(0, size, platform::CPUPlace()) {} +}; + +TEST(BestFitAllocator, test_allocation) { + StubAllocation stub(4UL * 1024 * 1024 * 1024); + BestFitAllocator allocator(&stub); + { + auto allocation = allocator.Allocate(64); + allocator.FreeUniquePtr(std::move(allocation)); + } + + { + auto allocation = allocator.Allocate(80); + + { + auto best_fit_allocation = + dynamic_cast(allocation.get()); + ASSERT_NE(best_fit_allocation, nullptr); + ASSERT_FALSE(best_fit_allocation->ChunkIterator()->is_free); + ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0); + ASSERT_EQ(allocation->size(), 80); + ASSERT_EQ(allocation->ptr(), nullptr); + } + + auto allocation2 = allocator.Allocate(60); + auto allocation3 = allocator.Allocate(90); + allocator.FreeUniquePtr(std::move(allocation2)); + allocation2 = allocator.Allocate(30); + + { + auto best_fit_allocation = + dynamic_cast(allocation2.get()); + ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80); + } + allocator.FreeUniquePtr(std::move(allocation2)); + + allocation2 = allocator.Allocate(60); + + { + auto best_fit_allocation = + dynamic_cast(allocation2.get()); + ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80); + } + + allocator.FreeUniquePtr(std::move(allocation)); + allocator.FreeUniquePtr(std::move(allocation2)); + + allocation = allocator.Allocate(80 + 60); + { + auto best_fit_allocation = + dynamic_cast(allocation.get()); + ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0); + } + + allocator.FreeUniquePtr(std::move(allocation)); + + allocation = allocator.Allocate(80); + allocation2 = allocator.Allocate(60); + allocator.FreeUniquePtr(std::move(allocation)); + allocator.FreeUniquePtr(std::move(allocation3)); + allocator.FreeUniquePtr(std::move(allocation2)); + + ASSERT_EQ(allocator.NumFreeChunks(), 1U); + } +} + +TEST(BestFitAllocator, test_concurrent_cpu_allocation) { + CPUAllocator allocator; + auto global_allocation = allocator.Allocate(256UL * 1024 * 1024); + + std::unique_ptr best_fit_allocator( + new BestFitAllocator(global_allocation.get())); + + LockedAllocator locked_allocator(std::move(best_fit_allocator)); + + auto th_main = [&] { + std::random_device dev; + std::default_random_engine engine(dev()); + std::uniform_int_distribution dist(1U, 1024U); + + for (size_t i = 0; i < 128; ++i) { + size_t allocate_size = dist(engine); + + auto allocation = + locked_allocator.Allocate(sizeof(size_t) * allocate_size); + + size_t* data = reinterpret_cast(allocation->ptr()); + + for (size_t j = 0; j < allocate_size; ++j) { + data[j] = j; + } + std::this_thread::yield(); + + for (size_t j = 0; j < allocate_size; ++j) { + ASSERT_EQ(data[j], j); + } + + locked_allocator.FreeUniquePtr(std::move(allocation)); + } + }; + { + std::vector threads; + for (size_t i = 0; i < 1024; ++i) { + threads.emplace_back(th_main); + } + for (auto& th : threads) { + th.join(); + } + } + + allocator.FreeUniquePtr(std::move(global_allocation)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu new file mode 100644 index 00000000000..a3dcb8b2aef --- /dev/null +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -0,0 +1,88 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include // NOLINT +#include +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include "paddle/fluid/memory/allocation/cuda_allocator.h" +#include "paddle/fluid/memory/allocation/locked_allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/for_range.h" +namespace paddle { +namespace memory { +namespace allocation { + +struct ForEachFill { + size_t* ptr_; + + explicit ForEachFill(size_t* ptr) : ptr_(ptr) {} + + __device__ void operator()(size_t i) { ptr_[i] = i; } +}; + +TEST(BestFitAllocator, concurrent_cuda) { + CUDAAllocator allocator(platform::CUDAPlace(0)); + // 256 MB + auto cuda_allocation = allocator.Allocate(256U * 1024 * 1024); + LockedAllocator concurrent_allocator( + std::unique_ptr(new BestFitAllocator(cuda_allocation.get()))); + + auto th_main = [&] { + std::random_device dev; + std::default_random_engine engine(dev()); + std::uniform_int_distribution dist(1U, 1024U); + platform::CUDAPlace gpu(0); + platform::CUDADeviceContext dev_ctx(gpu); + std::array buf; + for (size_t i = 0; i < 128; ++i) { + size_t allocate_size = dist(engine); + + auto allocation = + concurrent_allocator.Allocate(sizeof(size_t) * allocate_size); + + size_t* data = reinterpret_cast(allocation->ptr()); + + ForEachFill fill(data); + platform::ForRange for_range(dev_ctx, + allocate_size); + for_range(fill); + + memory::Copy(platform::CPUPlace(), buf.data(), gpu, data, + sizeof(size_t) * allocate_size, dev_ctx.stream()); + + dev_ctx.Wait(); + for (size_t j = 0; j < allocate_size; ++j) { + ASSERT_EQ(buf[j], j); + } + + concurrent_allocator.FreeUniquePtr(std::move(allocation)); + } + }; + + { + std::vector threads; + for (size_t i = 0; i < 1024; ++i) { + threads.emplace_back(th_main); + } + for (auto& th : threads) { + th.join(); + } + } + allocator.FreeUniquePtr(std::move(cuda_allocation)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc new file mode 100644 index 00000000000..3133627bf72 --- /dev/null +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/cpu_allocator.h" +#include +#include + +namespace paddle { +namespace memory { +namespace allocation { + +std::unique_ptr CPUAllocator::Allocate(size_t size, Attr attr) { + void* ptr; + auto status = posix_memalign(&ptr, kAlignment, size); + if (UNLIKELY(status) != 0) { + throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d", + size, status)); + } + return std::unique_ptr(new CPUAllocation(ptr, size)); +} +void CPUAllocator::Free(Allocation* allocation) { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast(allocation)); + free(allocation->ptr()); +} + +bool CPUAllocator::IsAllocThreadSafe() const { return true; } +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h new file mode 100644 index 00000000000..e3f35685d7e --- /dev/null +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CPUAllocation : public Allocation { + public: + CPUAllocation(void* ptr, size_t size) + : Allocation(ptr, size, platform::CPUPlace()) {} +}; + +class CPUAllocator : public UnmanagedAllocator { + public: + constexpr static size_t kAlignment = 64u; + std::unique_ptr Allocate(size_t size, + Attr attr = kDefault) override; + void Free(Allocation* allocation) override; + bool IsAllocThreadSafe() const override; +}; +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc new file mode 100644 index 00000000000..14e08683321 --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/cuda_allocator.h" +#include +#include +#include +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CUDADeviceGuard { + public: + explicit CUDADeviceGuard(int dev_id) { + int prev_id = platform::GetCurrentDeviceId(); + if (prev_id != dev_id) { + prev_id_ = prev_id; + platform::SetDeviceId(dev_id); + } + } + + ~CUDADeviceGuard() { + if (prev_id_ != -1) { + platform::SetDeviceId(prev_id_); + } + } + + private: + int prev_id_{-1}; +}; + +std::unique_ptr CUDAAllocator::Allocate(size_t size, Attr attr) { + CUDADeviceGuard guard(place_.device); + void* ptr; + auto status = cudaMalloc(&ptr, size); + if (UNLIKELY(status != cudaSuccess)) { + throw BadAlloc(string::Sprintf( + "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, + status, cudaGetErrorString(status))); + } + + return std::unique_ptr( + new CUDAAllocation(ptr, size, platform::Place(place_))); +} + +void CUDAAllocator::Free(Allocation* allocation) { + auto* cuda_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(cuda_allocation); + PADDLE_ENFORCE_EQ(boost::get(cuda_allocation->place()), + place_); + PADDLE_ENFORCE(cudaFree(allocation->ptr())); +} +bool CUDAAllocator::IsAllocThreadSafe() const { return true; } +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h new file mode 100644 index 00000000000..4bd4c00f976 --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +// Just a flag type. +class CUDAAllocation : public Allocation { + public: + using Allocation::Allocation; +}; + +class CUDAAllocator : public UnmanagedAllocator { + public: + explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {} + explicit CUDAAllocator(const platform::Place& place) + : place_(boost::get(place)) {} + std::unique_ptr Allocate(size_t size, + Attr attr = kDefault) override; + void Free(Allocation* allocation) override; + bool IsAllocThreadSafe() const override; + + private: + platform::CUDAPlace place_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc new file mode 100644 index 00000000000..1e0febe10bb --- /dev/null +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/locked_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +std::unique_ptr LockedAllocator::Allocate(size_t size, Attr attr) { + if (underlying_allocator_->IsAllocThreadSafe()) { + return underlying_allocator_->Allocate(size, attr); + } else { + std::lock_guard guard(mtx_); + return underlying_allocator_->Allocate(size, attr); + } +} +void LockedAllocator::Free(Allocation *allocation) { + if (underlying_allocator_->IsAllocThreadSafe()) { + return underlying_allocator_->Free(allocation); + } else { + std::lock_guard guard(mtx_); + return underlying_allocator_->Free(allocation); + } +} +bool LockedAllocator::IsAllocThreadSafe() const { return true; } + +LockedAllocator::LockedAllocator( + std::unique_ptr &&underlying_allocator) { + auto *allocator = + dynamic_cast(underlying_allocator.get()); + PADDLE_ENFORCE_NOT_NULL(allocator); + underlying_allocator.release(); + underlying_allocator_.reset(allocator); +} +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h new file mode 100644 index 00000000000..eed263f3bc5 --- /dev/null +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class LockedAllocator : public UnmanagedAllocator { + public: + explicit LockedAllocator(std::unique_ptr&& underlying_allocator); + std::unique_ptr Allocate(size_t size, + Attr attr = kDefault) override; + void Free(Allocation* allocation) override; + bool IsAllocThreadSafe() const override; + + private: + std::unique_ptr underlying_allocator_; + std::mutex mtx_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_managed_allocator.cc b/paddle/fluid/memory/allocation/naive_managed_allocator.cc new file mode 100644 index 00000000000..2a61aee8433 --- /dev/null +++ b/paddle/fluid/memory/allocation/naive_managed_allocator.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/naive_managed_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +NaiveManagedAllocator::NaiveManagedAllocator( + std::unique_ptr &&allocator) { + auto *underlying_allocator = + dynamic_cast(allocator.get()); + PADDLE_ENFORCE_NOT_NULL(underlying_allocator); + allocator.release(); + Init(std::unique_ptr(underlying_allocator)); +} + +NaiveManagedAllocator::NaiveManagedAllocator( + std::unique_ptr &&allocator) { + Init(std::move(allocator)); +} +void NaiveManagedAllocator::Init( + std::unique_ptr &&allocator) { + underlying_allocator_ = std::move(allocator); +} +bool NaiveManagedAllocator::IsAllocThreadSafe() const { + return underlying_allocator_->IsAllocThreadSafe(); +} +std::unique_ptr NaiveManagedAllocator::Allocate(size_t size, + Attr attr) { + std::unique_ptr allocation = + underlying_allocator_->Allocate(size, attr); + return std::unique_ptr( + new NaiveManagedAllocation(std::move(allocation), shared_from_this())); +} +std::shared_ptr NaiveManagedAllocator::AllocateShared(size_t size, + Attr attr) { + std::unique_ptr allocation = + underlying_allocator_->Allocate(size, attr); + return std::shared_ptr( + new NaiveManagedAllocation(std::move(allocation), shared_from_this())); +} + +NaiveManagedAllocation::~NaiveManagedAllocation() { + auto allocator = allocator_.lock(); + if (UNLIKELY(allocator == nullptr)) { + // the allocator is destructed before allocations. + // do nothing. + return; + } + // invoke Free + allocator->UnderlyingAllocator().FreeUniquePtr( + std::move(underlying_allocation_)); +} +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_managed_allocator.h b/paddle/fluid/memory/allocation/naive_managed_allocator.h new file mode 100644 index 00000000000..3291eeaadb6 --- /dev/null +++ b/paddle/fluid/memory/allocation/naive_managed_allocator.h @@ -0,0 +1,71 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class NaiveManagedAllocator; +class NaiveManagedAllocation : public Allocation { + public: + NaiveManagedAllocation(std::unique_ptr&& underlying_allocation, + std::shared_ptr allocator) + : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)), + allocator_(allocator) {} + + ~NaiveManagedAllocation() final; + + private: + std::unique_ptr underlying_allocation_; + std::weak_ptr allocator_; +}; + +class NaiveManagedAllocator + : public ManagedAllocator, + public std::enable_shared_from_this { + public: + template + static std::shared_ptr Create(ARGS... args) { + return std::static_pointer_cast( + std::shared_ptr( + new NaiveManagedAllocator(std::move(args)...))); + } + + inline UnmanagedAllocator& UnderlyingAllocator() { + return *underlying_allocator_; + } + + bool IsAllocThreadSafe() const override; + std::unique_ptr Allocate(size_t size, + Attr attr = kDefault) override; + std::shared_ptr AllocateShared(size_t size, + Attr attr = kDefault) override; + + private: + explicit NaiveManagedAllocator(std::unique_ptr&& allocator); + explicit NaiveManagedAllocator( + std::unique_ptr&& allocator); + void Init(std::unique_ptr&& allocator); + + std::unique_ptr underlying_allocator_; +}; +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_managed_allocator_test.cc b/paddle/fluid/memory/allocation/naive_managed_allocator_test.cc new file mode 100644 index 00000000000..027fdec26de --- /dev/null +++ b/paddle/fluid/memory/allocation/naive_managed_allocator_test.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/naive_managed_allocator.h" +#include // NOLINT +#include +#include // NOLINT +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class StubAllocator : public UnmanagedAllocator { + public: + std::unique_ptr Allocate(size_t size, + Attr attr = kDefault) override { + counter_.fetch_add(1); + return std::unique_ptr( + new Allocation(nullptr, size, platform::CPUPlace())); + } + void Free(Allocation* allocation) override { counter_.fetch_sub(1); } + bool IsAllocThreadSafe() const override { return true; } + + std::atomic counter_{0}; +}; + +TEST(NaiveManagedAllocator, main) { + auto allocator = NaiveManagedAllocator::Create( + std::unique_ptr(new StubAllocator())); + + auto th_main = [=] { + std::random_device dev; + std::default_random_engine engine(dev()); + std::uniform_int_distribution dist(0, 1); + + std::vector> allocations; + + for (int j = 0; j < 1024; ++j) { + bool to_insert = static_cast(dist(engine)); + if (to_insert) { + allocations.emplace_back(allocator->AllocateShared(10)); + } else { + if (!allocations.empty()) { + allocations.pop_back(); + } + } + } + }; + + { + std::vector threads; + for (size_t i = 0; i < 1024; ++i) { + threads.emplace_back(th_main); + } + for (auto& th : threads) { + th.join(); + } + } + ASSERT_EQ(reinterpret_cast( + std::dynamic_pointer_cast(allocator) + ->UnderlyingAllocator()) + .counter_, + 0); +} +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 283745e9775..4f289f75379 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -14,13 +14,9 @@ limitations under the License. */ #include -#include "paddle/fluid/memory/malloc.h" - #include "glog/logging.h" - -#include "paddle/fluid/memory/detail/buddy_allocator.h" -#include "paddle/fluid/memory/detail/system_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/malloc.h" DEFINE_bool(init_allocated_mem, false, "It is a mistake that the values of the memory allocated by " @@ -33,172 +29,14 @@ DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { namespace memory { -using BuddyAllocator = detail::BuddyAllocator; - -BuddyAllocator* GetCPUBuddyAllocator() { - static std::once_flag init_flag; - static detail::BuddyAllocator* a = nullptr; - - std::call_once(init_flag, []() { - a = new detail::BuddyAllocator( - std::unique_ptr(new detail::CPUAllocator), - platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); - }); - - return a; -} - -template <> -void* Alloc(platform::CPUPlace place, size_t size) { - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); - void* p = GetCPUBuddyAllocator()->Alloc(size); - if (FLAGS_init_allocated_mem) { - memset(p, 0xEF, size); - } - VLOG(10) << " pointer=" << p; - return p; -} - -template <> -void Free(platform::CPUPlace place, void* p) { - VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - GetCPUBuddyAllocator()->Free(p); -} - -template <> -size_t Used(platform::CPUPlace place) { - return GetCPUBuddyAllocator()->Used(); -} - -#ifdef PADDLE_WITH_CUDA - -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static std::once_flag init_flag; - static detail::BuddyAllocator** a_arr = nullptr; - - std::call_once(init_flag, [gpu_id]() { - int gpu_num = platform::GetCUDADeviceCount(); - PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id, - gpu_num); - - a_arr = new BuddyAllocator*[gpu_num]; - for (int i = 0; i < gpu_num; i++) { - a_arr[i] = nullptr; - platform::SetDeviceId(i); - a_arr[i] = new BuddyAllocator( - std::unique_ptr(new detail::GPUAllocator(i)), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); - - VLOG(10) << "\n\nNOTE: each GPU device use " - << FLAGS_fraction_of_gpu_memory_to_use * 100 - << "% of GPU memory.\n" - << "You can set GFlags environment variable '" - << "FLAGS_fraction_of_gpu_memory_to_use" - << "' to change the fraction of GPU usage.\n\n"; - } - }); - - platform::SetDeviceId(gpu_id); - return a_arr[gpu_id]; -} - -template <> -size_t Used(platform::CUDAPlace place) { - return GetGPUBuddyAllocator(place.device)->Used(); +std::shared_ptr AllocShared(const platform::Place& place, + size_t size, Allocator::Attr attr) { + return allocation::AllocatorFacade::Instance().AllocShared(place, size, attr); } -template <> -void* Alloc(platform::CUDAPlace place, size_t size) { - auto* buddy_allocator = GetGPUBuddyAllocator(place.device); - auto* ptr = buddy_allocator->Alloc(size); - if (ptr == nullptr) { - int cur_dev = platform::GetCurrentDeviceId(); - platform::SetDeviceId(place.device); - size_t avail, total; - platform::GpuMemoryUsage(&avail, &total); - LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " - << place.device << ", available " << avail << " bytes"; - LOG(WARNING) << "total " << total; - LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize(); - LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize(); - LOG(WARNING) << "GPU memory used: " << Used(place); - platform::SetDeviceId(cur_dev); - } - if (FLAGS_init_allocated_mem) { - cudaMemset(ptr, 0xEF, size); - } - return ptr; +std::unique_ptr Alloc(const platform::Place& place, size_t size, + Allocator::Attr attr) { + return allocation::AllocatorFacade::Instance().Alloc(place, size, attr); } - -template <> -void Free(platform::CUDAPlace place, void* p) { - GetGPUBuddyAllocator(place.device)->Free(p); -} - -BuddyAllocator* GetCUDAPinnedBuddyAllocator() { - static std::once_flag init_flag; - static BuddyAllocator* ba = nullptr; - - std::call_once(init_flag, []() { - ba = new BuddyAllocator(std::unique_ptr( - new detail::CUDAPinnedAllocator), - platform::CUDAPinnedMinChunkSize(), - platform::CUDAPinnedMaxChunkSize()); - }); - - return ba; -} - -template <> -size_t Used(platform::CUDAPinnedPlace place) { - return GetCUDAPinnedBuddyAllocator()->Used(); -} - -template <> -void* Alloc(platform::CUDAPinnedPlace place, - size_t size) { - auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(); - void* ptr = buddy_allocator->Alloc(size); - - if (ptr == nullptr) { - LOG(WARNING) << "cudaMallocHost Cannot allocate " << size - << " bytes in CUDAPinnedPlace"; - } - if (FLAGS_init_allocated_mem) { - memset(ptr, 0xEF, size); - } - return ptr; -} - -template <> -void Free(platform::CUDAPinnedPlace place, void* p) { - GetCUDAPinnedBuddyAllocator()->Free(p); -} -#endif - -size_t Usage::operator()(const platform::CPUPlace& cpu) const { - return Used(cpu); -} - -size_t Usage::operator()(const platform::CUDAPlace& gpu) const { -#ifdef PADDLE_WITH_CUDA - return Used(gpu); -#else - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); -#endif -} - -size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const { -#ifdef PADDLE_WITH_CUDA - return Used(cuda_pinned); -#else - PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device."); -#endif -} - -size_t memory_usage(const platform::Place& p) { - return boost::apply_visitor(Usage(), p); -} - } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 3e6bfddd69c..061ca97dd8e 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -14,91 +14,21 @@ limitations under the License. */ #pragma once +#include +#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" - namespace paddle { namespace memory { +using allocation::Allocation; +using allocation::Allocator; -/** - * \brief Allocate memory block in one place. - * - * \param[in] place Allocation place (CPU or GPU). - * \param[in] size Allocation size. - * - * \return Allocated memory block address. - * - * \note If return nullptr, it indicates memory allocation failed - * because insufficient memory in current system. When Alloc - * function is invoked, you must check the returned memory - * address is valid or not. - */ -template -void* Alloc(Place place, size_t size); - -/** - * \brief Free memory block in one place. - * - * \param[in] place Allocation place (CPU or GPU). - * \param[in] ptr Memory block address to free. - * - */ -template -void Free(Place place, void* ptr); - -/** - * \brief Total size of used memory in one place. - * - * \param[in] place Allocation place (CPU or GPU). - * - */ -template -size_t Used(Place place); - -struct Usage : public boost::static_visitor { - size_t operator()(const platform::CPUPlace& cpu) const; - size_t operator()(const platform::CUDAPlace& gpu) const; - size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const; -}; - -size_t memory_usage(const platform::Place& p); - -/** - * \brief Free memory block in one place. - * - * \note In some cases, custom deleter is used to - * deallocate the memory automatically for - * std::unique_ptr in tensor.h. - * - */ -template -class PODDeleter { - static_assert(std::is_pod::value, "T must be POD"); - - public: - explicit PODDeleter(Place place) : place_(place) {} - void operator()(T* ptr) { Free(place_, static_cast(ptr)); } - - private: - Place place_; -}; - -/** - * \brief Free memory block in one place does not meet POD - * - * \note In some cases, custom deleter is used to - * deallocate the memory automatically for - * std::unique_ptr in tensor.h. - * - */ -template -class PlainDeleter { - public: - explicit PlainDeleter(Place place) : place_(place) {} - void operator()(T* ptr) { Free(place_, reinterpret_cast(ptr)); } +extern std::shared_ptr AllocShared( + const platform::Place& place, size_t size, + Allocator::Attr attr = Allocator::kDefault); - private: - Place place_; -}; +extern std::unique_ptr Alloc( + const platform::Place& place, size_t size, + Allocator::Attr attr = Allocator::kDefault); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc_test.cc b/paddle/fluid/memory/malloc_test.cc deleted file mode 100644 index d39466ef60c..00000000000 --- a/paddle/fluid/memory/malloc_test.cc +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/memory/malloc.h" - -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/memory/detail/memory_block.h" -#include "paddle/fluid/platform/cpu_info.h" -#include "paddle/fluid/platform/gpu_info.h" -#include "paddle/fluid/platform/place.h" - -inline bool is_aligned(void const *p) { - return 0 == (reinterpret_cast(p) & 0x3); -} - -size_t align(size_t size, paddle::platform::CPUPlace place) { - size += sizeof(paddle::memory::detail::MemoryBlock::Desc); - size_t alignment = paddle::platform::CpuMinChunkSize(); - size_t remaining = size % alignment; - return remaining == 0 ? size : size + (alignment - remaining); -} - -TEST(BuddyAllocator, CPUAllocation) { - void *p = nullptr; - - EXPECT_EQ(p, nullptr); - - paddle::platform::CPUPlace cpu; - p = paddle::memory::Alloc(cpu, 4096); - - EXPECT_NE(p, nullptr); - - paddle::platform::Place place = cpu; - EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place)); - - paddle::memory::Free(cpu, p); -} - -TEST(BuddyAllocator, CPUMultAlloc) { - paddle::platform::CPUPlace cpu; - - std::unordered_map ps; - - size_t total_size = paddle::memory::Used(cpu); - EXPECT_EQ(total_size, 0UL); - - for (auto size : - {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { - ps[paddle::memory::Alloc(cpu, size)] = size; - - // Buddy Allocator doesn't manage too large memory chunk - if (paddle::memory::Used(cpu) == total_size) continue; - - size_t aligned_size = align(size, cpu); - total_size += aligned_size; - EXPECT_EQ(total_size, paddle::memory::Used(cpu)); - } - - for (auto p : ps) { - EXPECT_EQ(is_aligned(p.first), true); - paddle::memory::Free(cpu, p.first); - - // Buddy Allocator doesn't manage too large memory chunk - if (paddle::memory::Used(cpu) == total_size) continue; - - size_t aligned_size = align(p.second, cpu); - total_size -= aligned_size; - EXPECT_EQ(total_size, paddle::memory::Used(cpu)); - } -} - -#ifdef PADDLE_WITH_CUDA - -size_t align(size_t size, paddle::platform::CUDAPlace place) { - size += sizeof(paddle::memory::detail::MemoryBlock::Desc); - size_t alignment = paddle::platform::GpuMinChunkSize(); - size_t remaining = size % alignment; - return remaining == 0 ? size : size + (alignment - remaining); -} - -TEST(BuddyAllocator, GPUAllocation) { - void *p = nullptr; - - EXPECT_EQ(p, nullptr); - - paddle::platform::CUDAPlace gpu(0); - p = paddle::memory::Alloc(gpu, 4096); - - EXPECT_NE(p, nullptr); - - paddle::platform::Place place = gpu; - EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place)); - - paddle::memory::Free(gpu, p); -} - -TEST(BuddyAllocator, GPUMultAlloc) { - paddle::platform::CUDAPlace gpu; - - std::unordered_map ps; - - size_t total_size = paddle::memory::Used(gpu); - EXPECT_EQ(total_size, 0UL); - - for (auto size : - {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { - ps[paddle::memory::Alloc(gpu, size)] = size; - - // Buddy Allocator doesn't manage too large memory chunk - if (paddle::memory::Used(gpu) == total_size) continue; - - size_t aligned_size = align(size, gpu); - total_size += aligned_size; - EXPECT_EQ(total_size, paddle::memory::Used(gpu)); - } - - for (auto p : ps) { - EXPECT_EQ(is_aligned(p.first), true); - paddle::memory::Free(gpu, p.first); - - // Buddy Allocator doesn't manage too large memory chunk - if (paddle::memory::Used(gpu) == total_size) continue; - - size_t aligned_size = align(p.second, gpu); - total_size -= aligned_size; - EXPECT_EQ(total_size, paddle::memory::Used(gpu)); - } -} - -size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) { - size += sizeof(paddle::memory::detail::MemoryBlock::Desc); - size_t alignment = paddle::platform::CUDAPinnedMinChunkSize(); - size_t remaining = size % alignment; - return remaining == 0 ? size : size + (alignment - remaining); -} - -TEST(BuddyAllocator, CUDAPinnedAllocator) { - void *p = nullptr; - - EXPECT_EQ(p, nullptr); - - paddle::platform::CUDAPinnedPlace cpu; - p = paddle::memory::Alloc(cpu, 4096); - - EXPECT_NE(p, nullptr); - - paddle::platform::Place place = cpu; - EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place)); - - paddle::memory::Free(cpu, p); -} - -TEST(BuddyAllocator, CUDAPinnedMultAllocator) { - paddle::platform::CUDAPinnedPlace cpu; - - std::unordered_map ps; - - size_t total_size = paddle::memory::Used(cpu); - EXPECT_EQ(total_size, 0UL); - - for (auto size : - {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { - ps[paddle::memory::Alloc(cpu, size)] = size; - - // Buddy Allocator doesn't manage too large memory chunk - if (paddle::memory::Used(cpu) == total_size) continue; - - size_t aligned_size = align(size, cpu); - total_size += aligned_size; - EXPECT_EQ(total_size, paddle::memory::Used(cpu)); - } - - for (auto p : ps) { - EXPECT_EQ(is_aligned(p.first), true); - paddle::memory::Free(cpu, p.first); - - // Buddy Allocator doesn't manage too large memory chunk - if (paddle::memory::Used(cpu) == total_size) continue; - - size_t aligned_size = align(p.second, cpu); - total_size -= aligned_size; - EXPECT_EQ(total_size, paddle::memory::Used(cpu)); - } -} -#endif diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 6146ff509d7..d1d86e561c0 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include "cub/cub.cuh" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/gather.cu.h" @@ -57,22 +58,18 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value, T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); // Determine temporary device storage requirements - void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( - d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, - num); - + nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); // Allocate temporary storage auto place = boost::get(ctx.GetPlace()); - d_temp_storage = memory::Alloc(place, temp_storage_bytes); + auto d_temp_storage = + memory::Alloc(place, temp_storage_bytes, memory::Allocator::kTmp); // Run sorting operation cub::DeviceRadixSort::SortPairsDescending( - d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, - num); - - memory::Free(place, d_temp_storage); + d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, + idx_out, num); } template @@ -248,11 +245,12 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, const T *boxes = proposals.data(); auto place = boost::get(ctx.GetPlace()); int size_bytes = boxes_num * col_blocks * sizeof(uint64_t); - uint64_t *d_mask = - reinterpret_cast(memory::Alloc(place, size_bytes)); + auto d_mask_allocation = memory::Alloc(place, size_bytes); + uint64_t *d_mask = reinterpret_cast(d_mask_allocation->ptr()); NMSKernel<<>>(boxes_num, nms_threshold, boxes, d_mask); - uint64_t *h_mask = reinterpret_cast( - memory::Alloc(platform::CPUPlace(), size_bytes)); + + auto h_mask_allocation = memory::Alloc(platform::CPUPlace(), size_bytes); + uint64_t *h_mask = reinterpret_cast(h_mask_allocation->ptr()); memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0); std::vector remv(col_blocks); diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index a6ca82d16f2..3a450773a9d 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -87,13 +87,16 @@ TEST(StridedMemcpy, GPUCrop) { platform::CUDADeviceContext ctx(gpu0); - int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); + auto src_allocation = memory::Alloc(gpu0, sizeof(src)); + + int* gpu_src = reinterpret_cast(src_allocation->ptr()); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); framework::DDim src_stride({5, 1}); int dst[4]; - int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); + auto dst_allocation = memory::Alloc(gpu0, sizeof(dst)); + int* gpu_dst = reinterpret_cast(dst_allocation->ptr()); framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({2, 1}); @@ -108,9 +111,6 @@ TEST(StridedMemcpy, GPUCrop) { ASSERT_EQ(2, dst[1]); ASSERT_EQ(3, dst[2]); ASSERT_EQ(4, dst[3]); - - memory::Free(gpu0, gpu_dst); - memory::Free(gpu0, gpu_src); } TEST(StridedMemcpy, GPUConcat) { @@ -124,12 +124,13 @@ TEST(StridedMemcpy, GPUConcat) { platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); - - int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); + auto gpu_src_allocation = memory::Alloc(gpu0, sizeof(src)); + int* gpu_src = reinterpret_cast(gpu_src_allocation->ptr()); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); int dst[8]; - int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); + auto gpu_dst_allocation = memory::Alloc(gpu0, sizeof(dst)); + int* gpu_dst = reinterpret_cast(gpu_dst_allocation->ptr()); framework::DDim src_stride({2, 1}); framework::DDim dst_dim({2, 2}); @@ -151,9 +152,6 @@ TEST(StridedMemcpy, GPUConcat) { for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) { ASSERT_EQ(expect_dst[i], dst[i]); } - - memory::Free(gpu0, gpu_dst); - memory::Free(gpu0, gpu_src); } #endif diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index dfc079e986e..0b97f5123a8 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -112,11 +112,15 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } void* allocate(size_t num_bytes) const override { - return paddle::memory::Alloc(place_, num_bytes); + auto buf = + paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kTiny); + void* retv = buf->ptr(); + allocations_[buf->ptr()] = std::move(buf); + return retv; } void deallocate(void* buffer) const override { - paddle::memory::Free(place_, buffer); + allocations_.erase(allocations_.find(buffer)); } void* scratchpad() const override { @@ -143,12 +147,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { const cudaDeviceProp* device_prop_; // not owned; mutable void* scratch_; mutable unsigned int* semaphore_; + mutable std::unordered_map> + allocations_; }; class CudnnHolder { public: CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place) - : workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) { + : workspace_(nullptr), stream_(stream), place_(place) { PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); } @@ -158,36 +164,38 @@ class CudnnHolder { void RunFunc(const std::function& cudnn_func, size_t required_workspace_len) { std::lock_guard lock(mtx_); - if (required_workspace_len > workspace_len_) { + if (required_workspace_len > WorkspaceSize()) { ReallocateWorkspace(required_workspace_len); } - cudnn_func(workspace_); + cudnn_func(workspace_->ptr()); } - ~CudnnHolder() { - PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); - if (workspace_ != nullptr) { - paddle::memory::Free(place_, workspace_); + ~CudnnHolder() { PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); } + + private: + size_t WorkspaceSize() const { + if (workspace_ == nullptr) { + return 0; + } else { + return workspace_->size(); } } - private: void ReallocateWorkspace(size_t required_workspace_len) { - if (required_workspace_len <= workspace_len_) { + if (required_workspace_len <= WorkspaceSize()) { return; } if (workspace_ != nullptr) { // Maybe someone is using the current workspace PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); - paddle::memory::Free(place_, workspace_); + workspace_.reset(); } - workspace_ = paddle::memory::Alloc(place_, required_workspace_len); - workspace_len_ = required_workspace_len; + workspace_ = paddle::memory::Alloc(place_, required_workspace_len, + memory::Allocator::kFluxHuge); } cudnnHandle_t cudnn_handle_; - void* workspace_; - size_t workspace_len_; + std::unique_ptr workspace_; const cudaStream_t* stream_; // not owned; const CUDAPlace place_; diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index f65d1f60100..07433a151ce 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -39,7 +39,6 @@ class Multiply { } // namespace using paddle::memory::Alloc; -using paddle::memory::Free; using paddle::memory::Copy; using paddle::platform::CPUPlace; @@ -63,13 +62,13 @@ TEST(Transform, GPUUnary) { CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; - float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); + auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); + float* gpu_buf = static_cast(gpu_allocation->ptr()); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); - Free(gpu0, gpu_buf); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(cpu_buf[i], static_cast(i + 1), 1e-5); } @@ -89,13 +88,13 @@ TEST(Transform, GPUBinary) { int buf[4] = {1, 2, 3, 4}; CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); - int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); + auto gpu_allocation = Alloc(gpu0, sizeof(buf)); + int* gpu_buf = static_cast(gpu_allocation->ptr()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); ctx.Wait(); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); - Free(gpu0, gpu_buf); for (int i = 0; i < 4; ++i) { ASSERT_EQ((i + 1) * (i + 1), buf[i]); } diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index dc9fad29f28..86c5f87f34d 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -41,4 +41,5 @@ limitations under the License. */ #include #include #include +#include #include diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index cfea2059c3c..b18bd70005c 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -27,8 +27,7 @@ int main(int argc, char** argv) { new_argv.push_back(argv[i]); } #ifdef PADDLE_WITH_CUDA - new_argv.push_back( - strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); + new_argv.push_back(strdup("--tryfromenv=fraction_of_gpu_memory_to_use")); #else new_argv.push_back(strdup( "--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb")); @@ -37,12 +36,6 @@ int main(int argc, char** argv) { int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); - paddle::memory::Used(paddle::platform::CPUPlace()); - -#ifdef PADDLE_WITH_CUDA - paddle::memory::Used(paddle::platform::CUDAPlace(0)); -#endif - paddle::framework::InitDevices(true); return RUN_ALL_TESTS(); } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7bbdf7de89c..f0032ab0fae 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -110,10 +110,10 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb' + 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', + 'use_mkldnn', 'initial_cpu_memory_in_mb', 'init_allocated_mem', + 'paddle_num_threads', "dist_threadpool_size", 'cpu_deterministic', + 'eager_delete_tensor_gb' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') -- GitLab