From 169022d0148a77cd10f16a82e841a75750e7e173 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Jun 2017 14:04:47 +0800 Subject: [PATCH] FIX: Improve fallback gpu allocator --- paddle/memory/detail/CMakeLists.txt | 4 +- paddle/memory/detail/system_allocator.cc | 64 ++++++++++++++----- paddle/memory/detail/system_allocator.h | 15 +++-- paddle/memory/detail/system_allocator_test.cc | 14 ++-- paddle/platform/CMakeLists.txt | 4 ++ paddle/platform/cpu_info_test.cc | 18 ++++++ paddle/platform/cuda.h | 40 ------------ 7 files changed, 85 insertions(+), 74 deletions(-) create mode 100644 paddle/platform/cpu_info_test.cc delete mode 100644 paddle/platform/cuda.h diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 72d3749ad78..6caa97a76bb 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,6 +1,8 @@ if(${WITH_GPU}) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) - nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) + nv_test(system_allocator_test + SRCS system_allocator_test.cc + DEPS system_allocator gpu_info gflags) else(${WITH_GPU}) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 50bec926f83..332ff062d47 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -13,32 +13,39 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/error.h" +#include "paddle/platform/gpu_info.h" #include // for malloc and free #include // for mlock and munlock #include "gflags/gflags.h" -#include "paddle/platform/assert.h" -#include "paddle/platform/cuda.h" // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. -DEFINE_bool(use_pinned_memory, false, - "If set, allocate cpu/gpu pinned memory."); +DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory."); namespace paddle { namespace memory { namespace detail { -void* CPUAllocator::Alloc(size_t size) { +void* CPUAllocator::Alloc(size_t& index, size_t size) { // According to http://www.cplusplus.com/reference/cstdlib/malloc/, // malloc might not return nullptr if size is zero, but the returned // pointer shall not be dereferenced -- so we make it nullptr. if (size <= 0) return nullptr; + if (FLAGS_use_pinned_memory) { + void* p = malloc(size); + if (p != nullptr) { + mlock(p, size); + } + } + void* p = malloc(size); if (p != nullptr && FLAGS_use_pinned_memory) { mlock(p, size); @@ -46,7 +53,7 @@ void* CPUAllocator::Alloc(size_t size) { return p; } -void CPUAllocator::Free(void* p, size_t size) { +void CPUAllocator::Free(void* p, size_t size, size_t index) { if (p != nullptr && FLAGS_use_pinned_memory) { munlock(p, size); } @@ -55,29 +62,52 @@ void CPUAllocator::Free(void* p, size_t size) { #ifndef PADDLE_ONLY_CPU -void* GPUAllocator::Alloc(size_t size) { +void* GPUAllocator::Alloc(size_t& index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. - if (size <= 0) { - return nullptr; - } + if (size <= 0) return nullptr; + size_t available = 0; + size_t capacity = 0; + paddle::platform::GpuMemoryUsage(available, capacity); + + // Reserve memory for page tables, etc. + size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); + size_t remaining = available > reserving ? available - reserving : 0; + + // If remaining size no less than expected size, using general + // cudaMalloc to allocate GPU memory. void* p = 0; - cudaError_t result = - FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); - if (result != cudaSuccess) { - cudaGetLastError(); // clear error if there is any. + if (size <= remaining) { + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + index = 0; + total_alloc_size_ += size; + return p; + } } - return result == cudaSuccess ? p : nullptr; + + // If remaining size less than expected size or cudaMalloc failed, + // cudaMallocHost will be considered as a fallback allocator. + cudaError_t result = cudaMallocHost(&p, size); + if (result == cudaSuccess) { + index = 1; + total_alloc_size_ += size; + return p; + } + + return nullptr; } -void GPUAllocator::Free(void* p, size_t size) { +void GPUAllocator::Free(void* p, size_t size, size_t index) { // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. - cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p); + PADDLE_ASSERT(total_alloc_size_ >= size); + total_alloc_size_ -= size; + cudaError_t err = index == 1 ? cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { platform::throw_on_error(err, "cudaFree{Host} failed"); } diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 184b383f7f7..e15302ce4f0 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -30,21 +30,24 @@ namespace detail { class SystemAllocator { public: virtual ~SystemAllocator() {} - virtual void* Alloc(size_t size) = 0; - virtual void Free(void* p, size_t size) = 0; + virtual void* Alloc(size_t& index, size_t size) = 0; + virtual void Free(void* p, size_t size, size_t index) = 0; }; class CPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); }; #ifndef PADDLE_ONLY_CPU class GPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + + private: + size_t total_alloc_size_ = 0; }; #endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index 9bd5706a4e4..ba44e06ddb6 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory); void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { bool freed = false; { - void* p = a.Alloc(size); + size_t index; + void* p = a.Alloc(index, size); if (size > 0) { EXPECT_NE(p, nullptr); } else { @@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { int* i = static_cast(p); std::shared_ptr ptr(i, [&](void* p) { freed = true; - a.Free(p, size); + a.Free(p, size, index); }); } EXPECT_TRUE(freed); @@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) { } #ifndef PADDLE_ONLY_CPU -TEST(GPUAllocator, NoStaging) { - FLAGS_use_pinned_memory = false; - paddle::memory::detail::GPUAllocator a; - TestAllocator(a, 2048); - TestAllocator(a, 0); -} -TEST(GPUAllocator, Staging) { - FLAGS_use_pinned_memory = true; +TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a; TestAllocator(a, 2048); TestAllocator(a, 0); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 7abe2ab89e0..17342356d60 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,3 +1,7 @@ +cc_library(cpu_info SRCS cpu_info.cc) +cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info gflags) + +nv_library(gpu_info SRCS gpu_info.cc) nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc new file mode 100644 index 00000000000..5b7ce7c763e --- /dev/null +++ b/paddle/platform/cpu_info_test.cc @@ -0,0 +1,18 @@ +#include "paddle/platform/cpu_info.h" + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +DECLARE_double(fraction_of_cpu_memory_to_use); + +TEST(CpuMemoryUsage, Print) { + std::stringstream ss; + size_t mem_size = paddle::platform::CpuTotalMemory() / 1024 / 1024 / 1024; + ss << std::to_string( + static_cast(FLAGS_fraction_of_cpu_memory_to_use * 100)) + << "% of CPU Memory Usage: " << mem_size << " GB"; + std::cout << ss.str(); +} diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h deleted file mode 100644 index 8fe891f9ce6..00000000000 --- a/paddle/platform/cuda.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef PADDLE_ONLY_CPU - -#include -#include - -namespace paddle { -namespace platform { - -inline void throw_on_error(cudaError_t e, const char* message) { - if (e) { - throw thrust::system_error(e, thrust::cuda_category(), message); - } -} - -int GetDeviceCount(void) { - int count; - throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); - return count; -} - -} // namespace platform -} // namespace paddle - -#endif // PADDLE_ONLY_CPU -- GitLab