diff --git a/CMakeLists.txt b/CMakeLists.txt
index c5d7f2c7ec76dcc7befcd16798d26a7d54a19328..3c719d35eced2420b7891dbaf507ba07cd78baf8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,7 +71,7 @@ if(ANDROID)
         "Disable RDMA when cross-compiling for Android" FORCE)
 endif(ANDROID)
 
-set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
+set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
 if (WITH_C_API AND WITH_PYTHON)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 69e8164a00d1fb57b79c63ba88c2846d30d80cd2..840155750e1ac6d59b43d0f35f41280bc117d880 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -78,6 +78,10 @@
 #
 # cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
 
+if(WITH_GPU)
+  add_definitions(-DPADDLE_WITH_GPU)
+endif()
+
 if(NOT APPLE)
   find_package(Threads REQUIRED)
   link_libraries(${CMAKE_THREAD_LIBS_INIT})
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index fb8a11062da91a87723d445acfb592f715951f14..c425e9f947d07009a474c5dfd05f55c48f290fa0 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1 +1,5 @@
-cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
+if(${WITH_GPU})
+  nv_test(cpu_allocator_test SRCS cpu_allocator_test.cc)  # nv_test links CUDA, but
+else(${WITH_GPU})
+  cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)  # cc_test doesn't.
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
index 8a872d3800d3d73d920fe1cade972a983c074844..0d8ea3f52b92f24ce4e3fc8f5c2a9fcbba035e8f 100644
--- a/paddle/memory/detail/cpu_allocator.h
+++ b/paddle/memory/detail/cpu_allocator.h
@@ -17,6 +17,11 @@ limitations under the License. */
 #include <stdlib.h>  // for malloc and free
 #include <stddef.h>  // for size_t
 
+#ifdef PADDLE_WITH_GPU
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif  // PADDLE_WITH_GPU
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -40,9 +45,9 @@ public:
   void Free(void* p) { free(p); }
 };
 
-// If CMake macro WITH_GPU is OFF, C++ compiler won't generate the
+// If CMake macro PADDLE_WITH_GPU is OFF, C++ compiler won't generate the
 // following specialization that depends on the CUDA library.
-#ifdef WITH_GPU
+#ifdef PADDLE_WITH_GPU
 template <>
 class CPUAllocator<true> {
 public:
@@ -51,12 +56,12 @@ public:
     if (cudaMallocHost(&p, size) != cudaSuccess) {
       return NULL;
     }
-    return *p;
+    return p;
   }
 
   void Free(void* p) { cudaFreeHost(p); }
 };
-#endif  // WITH_GPU
+#endif  // PADDLE_WITH_GPU
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
index 0aa33a22fd0bc886b44cef259a008f52f5418f5c..464bc84e5c7b5066bff4f3444c686473fc925746 100644
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ b/paddle/memory/detail/cpu_allocator_test.cc
@@ -22,11 +22,17 @@ TEST(CPUAllocator, NonStaging) {
   a.Free(p);
 }
 
-#ifdef WITH_GPU
+#ifdef PADDLE_WITH_GPU
 TEST(CPUAllocator, Staging) {
   paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p);
+
+  int devices;
+  if (cudaGetDeviceCount(&devices) == cudaSuccess && devices > 0) {
+    void* p = a.Alloc(4096);
+    EXPECT_NE(p, nullptr);
+    a.Free(p);
+  } else {
+    EXPECT_EQ(a.Alloc(4096), nullptr);
+  }
 }
-#endif  // WITH_GPU
+#endif  // PADDLE_WITH_GPU
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 5f1253ede68180ae6a051a56e6f2f1787edf24b8..b617923731a4d92e9765e2b73c55984a70a59264 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -19,7 +19,11 @@ namespace memory {
 
 template <>
 void* Alloc(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator()->Alloc(size);
+  return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size);
+}
+
+void* AllocStaging(CPUPlace, size_t size) {
+  return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size);
 }
 
 template <>
@@ -29,9 +33,14 @@ void* Alloc(GPUPlace pl, size_t size) {
 
 template <>
 void Free(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator()->Free(p);
+  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
+}
+
+void FreeStaging(CPUPlace, void* p) {
+  return GetCPUBuddyAllocator(true /*staging*/)->Free(p);
 }
 
+#ifdef PADDLE_WITH_GPU
 template <>
 void* Alloc(GPUPlace pl, void* p) {
   return GetGPUBuddyAllocator(pl.device)->Free(p);
@@ -46,6 +55,7 @@ template <>
 size_t Alloc(GPUPlace pl) {
   return GetGPUBuddyAllocator(pl.device)->Used();
 }
+#endif  // PADDLE_WITH_GPU
 
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index ae8ac6ca523ad93fa77847808a19d6ee0c397e31..8c15a133bb4e9762d4264ee0d02ad96a3ed33e30 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,9 +19,19 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-typename void* Alloc(Place, size_t);
-typename void Free(Place, void*);
-typename size_t Used(Place);
+template <typename Place>
+void* Alloc(Place, size_t);
+template <typename Place>
+void Free(Place, void*);
+template <typename Place>
+size_t Used(Place);
+
+// Staging memory means "pinned" host memory that can be mapped into
+// the CUDA memory space and accessed by the device rapidly.  Don't
+// allocate too much staging memory; otherwise system performance will
+// degrade because the OS cannot find enough swap memory space.
+void* AllocStaging(CPUPlace, size_t);
+void FreeStaging(CPUPlace, void*);
 
 }  // namespace memory
 }  // namespace paddle
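
Note (not part of the patch): a minimal standalone sketch of what the "staging" comment in memory.h refers to. Host memory returned by cudaMallocHost is pinned, so the device can DMA it directly and host-to-device copies can run asynchronously; that is the behavior the CPUAllocator<true> specialization above wraps. The file name and build command are illustrative assumptions, and the sketch assumes a working CUDA toolkit; it is not PaddlePaddle API.

// staging_sketch.cc -- illustrative only; build with, e.g.: nvcc -x cu staging_sketch.cc -o staging_sketch
#include <cuda_runtime.h>

#include <stddef.h>  // for size_t
#include <stdio.h>   // for printf

int main() {
  int devices = 0;
  if (cudaGetDeviceCount(&devices) != cudaSuccess || devices == 0) {
    printf("No CUDA device; staging (pinned) memory has no benefit here.\n");
    return 0;
  }

  const size_t size = 4096;

  // "Staging" host memory, as returned by CPUAllocator<true>::Alloc:
  // pinned, so it cannot be paged out and the device can access it rapidly.
  void* staging = NULL;
  if (cudaMallocHost(&staging, size) != cudaSuccess) return 1;

  void* device = NULL;
  if (cudaMalloc(&device, size) != cudaSuccess) return 1;

  // Pinned memory allows truly asynchronous host-to-device copies; pageable
  // memory from plain malloc (CPUAllocator<false>) would be copied
  // synchronously through an intermediate driver buffer.
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMemcpyAsync(device, staging, size, cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);

  cudaStreamDestroy(stream);
  cudaFree(device);
  cudaFreeHost(staging);  // mirrors CPUAllocator<true>::Free
  return 0;
}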