diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index df3d57d629184d28fd42130df9b020a7b52ade72..78443cc35a400bceac77b99c3468daf16d8a4690 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -15,7 +15,8 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/assert.h"
+
+#include <cstring>  // for memcpy
 
 namespace paddle {
 namespace memory {
@@ -45,6 +46,13 @@ size_t Used(platform::CPUPlace place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::CPUPlace,
+                                                  const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
 #ifndef PADDLE_ONLY_CPU
 
 detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
@@ -77,6 +85,41 @@ size_t Used(platform::GPUPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
+template <>
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  if (dst_place == src_place) {
+    platform::SetDeviceId(src_place.device);
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
+  } else {
+    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
+                            stream);
+  }
+}
+
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace memory
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index d0a64869cc3bcfa58b7e125e5d5699e6753396bf..7ef7a73bc8b25e6a637a5e89c87e3eef06174b92 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,20 +14,30 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 
 namespace paddle {
 namespace memory {
 
-template <class Place>
+template <typename Place>
 void* Alloc(Place, size_t);
 
-template <class Place>
+template <typename Place>
 void Free(Place, void*);
 
-template <class Place>
+template <typename Place>
 size_t Used(Place);
 
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
+
+#ifndef PADDLE_ONLY_CPU
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
+          cudaStream_t stream);
+#endif  // PADDLE_ONLY_CPU
+
 template <typename T, /* must be POD types */
           typename Place /* platform::GPUPlace or platform::CPUPlace */,
           typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index 5d440dec48e7a4cba404bc297eca5a451a144d93..b06ab8a2f184e7bb7dd9cb39f377b087c5258dc4 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -43,10 +43,26 @@ namespace platform {
 // For more details, please check https://stackoverflow.com/a/43870188/724872.
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 
+template <typename T>
+inline void throw_on_error(T e) {
+  throw_on_error(e, "");
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    int stat, const Args&... args) {
+  if (UNLIKELY(!(stat))) {
+    throw std::runtime_error(
+        string::Sprintf(args...) +
+        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
+  }
+}
+
 #ifndef PADDLE_ONLY_CPU
 
 template <typename... Args>
-inline void throw_on_error(cudaError_t e, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudaError_t e, const Args&... args) {
   if (UNLIKELY(e)) {
     // clang-format off
     throw thrust::system_error(
@@ -58,7 +74,8 @@ inline void throw_on_error(cudaError_t e, const Args&... args) {
 }
 
 template <typename... Args>
-inline void throw_on_error(curandStatus_t stat, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    curandStatus_t stat, const Args&... args) {
   if (stat != CURAND_STATUS_SUCCESS) {
     // clang-format off
     throw thrust::system_error(
@@ -70,7 +87,8 @@ inline void throw_on_error(curandStatus_t stat, const Args&... args) {
 }
 
 template <typename... Args>
-inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudnnStatus_t stat, const Args&... args) {
   if (stat == CUDNN_STATUS_SUCCESS) {
     return;
   } else {
@@ -84,7 +102,8 @@ inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
 }
 
 template <typename... Args>
-inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cublasStatus_t stat, const Args&... args) {
   std::string err;
   if (stat == CUBLAS_STATUS_SUCCESS) {
     return;
@@ -113,15 +132,6 @@ inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
 
 #endif  // PADDLE_ONLY_CPU
 
-template <typename... Args>
-inline void throw_on_error(int stat, const Args&... args) {
-  if (UNLIKELY(!(stat))) {
-    throw std::runtime_error(
-        string::Sprintf(args...) +
-        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
-  }
-}
-
 #define PADDLE_THROW(...)                                    \
   do {                                                       \
     throw std::runtime_error(                                \
@@ -129,12 +139,9 @@ inline void throw_on_error(int stat, const Args&... args) {
         string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \
   } while (0)
 
-/**
- * @brief Enforce a condition, otherwise throw an EnforceNotMet
- */
-#define PADDLE_ENFORCE(condition, ...)                              \
-  do {                                                              \
-    ::paddle::platform::throw_on_error(condition, __VA_ARGS__);     \
+#define PADDLE_ENFORCE(...)                               \
+  do {                                                    \
+    ::paddle::platform::throw_on_error(__VA_ARGS__);      \
   } while (0)
 
 }  // namespace platform
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index cf9921e870d47fe77c0cca80828dbf2bb36ccda8..edeb3ecd7bf8b87333813eee5b40f71030f6609f 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -44,7 +44,7 @@ void SetDeviceId(int id) {
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }
 
-void GpuMemoryUsage(size_t& available, size_t& total) {
+void GpuMemoryUsage(size_t &available, size_t &total) {
   PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
                  "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
@@ -82,5 +82,28 @@ size_t GpuMaxChunkSize() {
   return usable;
 }
 
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
+                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
+}
+
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind) {
+  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
+                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
+  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
+  //       block on stream 0 to make sure the copy has completed
+  PADDLE_ENFORCE(
+      cudaStreamSynchronize(0),
+      "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
+}
+
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(
+      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
+      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
+}
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index 79e71956bd32e8c253ac4192a04e5903bed1c94a..d3a5f5f13fdd3dd59eb43465da4a64b0d8d95e5b 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 
+#include <cuda_runtime.h>
 #include <stddef.h>
 
 namespace paddle {
@@ -31,7 +32,7 @@ int GetCurrentDeviceId();
 void SetDeviceId(int device_id);
 
 //!Get the memory usage of current GPU device.
-void GpuMemoryUsage(size_t& available, size_t& total);
+void GpuMemoryUsage(size_t &available, size_t &total);
 
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
@@ -42,6 +43,18 @@ size_t GpuMinChunkSize();
 
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
 
+//! Copy memory from address src to dst asynchronously.
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream);
+
+//! Copy memory from address src to dst synchronously.
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind);
+
+//! Copy memory from one device to another device.
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream);
+
 }  // namespace platform
 }  // namespace paddle
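
For reference, a minimal usage sketch of the memory::Copy and PADDLE_ENFORCE interfaces introduced by this patch (not part of the patch itself; the function name CopyExample is hypothetical, and it assumes platform::GPUPlace is constructible from a device id as in paddle/platform/place.h):

// Sketch only: exercises the host-to-host and host-to-device Copy
// specializations added in paddle/memory/memory.cc.
#include "paddle/memory/memory.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"

void CopyExample() {
  paddle::platform::CPUPlace cpu;
  const size_t num = 256;

  // Host-to-host copy dispatches to the std::memcpy specialization.
  void* src = paddle::memory::Alloc(cpu, num);
  void* dst = paddle::memory::Alloc(cpu, num);
  paddle::memory::Copy(cpu, dst, cpu, src, num);

#ifndef PADDLE_ONLY_CPU
  // Host-to-device copy goes through GpuMemcpyAsync on a caller-owned stream,
  // so the caller must synchronize before reusing or freeing the buffers.
  paddle::platform::GPUPlace gpu(0);
  void* gpu_dst = paddle::memory::Alloc(gpu, num);

  cudaStream_t stream;
  PADDLE_ENFORCE(cudaStreamCreate(&stream),
                 "cudaStreamCreate failed in CopyExample");
  paddle::memory::Copy(gpu, gpu_dst, cpu, src, num, stream);
  PADDLE_ENFORCE(cudaStreamSynchronize(stream),
                 "cudaStreamSynchronize failed in CopyExample");
  PADDLE_ENFORCE(cudaStreamDestroy(stream),
                 "cudaStreamDestroy failed in CopyExample");

  paddle::memory::Free(gpu, gpu_dst);
#endif

  paddle::memory::Free(cpu, dst);
  paddle::memory::Free(cpu, src);
}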