提交 e1140f2b 编写于 作者: Y Yi Wang 提交者: GitHub

Merge pull request #2970 from gangliao/memcpy

Add CPU/GPU Memcpy in memory folder
...@@ -15,7 +15,8 @@ limitations under the License. */ ...@@ -15,7 +15,8 @@ limitations under the License. */
#include "paddle/memory/memory.h" #include "paddle/memory/memory.h"
#include "paddle/memory/detail/buddy_allocator.h" #include "paddle/memory/detail/buddy_allocator.h"
#include "paddle/memory/detail/system_allocator.h" #include "paddle/memory/detail/system_allocator.h"
#include "paddle/platform/assert.h"
#include <cstring> // for memcpy
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -45,6 +46,13 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) { ...@@ -45,6 +46,13 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
return GetCPUBuddyAllocator()->Used(); return GetCPUBuddyAllocator()->Used();
} }
// Host-to-host copy: both places live in ordinary CPU memory, so a plain
// byte copy is all that is required. `num` is the byte count; the source
// and destination regions must not overlap (std::memcpy precondition).
template <>
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace /*dst_place*/,
                                                  void* dst,
                                                  platform::CPUPlace /*src_place*/,
                                                  const void* src,
                                                  size_t num) {
  std::memcpy(dst, src, num);
}
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
...@@ -77,6 +85,41 @@ size_t Used<platform::GPUPlace>(platform::GPUPlace place) { ...@@ -77,6 +85,41 @@ size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
return GetGPUBuddyAllocator(place.device)->Used(); return GetGPUBuddyAllocator(place.device)->Used();
} }
// Copies `num` bytes from GPU memory `src` (resident on src_place.device)
// to host memory `dst`, enqueued asynchronously on `stream`
// (device-to-host direction).
// NOTE(review): per CUDA docs, cudaMemcpyAsync into pageable host memory is
// effectively synchronous — confirm callers pin `dst` if overlap is expected.
template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
// Select the device that owns the source buffer before issuing the copy.
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}
// Copies `num` bytes from host memory `src` to GPU memory `dst` (resident
// on dst_place.device), enqueued asynchronously on `stream`
// (host-to-device direction).
// NOTE(review): as with the D2H path, overlap requires `src` to be pinned
// host memory — confirm at call sites.
template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
// Select the device that owns the destination buffer before issuing the copy.
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
}
// Copies `num` bytes between two GPU buffers, enqueued on `stream`.
// Dispatches on whether source and destination share a device.
template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
if (dst_place == src_place) {
// Same device: ordinary device-to-device async copy.
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
} else {
// Different devices: peer copy via cudaMemcpyPeerAsync.
// NOTE(review): assumes peer access is available or handled inside
// GpuMemcpyPeer — confirm; cudaMemcpyPeerAsync otherwise stages through host.
platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
stream);
}
}
#endif // PADDLE_ONLY_CPU #endif // PADDLE_ONLY_CPU
} // namespace memory } // namespace memory
......
...@@ -14,20 +14,30 @@ limitations under the License. */ ...@@ -14,20 +14,30 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/platform/gpu_info.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
template <class Place> template <typename Place>
void* Alloc(Place, size_t); void* Alloc(Place, size_t);
template <class Place> template <typename Place>
void Free(Place, void*); void Free(Place, void*);
template <class Place> template <typename Place>
size_t Used(Place); size_t Used(Place);
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
#ifndef PADDLE_ONLY_CPU
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
cudaStream_t stream);
#endif // PADDLE_ONLY_CPU
template <typename T, /* must be POD types */ template <typename T, /* must be POD types */
typename Place /* platform::GPUPlace or platform::CPUPlace */, typename Place /* platform::GPUPlace or platform::CPUPlace */,
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr> typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
......
...@@ -43,10 +43,26 @@ namespace platform { ...@@ -43,10 +43,26 @@ namespace platform {
// For more details, please check https://stackoverflow.com/a/43870188/724872. // For more details, please check https://stackoverflow.com/a/43870188/724872.
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
// Convenience overload: lets callers check a status without supplying a
// message. Forwards to the message-taking overload for T's status type
// with an empty format string.
template <typename T>
inline void throw_on_error(T e) {
throw_on_error(e, "");
}
// Checks a boolean-ish status: zero/false means failure. On failure throws
// std::runtime_error carrying the Sprintf-formatted user message plus the
// throw site's file and line. The enable_if requires at least one message
// argument, keeping this overload distinct from the single-argument form.
// NOTE(review): __LINE__ is an int formatted through "%s" — assumes
// string::Sprintf accepts non-string arguments for %s (tinyformat-style);
// confirm against paddle/string implementation.
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
int stat, const Args&... args) {
if (UNLIKELY(!(stat))) {
throw std::runtime_error(
string::Sprintf(args...) +
string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
}
}
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
template <typename... Args> template <typename... Args>
inline void throw_on_error(cudaError_t e, const Args&... args) { inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudaError_t e, const Args&... args) {
if (UNLIKELY(e)) { if (UNLIKELY(e)) {
// clang-format off // clang-format off
throw thrust::system_error( throw thrust::system_error(
...@@ -58,7 +74,8 @@ inline void throw_on_error(cudaError_t e, const Args&... args) { ...@@ -58,7 +74,8 @@ inline void throw_on_error(cudaError_t e, const Args&... args) {
} }
template <typename... Args> template <typename... Args>
inline void throw_on_error(curandStatus_t stat, const Args&... args) { inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
curandStatus_t stat, const Args&... args) {
if (stat != CURAND_STATUS_SUCCESS) { if (stat != CURAND_STATUS_SUCCESS) {
// clang-format off // clang-format off
throw thrust::system_error( throw thrust::system_error(
...@@ -70,7 +87,8 @@ inline void throw_on_error(curandStatus_t stat, const Args&... args) { ...@@ -70,7 +87,8 @@ inline void throw_on_error(curandStatus_t stat, const Args&... args) {
} }
template <typename... Args> template <typename... Args>
inline void throw_on_error(cudnnStatus_t stat, const Args&... args) { inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudnnStatus_t stat, const Args&... args) {
if (stat == CUDNN_STATUS_SUCCESS) { if (stat == CUDNN_STATUS_SUCCESS) {
return; return;
} else { } else {
...@@ -84,7 +102,8 @@ inline void throw_on_error(cudnnStatus_t stat, const Args&... args) { ...@@ -84,7 +102,8 @@ inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
} }
template <typename... Args> template <typename... Args>
inline void throw_on_error(cublasStatus_t stat, const Args&... args) { inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cublasStatus_t stat, const Args&... args) {
std::string err; std::string err;
if (stat == CUBLAS_STATUS_SUCCESS) { if (stat == CUBLAS_STATUS_SUCCESS) {
return; return;
...@@ -113,15 +132,6 @@ inline void throw_on_error(cublasStatus_t stat, const Args&... args) { ...@@ -113,15 +132,6 @@ inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
#endif // PADDLE_ONLY_CPU #endif // PADDLE_ONLY_CPU
template <typename... Args>
inline void throw_on_error(int stat, const Args&... args) {
if (UNLIKELY(!(stat))) {
throw std::runtime_error(
string::Sprintf(args...) +
string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
}
}
#define PADDLE_THROW(...) \ #define PADDLE_THROW(...) \
do { \ do { \
throw std::runtime_error( \ throw std::runtime_error( \
...@@ -129,12 +139,9 @@ inline void throw_on_error(int stat, const Args&... args) { ...@@ -129,12 +139,9 @@ inline void throw_on_error(int stat, const Args&... args) {
string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \ string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \
} while (0) } while (0)
/** #define PADDLE_ENFORCE(...) \
* @brief Enforce a condition, otherwise throw an EnforceNotMet
*/
#define PADDLE_ENFORCE(condition, ...) \
do { \ do { \
::paddle::platform::throw_on_error(condition, __VA_ARGS__); \ ::paddle::platform::throw_on_error(__VA_ARGS__); \
} while (0) } while (0)
} // namespace platform } // namespace platform
......
...@@ -44,7 +44,7 @@ void SetDeviceId(int id) { ...@@ -44,7 +44,7 @@ void SetDeviceId(int id) {
"cudaSetDevice failed in paddle::platform::SetDeviceId"); "cudaSetDevice failed in paddle::platform::SetDeviceId");
} }
void GpuMemoryUsage(size_t& available, size_t& total) { void GpuMemoryUsage(size_t &available, size_t &total) {
PADDLE_ENFORCE(cudaMemGetInfo(&available, &total), PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
"cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
} }
...@@ -82,5 +82,28 @@ size_t GpuMaxChunkSize() { ...@@ -82,5 +82,28 @@ size_t GpuMaxChunkSize() {
return usable; return usable;
} }
// Checked wrapper over cudaMemcpyAsync: enqueues a copy of `count` bytes in
// direction `kind` on `stream`, throwing via PADDLE_ENFORCE on a non-success
// return code. Does not synchronize; callers own stream ordering.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
}
// Checked synchronous copy: performs cudaMemcpy and then blocks until the
// copy is visible, throwing via PADDLE_ENFORCE on either failure.
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
"cudaMemcpy failed in paddle::platform::GpuMemcpySync");
// note: cudaMemcpy may actually be asynchronous with respect to the caller,
// block on stream 0 to make sure the copy has completed
PADDLE_ENFORCE(
cudaStreamSynchronize(0),
"cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
}
// Checked wrapper over cudaMemcpyPeerAsync: copies `count` bytes from
// `src` on `src_device` to `dst` on `dst_device`, enqueued on `stream`.
// NOTE(review): whether this uses a direct peer path or stages through the
// host depends on peer-access state, which is not managed here — confirm.
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
#include <cuda_runtime.h>
#include <stddef.h> #include <stddef.h>
namespace paddle { namespace paddle {
...@@ -31,7 +32,7 @@ int GetCurrentDeviceId(); ...@@ -31,7 +32,7 @@ int GetCurrentDeviceId();
void SetDeviceId(int device_id); void SetDeviceId(int device_id);
//!Get the memory usage of current GPU device. //!Get the memory usage of current GPU device.
void GpuMemoryUsage(size_t& available, size_t& total); void GpuMemoryUsage(size_t &available, size_t &total);
//! Get the maximum allocation size of current GPU device. //! Get the maximum allocation size of current GPU device.
size_t GpuMaxAllocSize(); size_t GpuMaxAllocSize();
...@@ -42,6 +43,18 @@ size_t GpuMinChunkSize(); ...@@ -42,6 +43,18 @@ size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator. //! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize(); size_t GpuMaxChunkSize();
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream);
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind);
//! Copy memory from one device to another device.
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册